# Analysis of Patient Data

# Pre-step

### Research Question

    Determine the distribution and correlation of variables in the cleaned patient data.


### Dataset

    patient_data.csv


### Libraries

In [69]:
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import warnings

In [70]:
# Silence only deprecation-style notices so the rendered notebook stays
# readable. Every other warning category (RuntimeWarning, UserWarning, ...)
# is still reported, so genuine data or numerical problems are not hidden.
for _noisy_category in (DeprecationWarning, PendingDeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_noisy_category)

# Data Pre-processing 

In [71]:
# Read the raw patient CSV into a DataFrame, then show its (rows, columns).
patient_df = pd.read_csv(filepath_or_buffer="./patient_data.csv")
patient_df.shape

(100, 16)

In [72]:
# Preview the first five raw records.
patient_df.head(n=5)

Unnamed: 0,patient_id,allergies,blood_type,closest_hospital,community,country,date_of_birth,disabilities,name,gender,has_children,has_dependent,height,marital_status,medical_problems,weight
0,1,,AB-,1,rural,nigeria,2006-11-02 00:00:00,,ACRA SMITH,Male,No,Yes,221,single,underweight,73
1,2,,O-,12,rural,nigeria,1990-12-09 00:00:00,scoliosis,ADOLPH NEILON,Female,Yes,Yes,212,single,drug addiction,70
2,3,,AB-,2,urban,nigeria,1990-01-26 00:00:00,atherosclerosis,ANKI HENDRI,Male,No,No,226,single,overweight,84
3,4,pollen,A-,12,rural,nigeria,2006-06-15 00:00:00,delirium,GARBO SHARRON,Male,Yes,Yes,169,single,overweight,63
4,5,flowers,AB-,3,suburban,nigeria,1987-07-18 00:00:00,delirium,ADONAI NEIVA,Male,No,Yes,196,married,,85


In [73]:
# Column-level overview: dtype and non-null count per column, which exposes
# the columns that contain missing values.
patient_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   patient_id        100 non-null    int64 
 1   allergies         73 non-null     object
 2   blood_type        100 non-null    object
 3   closest_hospital  100 non-null    int64 
 4   community         100 non-null    object
 5   country           100 non-null    object
 6   date_of_birth     100 non-null    object
 7   disabilities      82 non-null     object
 8   name              99 non-null     object
 9   gender            100 non-null    object
 10  has_children      100 non-null    object
 11  has_dependent     100 non-null    object
 12  height            100 non-null    int64 
 13  marital_status    100 non-null    object
 14  medical_problems  75 non-null     object
 15  weight            100 non-null    int64 
dtypes: int64(4), object(12)
memory usage: 12.6+ KB


### Check for duplicates

In [74]:
# Rows that exactly repeat an earlier row (every column is compared).
# NOTE(review): patient_id is part of the comparison, so the same patient
# entered twice under different ids would not be flagged — confirm this is intended.
duplicate_mask = patient_df.duplicated()
patient_df.loc[duplicate_mask]

Unnamed: 0,patient_id,allergies,blood_type,closest_hospital,community,country,date_of_birth,disabilities,name,gender,has_children,has_dependent,height,marital_status,medical_problems,weight


### Check for missing values

In [75]:
# Count nulls per column, largest first, and keep the counts as a one-column table.
missing_counts = patient_df.isnull().sum().sort_values(ascending=False)
nan = missing_counts.to_frame(name='Missing Values')

# Show only the columns that actually have gaps.
nan.loc[nan['Missing Values'] > 0]

Unnamed: 0,Missing Values
allergies,27
medical_problems,25
disabilities,18
name,1


In [76]:
# Keep only fully populated rows and renumber them from 0.
# NOTE(review): a blank allergies / disabilities / medical_problems cell may
# simply mean the patient has none; dropping those rows removes more than half
# of the records — confirm that this is the intended treatment.
patient_df = patient_df.dropna(axis="index").reset_index(drop=True)

In [77]:
# Re-run the null count after cleaning; the filtered table should now be empty.
remaining_nulls = patient_df.isnull().sum().sort_values(ascending=False)
nan = remaining_nulls.to_frame(name='Missing Values')

nan.loc[nan['Missing Values'] > 0]

Unnamed: 0,Missing Values


In [78]:
# (rows, columns) remaining after the rows with missing values were dropped.
patient_df.shape

(47, 16)

In [79]:
# Preview the first five cleaned records.
patient_df.head(n=5)

Unnamed: 0,patient_id,allergies,blood_type,closest_hospital,community,country,date_of_birth,disabilities,name,gender,has_children,has_dependent,height,marital_status,medical_problems,weight
0,4,pollen,A-,12,rural,nigeria,2006-06-15 00:00:00,delirium,GARBO SHARRON,Male,Yes,Yes,169,single,overweight,63
1,6,nuts,B-,10,urban,nigeria,2010-03-21 00:00:00,alzheimer's,ANKUR HENDRY,Female,No,Yes,167,single,underweight,79
2,8,nuts,A+,5,urban,nigeria,2012-08-24 00:00:00,arthritis,ADONIJAH NEKA,Male,No,No,147,divorced,drug addiction,69
3,12,pollen,AB-,5,rural,nigeria,1996-08-11 00:00:00,ruptured discs,ANNABELLA HENNESSEY,Female,No,Yes,151,widowed,underweight,86
4,13,pollen,B+,15,urban,nigeria,1996-01-13 00:00:00,ruptured discs,GARETH SHASHWAT,Female,No,Yes,207,divorced,none,91


In [80]:
# Persist the cleaned records; the positional index is not written to the file.
patient_df.to_csv(path_or_buf='Patient_data_clean.csv', index=False)

In [81]:
# Reload the cleaned file and promote patient_id from a column to the row
# index, which is why the column count drops by one.
patient_df = pd.read_csv('./Patient_data_clean.csv').set_index('patient_id')
patient_df.shape

(47, 15)

### Descriptive statistics

In [82]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): closest_hospital looks like an identifier rather than a
# measurement, so its mean/std may not be meaningful — confirm.
patient_df.describe()

Unnamed: 0,closest_hospital,height,weight
count,47.0,47.0,47.0
mean,6.361702,202.787234,83.0
std,4.336025,36.82859,11.090537
min,1.0,133.0,60.0
25%,2.5,172.5,77.0
50%,5.0,200.0,83.0
75%,10.0,236.0,90.0
max,15.0,260.0,102.0


# Data Visualization

## Distribution plots 

### Scatter matrix of "closest_hospital", "height" and "weight" by gender

In [83]:
# Pairwise scatter plots of the three numeric columns, one colour per gender.
numeric_dims = ["closest_hospital", "height", "weight"]
fig = px.scatter_matrix(data_frame=patient_df, dimensions=numeric_dims, color="gender")
fig.show()

### Histogram of "closest_hospital" by gender

In [84]:
# Patient counts per closest_hospital bin, split by gender, with a box plot in the margin.
px.histogram(
    data_frame=patient_df,
    x="closest_hospital",
    color="gender",
    marginal="box",
    hover_data=patient_df.columns,
)

### Histogram of "height" by gender

In [85]:
# Patient counts per height bin, split by gender, with a box plot in the margin.
px.histogram(
    data_frame=patient_df,
    x="height",
    color="gender",
    marginal="box",
    hover_data=patient_df.columns,
)

### Histogram of "weight" by gender

In [86]:
# Patient counts per weight bin, split by gender, with a box plot in the margin.
px.histogram(
    data_frame=patient_df,
    x="weight",
    color="gender",
    marginal="box",
    hover_data=patient_df.columns,
)

### Scatter matrix of "closest_hospital", "height" and "weight" by medical_problems

In [87]:
# Pairwise scatter plots of the three numeric columns, one colour per medical problem.
numeric_dims = ["closest_hospital", "height", "weight"]
fig = px.scatter_matrix(data_frame=patient_df, dimensions=numeric_dims, color="medical_problems")
fig.show()

### Histogram of "closest_hospital" by medical_problems

In [88]:
# Patient counts per closest_hospital bin, split by medical problem, with a
# box plot in the margin. No `y` is passed: with a `y` column the bars would
# show the sum of that column per bin instead of a count, which is not the
# distribution this section describes.
px.histogram(
    patient_df,
    x="closest_hospital",
    color="medical_problems",
    marginal="box",
    hover_data=patient_df.columns,
)

### Histogram of "height" by medical_problems

In [89]:
# Patient counts per height bin, split by medical problem, with a box plot in
# the margin. The x axis is height, matching this section's heading; no `y` is
# passed, so the bars are counts rather than sums of another column.
px.histogram(
    patient_df,
    x="height",
    color="medical_problems",
    marginal="box",
    hover_data=patient_df.columns,
)

### Histogram of "weight" by medical_problems

In [90]:
# Patient counts per weight bin, split by medical problem, with a box plot in
# the margin. The x axis is weight, matching this section's heading; no `y` is
# passed, so the bars are counts rather than sums of another column.
px.histogram(
    patient_df,
    x="weight",
    color="medical_problems",
    marginal="box",
    hover_data=patient_df.columns,
)

### Distplot and Box plot for checking outliers

#### closest_hospital

In [91]:
# One series, one label, one colour for the distplot.
group_labels = ["closest_hospital"]
hist_data = [patient_df[label] for label in group_labels]
colors = ['green']

# Histogram (bins of width 2) overlaid with the density curve and a rug plot.
fig = ff.create_distplot(
    hist_data=hist_data,
    group_labels=group_labels,
    bin_size=2,
    colors=colors,
)

fig.update_layout(title_text="Histogram and Kernel Probability Density Function Plot of closest_hospital")
fig.show()

In [92]:
# Box plot of closest_hospital to look for values beyond the whiskers.
fig = px.box(data_frame=patient_df, y="closest_hospital")
fig.show()

#### height

In [93]:
# One series, one label, one colour for the distplot.
group_labels = ["height"]
hist_data = [patient_df[label] for label in group_labels]
colors = ['green']

# Histogram (bins of width 20) overlaid with the density curve and a rug plot.
fig = ff.create_distplot(
    hist_data=hist_data,
    group_labels=group_labels,
    bin_size=20,
    colors=colors,
)

fig.update_layout(title_text="Histogram and Kernel Probability Density Function Plot of height")
fig.show()

In [94]:
# Box plot of height to look for values beyond the whiskers.
fig = px.box(data_frame=patient_df, y="height")
fig.show()

#### weight

In [95]:
# One series, one label, one colour for the distplot.
group_labels = ["weight"]
hist_data = [patient_df[label] for label in group_labels]
colors = ['green']

# Histogram (bins of width 5) overlaid with the density curve and a rug plot.
fig = ff.create_distplot(
    hist_data=hist_data,
    group_labels=group_labels,
    bin_size=5,
    colors=colors,
)

fig.update_layout(title_text="Histogram and Kernel Probability Density Function Plot of weight")
fig.show()

In [96]:
# Box plot of weight to look for values beyond the whiskers.
fig = px.box(data_frame=patient_df, y="weight")
fig.show()

# Data Analysis

## Correlation analysis 

### Correlation matrix

In [97]:
# Pearson correlation between the numeric columns only. The frame still holds
# text columns (names, blood type, dates, ...); recent pandas versions raise
# when corr() meets them instead of silently skipping them, so the numeric
# subset is selected explicitly.
numeric_df = patient_df.select_dtypes(include="number")
correlation = numeric_df.corr()

# Heat map with the coefficient printed in every cell.
fig = px.imshow(correlation, text_auto=True, aspect="auto")
fig.show()

### Transformation of variables

    Transform the categorical variables in patient_df into dummy (one-hot) variables.


In [98]:
# List the distinct levels of every categorical column that will be dummy-encoded.
categorical_columns = [
    'community',
    'disabilities',
    'gender',
    'has_children',
    'has_dependent',
    'marital_status',
    'medical_problems',
]
for column_name in categorical_columns:
    print(patient_df[column_name].unique())

['rural' 'urban' 'suburban']
['delirium' "alzheimer's" 'arthritis' 'ruptured discs' 'none' 'amnesia'
 'fibromas' 'cyclothimic disorder' 'scoliosis' 'atherosclerosis']
['Male' 'Female']
['Yes' 'No']
['Yes' 'No']
['single' 'divorced' 'widowed' 'married']
['overweight' 'underweight' 'drug addiction' 'none' 'mental issue'
 'not immunized' 'physical injury']


In [99]:
# Categorical columns to expand into one indicator column per level.
categorical_columns = [
    'community',
    'disabilities',
    'gender',
    'has_children',
    'has_dependent',
    'marital_status',
    'medical_problems',
]

# A single get_dummies call replaces each listed column with its
# "<column>_<level>" indicators, appended after the untouched columns in the
# order listed. dtype=int keeps the indicators as 0/1 integers regardless of
# the pandas version (newer versions default to booleans).
patient_df = pd.get_dummies(patient_df, columns=categorical_columns, dtype=int)

patient_df.shape

(47, 38)

In [100]:
# Preview the first five records after dummy encoding.
patient_df.head(n=5)

Unnamed: 0_level_0,allergies,blood_type,closest_hospital,country,date_of_birth,name,height,weight,community_rural,community_suburban,...,marital_status_married,marital_status_single,marital_status_widowed,medical_problems_drug addiction,medical_problems_mental issue,medical_problems_none,medical_problems_not immunized,medical_problems_overweight,medical_problems_physical injury,medical_problems_underweight
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,pollen,A-,12,nigeria,2006-06-15 00:00:00,GARBO SHARRON,169,63,1,0,...,0,1,0,0,0,0,0,1,0,0
6,nuts,B-,10,nigeria,2010-03-21 00:00:00,ANKUR HENDRY,167,79,0,0,...,0,1,0,0,0,0,0,0,0,1
8,nuts,A+,5,nigeria,2012-08-24 00:00:00,ADONIJAH NEKA,147,69,0,0,...,0,0,0,1,0,0,0,0,0,0
12,pollen,AB-,5,nigeria,1996-08-11 00:00:00,ANNABELLA HENNESSEY,151,86,1,0,...,0,0,1,0,0,0,0,0,0,1
13,pollen,B+,15,nigeria,1996-01-13 00:00:00,GARETH SHASHWAT,207,91,0,0,...,0,0,0,0,0,1,0,0,0,0


#### Check for more correlation using the transformed dummy variables

In [101]:
# Correlate the numeric columns together with the dummy indicators. The text
# columns that were not encoded are still in the frame, and recent pandas
# versions raise when corr() meets them, so they are excluded explicitly.
# "bool" is included because dummy indicators can be boolean-typed depending
# on the pandas version.
corr = patient_df.select_dtypes(include=["number", "bool"]).corr()

# Heat map with the coefficient printed in every cell.
fig = px.imshow(corr, text_auto=True, aspect="auto")
fig.show()

In [102]:
# Render the correlation table with each cell shaded on the viridis colour scale.
corr.style.background_gradient(cmap='viridis')

Unnamed: 0,closest_hospital,height,weight,community_rural,community_suburban,community_urban,disabilities_alzheimer's,disabilities_amnesia,disabilities_arthritis,disabilities_atherosclerosis,disabilities_cyclothimic disorder,disabilities_delirium,disabilities_fibromas,disabilities_none,disabilities_ruptured discs,disabilities_scoliosis,gender_Female,gender_Male,has_children_No,has_children_Yes,has_dependent_No,has_dependent_Yes,marital_status_divorced,marital_status_married,marital_status_single,marital_status_widowed,medical_problems_drug addiction,medical_problems_mental issue,medical_problems_none,medical_problems_not immunized,medical_problems_overweight,medical_problems_physical injury,medical_problems_underweight
closest_hospital,1.0,0.10409,-0.063741,0.112516,-0.163756,0.033963,0.212208,0.147861,-0.062598,0.080934,-0.270067,0.03137,-0.180888,0.016761,-0.025717,0.129661,-0.01592,0.01592,0.098979,-0.098979,-0.290904,0.290904,0.340902,-0.109527,-0.085966,-0.091709,-0.017393,0.215494,-0.228055,-0.265502,0.131238,0.038854,0.146102
height,0.10409,1.0,0.143757,0.011515,-0.039528,0.022867,-0.060486,-0.092684,-0.029531,0.123162,0.051232,-0.033486,-0.078262,0.182821,-0.132156,-0.010341,-0.071101,0.071101,0.020269,-0.020269,0.035834,-0.035834,-0.056698,-0.102154,0.063585,0.056481,0.101979,-0.051685,-0.036473,0.073192,0.191225,-0.091642,-0.172758
weight,-0.063741,0.143757,1.0,0.0208,0.059541,-0.070209,-0.188681,-0.226418,-0.118993,0.208485,0.226628,-0.230572,-0.168519,0.303247,0.041697,-0.048036,0.30424,-0.30424,0.050522,-0.050522,-0.303177,0.303177,-0.20126,-0.018868,0.008185,0.191763,-0.226628,0.337699,0.10231,0.103127,-0.244061,-0.079329,-0.052299
community_rural,0.112516,0.011515,0.0208,1.0,-0.378456,-0.615309,0.20788,-0.088192,-0.178774,0.445477,0.011641,0.081791,0.011641,-0.244408,-0.045244,-0.144338,-0.156269,0.156269,0.001946,-0.001946,-0.069922,0.069922,-0.088192,-0.088192,-0.086077,0.285192,0.011641,0.098198,0.086077,-0.178774,-0.125135,0.007946,0.011641
community_suburban,-0.163756,-0.039528,0.059541,-0.378456,1.0,-0.496785,-0.190724,-0.190724,0.266806,-0.168594,-0.060875,0.132425,0.08971,0.080987,0.011495,0.132425,-0.29463,0.29463,0.014999,-0.014999,0.08138,-0.08138,-0.027742,0.298223,-0.133126,-0.060875,0.08971,0.051053,-0.078973,0.061234,-0.060875,0.266806,-0.21146
community_urban,0.033963,0.022867,-0.070209,-0.615309,-0.496785,1.0,-0.032485,0.245116,-0.059599,-0.274106,0.040929,-0.189466,-0.087314,0.16019,0.032632,0.022555,0.397436,-0.397436,-0.014599,0.014599,-0.003746,0.003746,0.106315,-0.171286,0.194082,-0.215558,-0.087314,-0.13555,-0.013451,0.115472,0.169172,-0.23467,0.169172
disabilities_alzheimer's,0.212208,-0.060486,-0.188681,0.20788,-0.190724,-0.032485,1.0,-0.119048,-0.090094,-0.105234,-0.131991,-0.072739,-0.131991,-0.179374,-0.105234,-0.072739,-0.171286,0.171286,0.047079,-0.047079,-0.129874,0.129874,0.104762,-0.119048,0.10225,-0.131991,0.074795,0.049487,-0.10225,-0.090094,0.074795,-0.090094,0.074795
disabilities_amnesia,0.147861,-0.092684,-0.226418,-0.088192,-0.190724,0.245116,-0.119048,1.0,-0.090094,-0.105234,-0.131991,-0.072739,-0.131991,-0.179374,-0.105234,-0.072739,0.106315,-0.106315,-0.091216,0.091216,0.012081,-0.012081,0.104762,-0.119048,-0.043379,0.074795,-0.131991,0.049487,0.189008,-0.090094,0.074795,-0.090094,-0.131991
disabilities_arthritis,-0.062598,-0.029531,-0.118993,-0.178774,0.266806,-0.059599,-0.090094,-0.090094,1.0,-0.07964,-0.099889,-0.055048,-0.099889,-0.135748,-0.07964,-0.055048,-0.059599,0.059599,0.244949,-0.244949,0.152383,-0.152383,0.474494,-0.090094,-0.179775,-0.099889,0.421754,-0.109233,-0.003908,-0.068182,-0.099889,-0.068182,-0.099889
disabilities_atherosclerosis,0.080934,0.123162,0.208485,0.445477,-0.168594,-0.274106,-0.105234,-0.105234,-0.07964,1.0,-0.116675,-0.064299,-0.116675,-0.15856,-0.093023,-0.064299,0.032632,-0.032632,-0.019508,0.019508,0.073421,-0.073421,-0.105234,0.142066,-0.102711,0.111814,0.111814,0.300747,-0.219116,-0.07964,-0.116675,0.232283,-0.116675


# Conclusion and Discussion
    The correlation matrix above reveals that the medical problems a patient encounters have a moderate positive correlation with the patient's disabilities (arthritis, atherosclerosis, cyclothimic disorder, delirium, scoliosis), gender (male) and marital_status (divorced).
    
    The patient data has only a few numerical variables, which limited the precision of the analysis: it restricted the use of log transformation or robust scaling, either of which could have reduced the influence of outliers and increased the precision of the correlation analysis above.