In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Lookup table pairing each admission_source_id with its text description.
admission_source_mapping_df = pd.read_csv(
    "Final_Data_Collection/Admission_Source_Mapping.csv"
)
admission_source_mapping_df

Unnamed: 0,admission_source_id,description
0,1,Physician Referral
1,2,Clinic Referral
2,3,HMO Referral
3,4,Transfer from a hospital
4,5,Transfer from a Skilled Nursing Facility (SNF)
5,6,Transfer from another health care facility
6,7,Emergency Room
7,8,Court/Law Enforcement
8,9,Not Available
9,10,Transfer from critial access hospital


In [3]:
# Lookup table pairing each admission_type_id with its text description.
admission_type_mapping_df = pd.read_csv(
    "Final_Data_Collection/Admission_Type_Mapping.csv"
)
admission_type_mapping_df

Unnamed: 0,admission_type_id,description
0,1,Emergency
1,2,Urgent
2,3,Elective
3,4,Newborn
4,5,Not Available
5,6,
6,7,Trauma Center
7,8,Not Mapped


In [4]:
# Lookup table pairing each discharge_disposition_id with its text description.
discharge_df = pd.read_csv(
    "Final_Data_Collection/Discharge_Disposition_Mapping.csv"
)
discharge_df

Unnamed: 0,discharge_disposition_id,description
0,1,Discharged to home
1,2,Discharged/transferred to another short term h...
2,3,Discharged/transferred to SNF
3,4,Discharged/transferred to ICF
4,5,Discharged/transferred to another type of inpa...
5,6,Discharged/transferred to home with home healt...
6,7,Left AMA
7,8,Discharged/transferred to home under care of H...
8,9,Admitted as an inpatient to this hospital
9,10,Neonate discharged to another hospital for neo...


In [5]:
# Show every column when a wide frame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
# Alternative input kept for reference (raw data set instead of the cleaned file):
# diabetes_df = pd.read_csv("Final_Data_Collection/Diabetes_Data_1999_2008.csv")
# NOTE(review): the recorded output of this cell is a FileNotFoundError for
# 'Final_Data_Collection/cleaned_data.csv', which is not the path read below —
# confirm where cleaned_data.csv actually lives before re-running.
# NOTE(review): later cells compare max_glu_serum / A1Cresult against the string
# "None"; recent pandas versions may parse that literal as NA in read_csv by
# default — verify against the installed pandas version.
diabetes_df = pd.read_csv("cleaned_data.csv")
# Display the loaded frame as the cell output.
diabetes_df

FileNotFoundError: [Errno 2] No such file or directory: 'Final_Data_Collection/cleaned_data.csv'

In [None]:
def col_percent(df, colname, value):
    """Count how often ``value`` occurs in column ``colname`` of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    colname : hashable
        Label of the column to scan.
    value : scalar
        Value compared against every entry with ``==``.

    Returns
    -------
    tuple
        ``(num_matches, num_rows, fraction)`` where ``fraction`` is
        ``num_matches / num_rows``. For an empty frame the fraction is
        ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    num_elem = len(df)
    # Summing the boolean mask counts matches without building a filtered copy.
    num_question = int((df[colname] == value).sum())
    fraction = num_question / num_elem if num_elem else 0.0
    return num_question, num_elem, fraction

In [None]:
# Fraction of rows whose "weight" entry is the "?" placeholder.
_, _, weight_question_percentage = col_percent(diabetes_df, "weight", "?")
print(
    "Percentage ? Weight",
    weight_question_percentage,
)

In [None]:
# Remove the "weight" column from the working frame.
# Rebinding with errors="ignore" (instead of inplace=True) keeps this cell
# re-runnable: a second execution no longer raises KeyError once the column
# is already gone.
diabetes_df = diabetes_df.drop(columns=["weight"], errors="ignore")

In [None]:
# Correlation matrix over the numeric columns of diabetes_df.
# Non-numeric columns are filtered out explicitly: recent pandas versions no
# longer drop them silently in DataFrame.corr() and raise on string data.
corr = diabetes_df.select_dtypes(include="number").corr()

# Plot the matrix as a heatmap, labelling both axes with the column names.
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
)

In [None]:
# Default figure size (width, height in inches) for the plots that follow.
sns.set(rc={"figure.figsize": (11.7, 8.27)})

In [None]:
# Split the rows by the value stored in the "readmitted" column.
# NOTE(review): readmitted_df selects rows where readmitted == "NO", so the
# name looks inverted — confirm the intended meaning before relying on it.
readmitted_df = diabetes_df[diabetes_df["readmitted"].eq("NO")]
morethan30_df = diabetes_df[diabetes_df["readmitted"].eq(">30")]
lessthan30_df = diabetes_df[diabetes_df["readmitted"].eq("<30")]

In [None]:
# Stacked histogram of the "age" column, coloured by the "readmitted" value.
sns.histplot(data=diabetes_df, x="age", hue="readmitted", multiple="stack")

In [None]:
# Stacked histogram of "number_inpatient" per "readmitted" value,
# using unit-wide bins and an x-axis limited to 0..10.
sns.histplot(
    data=diabetes_df,
    x="number_inpatient",
    hue="readmitted",
    multiple="stack",
    binwidth=1,
)
plt.xlim(0, 10)

In [None]:
# Stacked histogram of "number_outpatient" per "readmitted" value,
# using unit-wide bins and an x-axis limited to 0..10.
sns.histplot(
    data=diabetes_df,
    x="number_outpatient",
    hue="readmitted",
    multiple="stack",
    binwidth=1,
)
plt.xlim(0, 10)

In [None]:
# Stacked histogram of "number_diagnoses" per "readmitted" value,
# using unit-wide bins and an x-axis limited to 0..10.
sns.histplot(
    data=diabetes_df,
    x="number_diagnoses",
    hue="readmitted",
    multiple="stack",
    binwidth=1,
)
plt.xlim(0, 10)


In [None]:
# Stacked histogram of "diag_1" per "readmitted" value, x-axis limited to 0..10.
# NOTE(review): diag_1 is label-encoded from its unique values further down,
# which suggests it is not numeric here — confirm that binwidth=1 and the
# 0..10 window are meaningful for this column.
sns.histplot(
    data=diabetes_df,
    x="diag_1",
    hue="readmitted",
    multiple="stack",
    binwidth=1,
)
plt.xlim(0, 10)

In [None]:
# Stacked histogram of "admission_type_id" per "readmitted" value,
# using unit-wide bins and an x-axis limited to 0..10.
sns.histplot(
    data=diabetes_df,
    x="admission_type_id",
    hue="readmitted",
    multiple="stack",
    binwidth=1,
)
plt.xlim(0, 10)

In [None]:
# Share of rows whose max_glu_serum entry is the string "None".
# The printed label previously said "?" although the value counted is "None".
total_None, total_elem, max_glu_serum_none_percentage = col_percent(diabetes_df, "max_glu_serum", "None")
print("Percentage None Max Glu Serum", max_glu_serum_none_percentage)
print("Total None", total_None)

In [None]:
# Share of rows whose A1Cresult entry is the string "None".
# The printed label previously named the wrong column ("Max Glu Serum").
total_None, total_elem, A1Cresult_none_percentage = col_percent(diabetes_df, "A1Cresult", "None")
print("Percentage None A1Cresult", A1Cresult_none_percentage)
print("Total None", total_None, " Dataframe Size", total_elem)

In [None]:
# Display the current state of the working frame as the cell output.
diabetes_df

In [None]:
# Columns carried forward from diabetes_df into the modelling frame.
features = [
    "race",
    "gender",
    "age",
    "admission_source_id",
    "time_in_hospital",
    "num_lab_procedures",
    "num_medications",
    "number_outpatient",
    "number_emergency",
    "number_inpatient",
    "diag_1",
    "number_diagnoses",
    "max_glu_serum",
    "A1Cresult",
    "insulin",
    "change",
    "diabetesMed",
    "discharge_disposition_id",
    "readmitted",
]

In [None]:
# Independent copy of the selected columns. The following cells assign encoded
# columns into this frame; without .copy() those assignments target a slice of
# diabetes_df and trigger pandas' SettingWithCopyWarning.
diabetes_features = diabetes_df[features].copy()

In [None]:
# Distinct values present in the "race" column (input for the encoder below).
diabetes_features["race"].unique()

In [None]:
# Integer codes for the "race" column; "?" (unknown) is encoded as -1.
# "AfricanAmerican" previously shared code 1 with "Caucasian", which merged the
# two groups into one value. It now gets its own code (4); all other codes are
# unchanged.
raceMap = {
    "?": -1,
    "Other": 0,
    "Caucasian": 1,
    "Asian": 2,
    "Hispanic": 3,
    "AfricanAmerican": 4,
}

# Values missing from raceMap become NaN after .map().
diabetes_features['race'] = diabetes_features['race'].map(raceMap)

In [None]:
# Distinct values present in the "gender" column (input for the encoder below).
diabetes_features["gender"].unique()

In [None]:
# Integer codes for the "gender" column; "Unknown/Invalid" is encoded as -1.
genderMap = dict(
    zip(
        ("Unknown/Invalid", "Male", "Female"),
        (-1, 0, 1),
    )
)
diabetes_features["gender"] = diabetes_features["gender"].map(genderMap)

In [None]:
# Encode the ten-year age brackets "[0-10)" .. "[90-100)" as 0..9;
# the "?" placeholder is encoded as -1.
ageMap = {
    "?": -1,
    **{f"[{low}-{low + 10})": low // 10 for low in range(0, 100, 10)},
}

diabetes_features["age"] = diabetes_features["age"].map(ageMap)

In [None]:
# Integer codes for "max_glu_serum"; the string "None" is encoded as -1.
# The codes are plain labels: they do not follow the order of the thresholds
# (">300" is 0 while ">200" is 2).
max_glu_serumMap = {"None": -1, ">300": 0, "Norm": 1, ">200": 2}

diabetes_features["max_glu_serum"] = diabetes_features["max_glu_serum"].map(
    max_glu_serumMap
)

In [None]:
# Integer codes for "A1Cresult"; the string "None" is encoded as -1.
# The codes are plain labels: they do not follow the order of the thresholds
# (">7" is 0 while ">8" is 2).
A1CresultMap = {"None": -1, ">7": 0, "Norm": 1, ">8": 2}

diabetes_features["A1Cresult"] = diabetes_features["A1Cresult"].map(
    A1CresultMap
)

In [None]:
# Integer codes for "insulin": the position in this tuple is the code
# ("No" -> 0, "Up" -> 1, "Steady" -> 2, "Down" -> 3).
insulinMap = {
    level: code
    for code, level in enumerate(("No", "Up", "Steady", "Down"))
}

diabetes_features["insulin"] = diabetes_features["insulin"].map(insulinMap)

In [None]:
# Binary code for "change": "No" -> 0, "Ch" -> 1.
changeMap = {"No": 0, "Ch": 1}

diabetes_features["change"] = diabetes_features["change"].map(changeMap)

In [None]:
# Binary code for "diabetesMed": "No" -> 0, "Yes" -> 1.
diabetesMedMap = {"No": 0, "Yes": 1}

diabetes_features["diabetesMed"] = diabetes_features["diabetesMed"].map(
    diabetesMedMap
)
# Show the frame after the encodings applied so far.
diabetes_features

In [None]:
# Integer codes for "readmitted": "NO" -> 0, "<30" -> 1, ">30" -> 2.
readmittedMap = dict([("NO", 0), ("<30", 1), (">30", 2)])

diabetes_features["readmitted"] = diabetes_features["readmitted"].map(
    readmittedMap
)

In [None]:
# Label-encode "diag_1": each distinct value gets the position at which it
# first appears in unique() as its integer code.
diag_1Map = {
    code: position
    for position, code in enumerate(diabetes_features["diag_1"].unique())
}

diabetes_features["diag_1"] = diabetes_features["diag_1"].map(diag_1Map)

In [None]:
# Preview the first rows (up to 1000) of the encoded frame as the cell output.
diabetes_features.head(1000)

In [None]:
# Model inputs: the encoded columns used as predictors.
featuresX = [
    "race",
    "gender",
    "age",
    "admission_source_id",
    "time_in_hospital",
    "num_lab_procedures",
    "number_outpatient",
    "number_emergency",
    "number_inpatient",
    "diag_1",
    "number_diagnoses",
    "max_glu_serum",
    "A1Cresult",
    "insulin",
    "change",
    "diabetesMed",
    "readmitted",
]
# Prediction target; "discharge_disposition_id" is an alternative to try.
featuresY = ["num_medications"]

In [None]:
# Write the predictor columns to "test.csv" in the working directory.
# to_csv is called with default options, so the row index is written too.
diabetes_features[featuresX].to_csv("test.csv")

In [None]:
# Predictor matrix: one row per record, one column per entry of featuresX.
X = diabetes_features.loc[:, featuresX].to_numpy()

In [None]:
# Target vector: the single featuresY column flattened to one dimension.
y = diabetes_features.loc[:, featuresY].to_numpy().flatten()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Hold out half of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Small random forest: 10 trees, depth capped at 5, one candidate feature per
# split. random_state is fixed (matching the seed used for the train/test
# split) so the fitted model and its score are reproducible across runs.
rfc = RandomForestClassifier(
    max_depth=5,
    n_estimators=10,
    max_features=1,
    random_state=42,
)

In [None]:
# Fit the forest on the training half of the data.
rfc.fit(X_train, y_train)

In [None]:
# Evaluate the fitted forest on the held-out half; the score is the cell output.
rfc.score(X_test, y_test)

In [None]:
# One zero-initialised slot per test row, to be filled with prediction errors.
pred_difference = [0] * len(X_test)

In [None]:
# Signed error (prediction minus true value) for every test row.
# A single vectorised predict() call replaces the per-row loop. The old loop
# also used `y` as its loop variable, overwriting the global target array `y`
# and breaking any re-run of the train/test split cell.
pred_difference = list(rfc.predict(X_test) - y_test)

In [None]:
# Distribution of the signed prediction errors, in unit-wide bins.
sns.histplot(data=pred_difference, binwidth=1)

In [None]:
# Mean and standard deviation of the signed prediction errors.
pred_mean = np.mean(pred_difference)
pred_std = np.std(pred_difference)
print("Prediction std:", pred_std)
print("Prediction mean:", pred_mean)
# Initialised here; not used within this cell.
coverage = 0
