In [690]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from ydata_profiling import ProfileReport
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [691]:
from scipy.stats import chi2_contingency


def TestIndependence(X,y,var,alpha=0.05):
    dfObserved = pd.crosstab(y,X)
    chi2, p, dof, expected = chi2_contingency(dfObserved.values)
    dfExpected = pd.DataFrame(expected, columns=dfObserved.columns, index = dfObserved.index)
    if p<alpha:
        print("{0} is IMPORTANT for Prediction".format(var))
        return False
    else:
        print("{0} is NOT an important predictor. (Discard {0} from model)".format(var))
        return True

In [692]:
def plot_importance(coef,name):
    imp_coef = coef.sort_values()
    plt.figure(figsize=(8,10))
    imp_coef.plot(kind = "barh")
    plt.title("Feature importance using " + name + " Model")
    plt.show()
    


In [693]:
pd.set_option('display.max_columns', None)

train_df = pd.read_csv("train.csv")
train_df.set_index("encounter_id", inplace=True)
train_df.head()

Unnamed: 0_level_0,country,patient_id,race,gender,age,weight,payer_code,outpatient_visits_in_previous_year,emergency_visits_in_previous_year,inpatient_visits_in_previous_year,admission_type,medical_specialty,average_pulse_bpm,discharge_disposition,admission_source,length_of_stay_in_hospital,number_lab_tests,non_lab_procedures,number_of_medications,primary_diagnosis,secondary_diagnosis,additional_diagnosis,number_diagnoses,glucose_test_result,a1c_test_result,change_in_meds_during_hospitalization,prescribed_diabetes_meds,medication,readmitted_binary,readmitted_multiclass
encounter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
533253,USA,70110,Caucasian,Female,[70-80),?,?,0,0,2,Emergency,Family/GeneralPractice,127,Discharged to home,Emergency Room,2,48,0,20,515,276,466,8,,,No,No,[],No,>30 days
426224,USA,29775006,AfricanAmerican,Male,[50-60),?,?,0,0,0,Emergency,?,128,Discharged/transferred to a federal health car...,Clinic Referral,14,71,0,25,38,785,162,9,,,No,Yes,['insulin'],No,No
634063,USA,80729253,Caucasian,Female,[60-70),?,?,0,0,1,,Family/GeneralPractice,94,Discharged to home,,6,60,1,22,534,135,250,6,,,Ch,Yes,"['glimepiride', 'insulin']",No,No
890610,USA,2919042,AfricanAmerican,Male,[60-70),?,MC,0,0,1,Emergency,InternalMedicine,81,Discharged to home,Transfer from another health care facility,6,48,2,9,569,562,455,5,,,No,No,[],No,No
654194,USA,84871971,Caucasian,Female,[70-80),?,HM,1,0,0,Elective,?,126,Discharged/transferred to home with home healt...,Physician Referral,6,47,1,15,715,599,428,9,,,No,No,[],No,>30 days


In [694]:
test_df = pd.read_csv("test.csv")
test_df.set_index("encounter_id", inplace=True)
test_df.head()

Unnamed: 0_level_0,country,patient_id,race,gender,age,weight,payer_code,outpatient_visits_in_previous_year,emergency_visits_in_previous_year,inpatient_visits_in_previous_year,admission_type,medical_specialty,average_pulse_bpm,discharge_disposition,admission_source,length_of_stay_in_hospital,number_lab_tests,non_lab_procedures,number_of_medications,primary_diagnosis,secondary_diagnosis,additional_diagnosis,number_diagnoses,glucose_test_result,a1c_test_result,change_in_meds_during_hospitalization,prescribed_diabetes_meds,medication
encounter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
499502,USA,103232799,Caucasian,Male,[80-90),?,HM,0,0,0,Emergency,?,139,Discharged to home,Emergency Room,3,48,0,14,491.0,414.0,250,6,,,No,Yes,['metformin']
447319,USA,93395304,Caucasian,Male,[20-30),?,HM,0,0,1,Emergency,?,103,Discharged to home,Emergency Room,10,77,1,7,250.13,70.0,794,7,>300,,No,No,[]
309126,USA,6281586,AfricanAmerican,Male,[30-40),?,?,0,0,0,Emergency,Pulmonology,99,Discharged to home,Emergency Room,2,44,3,12,786.0,250.6,536,6,,,No,Yes,['insulin']
181183,USA,67381308,Caucasian,Male,[50-60),?,BC,0,0,0,Not Available,?,136,Discharged to home,Emergency Room,4,48,1,16,820.0,873.0,E884,9,,,Ch,Yes,"['metformin', 'glyburide', 'insulin']"
359339,USA,71670204,Caucasian,Male,[60-70),?,?,0,0,0,Emergency,InternalMedicine,137,Discharged to home,Emergency Room,1,43,0,10,599.0,427.0,414,9,,,No,Yes,['metformin']


In [696]:
columns = [
    "outpatient_visits_in_previous_year",
    "emergency_visits_in_previous_year",
    "inpatient_visits_in_previous_year",
]

# train_df = train_df.where(train_df['inpatient_visits_in_previous_year'] < 4)
# train_df = train_df.where(train_df['outpatient_visits_in_previous_year'] < 3)
# train_df = train_df.where(train_df['emergency_visits_in_previous_year'] < 2)

train_df["has_outpatient_visits_in_previous_year"] = train_df["outpatient_visits_in_previous_year"].apply(lambda x: 1 if x > 0 else 0)
train_df["has_inpatient_visits_in_previous_year"] = train_df["inpatient_visits_in_previous_year"].apply(lambda x: 1 if x > 0 else 0)
train_df["has_emergency_visits_in_previous_year"] = train_df["emergency_visits_in_previous_year"].apply(lambda x: 1 if x > 0 else 0)

train_df.drop(columns=columns, inplace=True)

train_df["admission_type"].fillna("Not Available", inplace=True)
train_df["admission_type"] = train_df["admission_type"].replace({"Not Mapped": "Not Available"})


test_df["has_outpatient_visits_in_previous_year"] = test_df["outpatient_visits_in_previous_year"].apply(lambda x: 1 if x > 0 else 0)
test_df["has_inpatient_visits_in_previous_year"] = test_df["inpatient_visits_in_previous_year"].apply(lambda x: 1 if x > 0 else 0)
test_df["has_emergency_visits_in_previous_year"] = test_df["emergency_visits_in_previous_year"].apply(lambda x: 1 if x > 0 else 0)

test_df.drop(columns=columns, inplace=True)

test_df["admission_type"].fillna("Not Available", inplace=True)
test_df["admission_type"] = test_df["admission_type"].replace({"Not Mapped": "Not Available"})

In [697]:
# Replaces all occurrences of '?' to NaN to normalize missing values to NaN
test_df = test_df.replace('?', np.nan)
train_df = train_df.replace('?', np.nan)

In [698]:
# Drop the country as it only has 'USA' value
test_df = test_df.drop('country', axis=1)
train_df = train_df.drop('country', axis=1)

# Drop encounter_id and patient_id as these are just IDs and have no predict power

# Drop the weight column because of high number of missing values 
test_df = test_df.drop('weight', axis=1)
train_df = train_df.drop('weight', axis=1)

In [699]:
age_mapping = {
    '[0-10)': 1,
    '[10-20)': 2,
    '[20-30)': 3,
    '[30-40)': 4,
    '[40-50)': 5,
    '[50-60)': 6,
    '[60-70)': 7,
    '[70-80)': 8,
    '[80-90)': 9,
    '[90-100)': 10,
    np.nan: np.nan }
train_df = train_df.replace(age_mapping)

train_df = train_df.replace({"Yes":1 ,"No":0 , "Ch":1})
train_df['payer_code'] = train_df['payer_code'].replace(np.nan, "None")

test_df = test_df.replace(age_mapping)

test_df = test_df.replace({"Yes":1 ,"No":0 , "Ch":1})
test_df['payer_code'] = test_df['payer_code'].replace(np.nan, "None")

## The cell below just contains some dirty code to make data compliant with ScikitLearn. Most of the decisions might or should be changed in the future

In [700]:
def clean_df(df):
    df['race'] = df['race'].replace(np.nan, "Unknown")
    df['admission_type'] = df['admission_type'].fillna(df['admission_type'].mode()[0])
    df['discharge_disposition'] = df['discharge_disposition'].fillna(df['discharge_disposition'].mode()[0])
    
    admission_source_mode = df['admission_source'].mode()[0]
    df['admission_source'] = df['admission_source'].fillna(admission_source_mode)
    
    df = df.drop('glucose_test_result', axis=1)
    df = df.drop('a1c_test_result', axis=1)
    
    df['medical_specialty_missing'] = df['medical_specialty'].isna().astype(int)
    df['medical_specialty'] = df['medical_specialty'].replace(np.nan, "Unknown")
    
    #To age would be better to use knn?
    df['age'] = df['age'].fillna(df['age'].mean())
    
    # I'll drop these for simplicity, but It should be reverted.
    df = df.drop("medication", axis=1)
    # df = df.drop("medical_specialty", axis=1)

    # average pulse is uniformed distributed, so we'll get rid of this
    # df = df.drop("average_pulse_bpm", axis=1)
    df = df.drop("primary_diagnosis", axis=1)
    df = df.drop("secondary_diagnosis", axis=1)
    df = df.drop("additional_diagnosis", axis=1)
    
    return df

train_df = clean_df(train_df)
test_df = clean_df(test_df)

In [701]:
profile = ProfileReport(
    train_df,
    title='Medical Data',
    correlations={
        "pearson": {"calculate": True},
        "spearman": {"calculate": False},
        "kendall": {"calculate": False},
        "phi_k": {"calculate": False},
        "cramers": {"calculate": False},
    },
)
# profile.to_notebook_iframe()

In [702]:
numeric_features = [
    "age",
    # "average_pulse_bpm",
    "length_of_stay_in_hospital",
    "number_lab_tests",
    "non_lab_procedures",
    "number_of_medications",
    "number_diagnoses",
    "change_in_meds_during_hospitalization",
    "prescribed_diabetes_meds",
]

categorical_features = [
    "race",
    "gender",
    "payer_code",
    "admission_type",
    "medical_specialty",
    "has_outpatient_visits_in_previous_year",
    "has_emergency_visits_in_previous_year",
    "has_inpatient_visits_in_previous_year",
    "discharge_disposition",
    "admission_source",
    # "primary_diagnosis",
    # "secondary_diagnosis",
    # "additional_diagnosis",
    # "medication", 
]

In [703]:
scaler = MinMaxScaler()
train_df[numeric_features] = scaler.fit_transform(train_df[numeric_features])
test_df[numeric_features] = scaler.transform(test_df[numeric_features])

In [704]:
ohc = OneHotEncoder(sparse_output=False, drop="first", handle_unknown="ignore")
ohc_feat = ohc.fit_transform(train_df[categorical_features])
ohc_feat_names = ohc.get_feature_names_out()
ohc_df = pd.DataFrame(ohc_feat, index=train_df.index, columns=ohc_feat_names)

train_df = pd.concat([train_df.drop(columns=categorical_features), ohc_df], axis=1)

ohc_feat = ohc.transform(test_df[categorical_features])
ohc_feat_names = ohc.get_feature_names_out()
ohc_df = pd.DataFrame(ohc_feat, index=test_df.index, columns=ohc_feat_names)

test_df = pd.concat([test_df.drop(columns=categorical_features), ohc_df], axis=1)



In [705]:
profile = ProfileReport(
    train_df,
    title='Medical Data',
    correlations={
        "pearson": {"calculate": True},
        "spearman": {"calculate": False},
        "kendall": {"calculate": False},
        "phi_k": {"calculate": False},
        "cramers": {"calculate": False},
    },
)
# profile.to_notebook_iframe()

# REMOVING COLUMNS THAT ARE NOT IMPORTANT [CATEGORICAL]

In [706]:
cat = train_df.loc[:, "medical_specialty_missing":].columns
for var in cat:
    if TestIndependence(train_df[var], train_df["readmitted_binary"], var):
        train_df.drop(columns=[var], inplace=True)
        test_df.drop(columns=[var], inplace=True)

medical_specialty_missing is IMPORTANT for Prediction
race_Asian is NOT an important predictor. (Discard race_Asian from model)
race_Caucasian is IMPORTANT for Prediction
race_Hispanic is NOT an important predictor. (Discard race_Hispanic from model)
race_Other is IMPORTANT for Prediction
race_Unknown is IMPORTANT for Prediction
gender_Male is NOT an important predictor. (Discard gender_Male from model)
gender_Unknown/Invalid is NOT an important predictor. (Discard gender_Unknown/Invalid from model)
payer_code_CH is NOT an important predictor. (Discard payer_code_CH from model)
payer_code_CM is IMPORTANT for Prediction
payer_code_CP is IMPORTANT for Prediction
payer_code_DM is NOT an important predictor. (Discard payer_code_DM from model)
payer_code_HM is IMPORTANT for Prediction
payer_code_MC is IMPORTANT for Prediction
payer_code_MD is NOT an important predictor. (Discard payer_code_MD from model)
payer_code_MP is NOT an important predictor. (Discard payer_code_MP from model)
payer_c

In [None]:
# SPLITING TARGET VARIABLE

In [None]:
y_train = train_df["readmitted_binary"].values
X_train = train_df.drop(columns=["readmitted_binary", "readmitted_multiclass"])

In [708]:
model = DecisionTreeClassifier(max_depth=25, random_state=42)
model.fit(X_train, y_train)

In [709]:
y_pred = model.predict(X_train)
f1 = f1_score(y_train, y_pred)
f1

ValueError: pos_label=1 is not a valid label. It should be one of ['No', 'Yes']

In [None]:
# Build CSV

y_pred = model.predict(test_df)
submission_df = pd.DataFrame(y_pred)

test_data = pd.read_csv("test.csv")

submission_df = submission_df.rename(columns={0:"readmitted_binary"})
submission_df.insert(0, "encounter_id", test_data["encounter_id"])
submission_df["readmitted_binary"].replace({1: 'Yes', 0: 'No'}, inplace=True)

submission_df.to_csv("submission.csv", index=False)
submission_df["readmitted_binary"].value_counts()

In [None]:
train_df