In [None]:
# Suppress warnings (especially from sklearn)
def warn(*args, **kwargs):
    """No-op replacement for a warning hook: accept any arguments and do nothing."""
    return None
import warnings
warnings.warn = warn
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import ElasticNetCV, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
import seaborn as sns
import csv

In [None]:
# Load the CCS cluster assignments and normalise the id column names so they
# match the feature table's lower-case keys.
ccs_data = pd.read_csv('../data/patient_ccs_100.csv').rename(
    index=str,
    columns={"SUBJECT_ID": "subject_id", "HADM_ID": "hadm_id"},
)
all_data = pd.read_csv('../data/x_with_lacefeatures.csv')
labels = pd.read_csv('../data/y_more_no_df_clean.csv')

# Attach the cluster information to every admission via (subject_id, hadm_id).
merged_data = pd.merge(all_data, ccs_data, on=['subject_id', 'hadm_id'])

# NOTE(review): labels are attached by row index, not by id. This is only
# correct if the merge above keeps the rows of all_data 1:1 and in order --
# confirm against the data.
labeled_data = merged_data.join(labels)

# Persist one copy with the ids and one without them.
# NOTE(review): both files are written with the row index as an extra column.
labeled_data.to_csv('../data/labeled_clustered_data_with_ids.csv')
save_labeled_data = labeled_data.drop(labels=['subject_id', 'hadm_id'], axis=1)
save_labeled_data.to_csv('../data/labeled_clustered_data.csv')

In [None]:
# Readmissions per cluster: 'label' is summed within each cluster_num, so every
# positive label counts as one readmission.
cluster_readmission = labeled_data.groupby(['cluster_num'])['label'].sum()
cluster_readmission = cluster_readmission.reset_index(drop=False)

# Count each class explicitly. Unpacking value_counts() relied on the
# non-readmitted class being the more frequent one (value_counts sorts by
# frequency) and raised if only one class was present.
# Assumes 'label' is coded 0 = not readmitted, 1 = readmitted.
total_noreadmit = int((labeled_data['label'] == 0).sum())
total_readmit = int((labeled_data['label'] == 1).sum())

sns.barplot(x='cluster_num', y='label', data=cluster_readmission)
plt.title('Readmissions by Cluster')
plt.show()

In [None]:
# Number of rows (patients) in each cluster, indexed by cluster id.
cluster_readmits = labeled_data['cluster_num'].value_counts()

# Fraction of each cluster's patients that were readmitted.
# Each readmission count is paired with its actual cluster id; using the row
# position (enumerate) only worked when the ids were exactly 0..k-1.
readmits = {}
for cluster, readmit in zip(cluster_readmission['cluster_num'],
                            cluster_readmission['label']):
    readmits[cluster] = float(readmit)/float(cluster_readmits[cluster])

# NOTE: the plotted values are fractions in [0, 1], not percentages.
plt.bar(range(len(readmits)), list(readmits.values()), align='center')
plt.xticks(range(len(readmits)), list(readmits.keys()))
plt.ylabel('Percent readmitted')
plt.xlabel('Cluster Number')
plt.title('Percent readmitted by cluster (out of all patients in cluster)')
plt.show()

In [None]:
# readmits_all: readmissions in the cluster as a fraction of all patients.
# readmits_r:   readmissions in the cluster as a fraction of all readmissions.
readmits_all = {}
readmits_r = {}
total_patients = float(total_noreadmit+total_readmit)
# Key the results by the real cluster id instead of the row position, so the
# x tick labels stay correct even when the ids are not exactly 0..k-1.
for cluster, readmit in zip(cluster_readmission['cluster_num'],
                            cluster_readmission['label']):
    readmits_all[cluster] = float(readmit)/total_patients
    readmits_r[cluster] = float(readmit)/float(total_readmit)

# NOTE: the plotted values are fractions in [0, 1], not percentages.
plt.bar(range(len(readmits_r)), list(readmits_r.values()), align='center')
plt.xticks(range(len(readmits_r)), list(readmits_r.keys()))
plt.ylabel('Percent readmitted')
plt.xlabel('Cluster')
plt.title('Percent readmitted in cluster out of all readmitted')
plt.show()

In [None]:
def preprocess(path='../data/labeled_clustered_data.csv', n_clusters=3):
    """Load the labelled data and split it into one feature frame per cluster.

    Parameters
    ----------
    path : str
        CSV file containing at least the columns 'cluster_num' and 'label'.
    n_clusters : int
        Clusters 0 .. n_clusters-1 are extracted, in that order.

    Returns
    -------
    dfs : list of DataFrame
        Per-cluster rows with the 'label' column removed.
    dfs_labels : list of Series
        The 'label' column of each cluster, in the same order as ``dfs``.
    """
    data = pd.read_csv(path)

    # The file is saved together with its row index, which read_csv brings back
    # as an 'Unnamed: N' column. It is a row counter, not a feature, so drop it
    # before it can reach the models.
    index_columns = [col for col in data.columns if str(col).startswith('Unnamed:')]
    if index_columns:
        data = data.drop(index_columns, axis=1)

    dfs = []
    dfs_labels = []
    for cluster in range(n_clusters):
        # .copy() so that pop() mutates an independent frame rather than a
        # slice of `data`.
        cluster_df = data[data['cluster_num'] == cluster].copy()
        dfs_labels.append(cluster_df.pop('label'))
        dfs.append(cluster_df)
    return dfs, dfs_labels

In [None]:
def classifaction_report_csv(report, filename):
    """Write the per-class rows of a text classification report to a CSV file.

    Parameters
    ----------
    report : str
        Text produced by ``sklearn.metrics.classification_report``: a header
        line, a blank line, one line per class, then a blank line followed by
        summary rows.
    filename : str
        Destination CSV path.

    The CSV has the columns class, precision, recall, f1_score and support,
    one row per class. Summary rows (averages, accuracy) are not written.
    Class names are written without the padding used to align the report.
    """
    columns = ['class', 'precision', 'recall', 'f1_score', 'support']
    report_data = []
    # Skip the header and the blank line after it.
    for line in report.split('\n')[2:]:
        if not line.strip():
            # The first blank line separates the classes from the summary rows,
            # however many summary rows the report contains.
            break
        # Split off the four numeric columns from the right on any run of
        # whitespace; the class name itself may contain spaces and the column
        # gaps vary with the number of digits.
        fields = line.rsplit(None, 4)
        if len(fields) != 5:
            continue
        try:
            numbers = [float(value) for value in fields[1:]]
        except ValueError:
            continue
        report_data.append(dict(zip(columns, [fields[0].strip()] + numbers)))
    dataframe = pd.DataFrame(report_data, columns=columns)
    dataframe.to_csv(filename, index = False)

In [None]:
def run_pca(df):
    """Project ``df`` onto its leading principal components.

    A full PCA is fitted first to obtain the cumulative explained-variance
    ratio. The number of components kept is the smallest one that reaches the
    largest cumulative ratio still below 1, i.e. trailing components that add
    no measurable variance are dropped. At least one component is always kept.

    Parameters
    ----------
    df : array-like of shape (n_samples, n_features)

    Returns
    -------
    ndarray of shape (n_samples, n_components)
    """
    pca = PCA().fit(df)
    variances = np.cumsum(pca.explained_variance_ratio_)
    below_one = variances < 1
    if below_one.any():
        # argmax returns the first position of the maximum, matching a strict
        # ">" scan over the cumulative ratios.
        best_index = int(np.argmax(np.where(below_one, variances, -np.inf)))
        # variances[i] is the variance explained by the first i + 1 components,
        # so the component count is the index plus one.
        n_components = best_index + 1
    else:
        # The first component alone already explains everything (or the ratios
        # are undefined); keep one component instead of zero.
        n_components = 1
    reduced_data = PCA(n_components=n_components).fit_transform(df)
    return reduced_data

In [None]:
def create_pipeline():
    """Return the models to evaluate as a list of ``[name, estimator]`` pairs.

    The estimators are created unfitted, each with ``random_state=0`` and
    ``n_jobs=-1``, in the order: logistic regression, elastic net,
    random forest.
    """
    return [
        ['logistic_regression', LogisticRegressionCV(random_state=0, n_jobs=-1)],
        ['elastic_network', ElasticNetCV(random_state=0, n_jobs=-1)],
        ['random_forest', RandomForestClassifier(random_state=0, n_jobs=-1)],
    ]

In [None]:
def run_pipeline(dfs, dfs_labels, results_dir='results/'):
    """Train and evaluate every model on every cluster, one CSV report each.

    For each cluster the features are reduced with ``run_pca``, split 80/20
    into train and test sets (``random_state=0``), and every model from
    ``create_pipeline`` is fitted and scored on the test split. The per-class
    report is written to ``<results_dir><model>_cluster_<i>.csv``.

    Parameters
    ----------
    dfs : list of DataFrame
        Feature frame of each cluster.
    dfs_labels : list of Series
        Labels of each cluster, aligned with ``dfs``.
    results_dir : str
        Output directory; created if it does not exist.
    """
    pipeline = create_pipeline()

    # to_csv does not create missing directories, so make sure the output
    # directory exists before the first report is written.
    if results_dir and not os.path.isdir(results_dir):
        os.makedirs(results_dir)

    for i, (df, labels) in enumerate(zip(dfs, dfs_labels)):
        # NOTE(review): the PCA is fitted on the whole cluster before the
        # train/test split, so test rows influence the projection.
        reduced_df = run_pca(df)
        X_train, X_test, y_train, y_test = train_test_split(reduced_df, labels, test_size=0.2, random_state=0)
        for name, clf in pipeline:
            filename = os.path.join(results_dir, name+'_cluster_'+str(i)+'.csv')
            clf.fit(X_train, y_train)
            # Threshold at 0.5 so that a model returning continuous scores
            # (the elastic net is a regressor) also yields 0/1 predictions.
            y_preds = [0 if x < 0.5 else 1 for x in clf.predict(X_test)]
            # labels=[0, 1] keeps both classes in the report even when one of
            # them is missing from this test split; without it the two
            # target_names no longer match the classes found and the call fails.
            report = classification_report(y_test, y_preds, labels=[0, 1],
                                           target_names=['Not Readmitted', 'Readmitted'])
            classifaction_report_csv(report, filename)

In [None]:
# Split the labelled data into per-cluster frames, then fit and evaluate every
# model on each cluster (run_pipeline writes one CSV report per model/cluster).
dfs, dfs_labels = preprocess()
run_pipeline(dfs, dfs_labels)