In [4]:
import numpy as np
import pandas as pd
import statsmodels.api as sm 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score

### Diabetes-130

In [11]:
# Load the Diabetes-130 encounters and discard unusable columns/rows.
data = pd.read_csv('data/diabetic_data.csv')

# Columns removed up front and never used by the rest of the pipeline.
data = data.drop(['weight', 'payer_code', 'medical_specialty', 'citoglipton', 'examide'], axis=1)

# Row filters, applied in a single pass.
# NOTE(review): the original repeated the diag_1 filter three times; the
# repeats were no-ops. If diag_2 / diag_3 were meant, that would change the
# row count -- confirm before adding them.
keep_rows = (
    (data['diag_1'] != '?')
    & (data['race'] != '?')
    & (data['discharge_disposition_id'] != 11)
    & (data['gender'] != 'Unknown/Invalid')
)
# .copy() so later column assignments act on an independent frame, not a view.
data = data[keep_rows].copy()

# Medication columns; each holds one of 'No', 'Steady', 'Up', 'Down'.
keys = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
    'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 'acarbose',
    'miglitol', 'insulin', 'glyburide-metformin', 'tolazamide',
    'metformin-pioglitazone', 'metformin-rosiglitazone',
    'glimepiride-pioglitazone', 'glipizide-metformin', 'troglitazone',
    'tolbutamide', 'acetohexamide',
]

# numchange = number of medications whose value is anything other than
# 'No' or 'Steady' for the encounter.
dose_changed = ~data[keys].isin(['No', 'Steady'])
data['numchange'] = dose_changed.sum(axis=1)
    
# Collapse the administrative id codes into coarser groups.
# Each mapping is {original_code: group_code}; codes not listed are kept.
# No group code is itself remapped, so one dict per column is equivalent
# to applying the replacements one at a time.
admission_type_groups = {2: 1, 7: 1, 6: 5, 8: 5}

discharge_groups = {
    6: 1, 8: 1, 9: 1, 13: 1,
    3: 2, 4: 2, 5: 2, 14: 2, 22: 2, 23: 2, 24: 2,
    12: 10, 15: 10, 16: 10, 17: 10,
    25: 18, 26: 18,
}

admission_source_groups = {
    2: 1, 3: 1,
    5: 4, 6: 4, 10: 4, 22: 4, 25: 4,
    15: 9, 17: 9, 20: 9, 21: 9,
    13: 11, 14: 11,
}

data['admission_type_id'] = data['admission_type_id'].replace(admission_type_groups)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(discharge_groups)
data['admission_source_id'] = data['admission_source_id'].replace(admission_source_groups)

# Encode the two-valued columns as 0/1.
data['change'] = data['change'].replace({'Ch': 1, 'No': 0})
data['gender'] = data['gender'].replace({'Male': 1, 'Female': 0})
data['diabetesMed'] = data['diabetesMed'].replace({'Yes': 1, 'No': 0})

# Medication columns: 0 = not prescribed, 1 = prescribed (any dose status).
drug_codes = {'No': 0, 'Steady': 1, 'Up': 1, 'Down': 1}
for col in keys:
    data[col] = data[col].replace(drug_codes)

# Lab results: 1 = elevated, 0 = normal, -99 = test not taken.
# pandas >= 2.0 parses the literal string 'None' as NaN in read_csv, so the
# 'None' -> -99 replacement never matches there; fillna(-99) applies the same
# sentinel in that case. The explicit int cast keeps the dtype (and hence the
# one-hot column names produced later) identical in both situations.
a1c_codes = {'>7': 1, '>8': 1, 'Norm': 0, 'None': -99}
glucose_codes = {'>200': 1, '>300': 1, 'Norm': 0, 'None': -99}
data['A1Cresult'] = data['A1Cresult'].replace(a1c_codes).fillna(-99).astype('int64')
data['max_glu_serum'] = data['max_glu_serum'].replace(glucose_codes).fillna(-99).astype('int64')

# Age brackets '[0-10)', '[10-20)', ..., '[90-100)' become ordinals 1..10.
age_codes = {'[' + str(10 * k) + '-' + str(10 * (k + 1)) + ')': k + 1 for k in range(0, 10)}
data['age'] = data['age'].replace(age_codes)

# Keep only the first encounter listed for each patient.
data = data.drop_duplicates(subset=['patient_nbr'], keep='first')

# Target: 1 when readmitted within 30 days ('<30'), 0 for '>30' and 'NO'.
data['readmitted'] = data['readmitted'].replace({'>30': 0, '<30': 1, 'NO': 0})

# Group the primary diagnosis code (diag_1) into 9 coarse categories (0-8).
# Codes containing 'V' or 'E' and the '?' placeholder fall into category 0.
primary_diag = data['diag_1']
has_letter_code = (
    primary_diag.str.contains('V', regex=False, na=False)
    | primary_diag.str.contains('E', regex=False, na=False)
)
diag_value = primary_diag.mask(has_letter_code, 0).replace('?', -1).astype(float)
diag_whole = np.floor(diag_value)

# (condition, category) pairs. np.select takes the FIRST matching condition,
# which reproduces the priority of the original if/elif chain, without the
# slow row-by-row iterrows loop.
diag_groups = [
    (((diag_value >= 390) & (diag_value < 460)) | (diag_whole == 785), 1),
    (((diag_value >= 460) & (diag_value < 520)) | (diag_whole == 786), 2),
    (((diag_value >= 520) & (diag_value < 580)) | (diag_whole == 787), 3),
    (diag_whole == 250, 4),
    ((diag_value >= 800) & (diag_value < 1000), 5),
    ((diag_value >= 710) & (diag_value < 740), 6),
    (((diag_value >= 580) & (diag_value < 630)) | (diag_whole == 788), 7),
    ((diag_value >= 140) & (diag_value < 240), 8),
]

# Stored as float, as before, so the one-hot column names derived from this
# column later are unchanged.
data['level1_diag1'] = np.select(
    [condition for condition, _ in diag_groups],
    [category for _, category in diag_groups],
    default=0,
).astype(float)
        
# Identifiers carry no predictive signal.
data = data.drop(['encounter_id', 'patient_nbr'], axis=1)

# The 21 medication indicator columns, listed once and reused below.
drug_cols = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
    'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
    'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
    'insulin', 'glyburide-metformin', 'glipizide-metformin',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone',
]

# Columns the analysis treats as categorical.
# The original looped over this list calling data[na].astype('object') and
# threw the result away, so no dtype ever changed; that no-op loop is removed.
# NOTE(review): because of that, 'age' (and 'gender', 'numchange') are still
# numeric here and get z-scored below, so the one-hot columns for age are
# named after z-scores (e.g. 'age_-2.535598020732876'). Left as is because a
# later cell selects a column by that name.
col_names = (
    ['gender', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'A1Cresult']
    + drug_cols
    + ['change', 'diabetesMed', 'age', 'A1Cresult', 'max_glu_serum', 'level1_diag1']
)

# Numeric columns to standardise: every numeric column except the coded ones.
not_scaled = set(drug_cols) | {
    'change', 'diabetesMed', 'readmitted', 'level1_diag1', 'A1Cresult',
    'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
    'max_glu_serum',
}
num_col = [c for c in data.select_dtypes(include='number').columns if c not in not_scaled]

# Z-score each remaining numeric column (sample standard deviation).
data[num_col] = (data[num_col] - data[num_col].mean()) / data[num_col].std()

# Medication indicators and A1Cresult are stored as plain integers.
int_cols = drug_cols + ['A1Cresult']
data[int_cols] = data[int_cols].astype('int64')

# Map a readmitted value of 2 to 0. The earlier encoding only produces 0/1
# for the three labels it handles, so this is a safeguard.
data['readmitted'] = data['readmitted'].replace(2, 0)
data = data.drop(
    columns=['diag_1', 'diag_2', 'diag_3', 'glimepiride-pioglitazone', 'metformin-rosiglitazone']
)

# One-hot encode the categorical columns, keeping every level.
data = pd.get_dummies(
    data,
    columns=['age', 'admission_type_id', 'discharge_disposition_id',
             'admission_source_id', 'max_glu_serum', 'A1Cresult', 'level1_diag1', 'race'],
    drop_first=False,
)

data.columns

Index(['gender', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses', 'metformin', 'repaglinide',
       'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide',
       'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
       'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'numchange', 'age_-3.787224938653078', 'age_-3.161411479692977',
       'age_-2.535598020732876', 'age_-1.909784561772775',
       'age_-1.283971102812674', 'age_-0.658157643852573',
       'age_-0.032344184892472004', 'age_0.593469274067629',
       'age_1.21928273302773', 'age_1.845096191987831', 'admission_type_id_1',
       'admission_type_id_3', 'admission_type_id_4', 'admission_type_id_5',
       'discharge_disposi

In [15]:
# Inspect one of the one-hot age columns (its name embeds the z-scored age value).
data.loc[:, 'age_-2.535598020732876']

0         0
1         0
2         1
3         0
4         0
         ..
101754    0
101755    0
101756    0
101758    0
101765    0
Name: age_-2.535598020732876, Length: 68610, dtype: uint8

In [18]:
# Ten evenly spaced values from 1e-4 to 1e-1, both ends included.
lambdas = np.linspace(start=1e-4, stop=1e-1, num=10)
lambdas

array([0.0001, 0.0112, 0.0223, 0.0334, 0.0445, 0.0556, 0.0667, 0.0778,
       0.0889, 0.1   ])

In [19]:
# Features = every column except the target. A list is used instead of a set:
# indexing a DataFrame with a set is deprecated (an error on pandas >= 2.0)
# and gives an arbitrary column order.
feature_set = [column for column in data.columns if column != 'readmitted']
X = data[feature_set]
y = data['readmitted']

# Balance the classes by randomly dropping majority-class rows.
# NOTE(review): resampling happens before the train/test split, so the test
# set is balanced too and the metrics describe the balanced sample rather than
# the original class mix -- confirm this is intended.
undersample = RandomUnderSampler(random_state=42)
new_X, new_y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(new_X, new_y, test_size=0.20, random_state=42)

# L2-regularised logistic regression; C is written as 1 / 0.1.
logit = LogisticRegression(C=1.0/0.1, fit_intercept=True, max_iter=1000, penalty='l2', solver='lbfgs')
logit.fit(X_train, y_train)

logit_pred = logit.predict(X_test)

print("Accuracy is {0:.2f}".format(accuracy_score(y_test, logit_pred)))
print("Precision is {0:.2f}".format(precision_score(y_test, logit_pred, zero_division=0)))
print("Recall is {0:.2f}".format(recall_score(y_test, logit_pred)))

# Confusion table. Plain arrays are passed so rows pair up by position; the
# original wrapped y_test (shuffled index) and logit_pred (0..n-1 index) in
# Series, which crosstab aligns by index label. Placed last so it is displayed.
pd.crosstab(np.asarray(y_test), logit_pred, rownames=['Actual'], colnames=['Predict'], margins=True)

  X = data[feature_set]


Accuracy is 0.61
Precision is 0.59
Recall is 0.57


### Adult

In [62]:
# Column names for the header-less Adult data file.
features = [
    'age', 'workclass', 'final_weight', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'income_class',
]

# Per-column markers that should be read as missing values.
missing_markers = {
    'capital_gain': 99999,
    'workclass': '?',
    'native_country': '?',
    'occupation': '?',
}

data = pd.read_csv(
    'data/adult.data',
    names=features,
    na_values=missing_markers,
    comment='|',
    skipinitialspace=True,
)
print(len(data))

# Strip the trailing '.' that some income labels carry.
data = data.replace({'<=50K.' : '<=50K', '>50K.' : '>50K'})

# Drop rows with placeholder workclass/occupation and the 'Without-pay' class.
usable_rows = (
    (data['workclass'] != '?')
    & (data['occupation'] != '?')
    & (data['workclass'] != 'Without-pay')
)
data = data[usable_rows].reset_index(drop=True)

# Merge fine-grained employment classes into three groups.
data['workclass'] = data['workclass'].replace({
    'State-gov': 'Government',
    'Federal-gov': 'Government',
    'Local-gov': 'Government',
    'Self-emp-not-inc': 'Self-Employed',
    'Self-emp-inc': 'Self-Employed',
    'Private': 'Privately-Employed',
})

data['occupation'] = data['occupation'].replace('Armed-Forces', 'Protective-serv')

# Marital status becomes a two-level variable.
data['marital_status'] = data['marital_status'].replace({
    'Married-AF-spouse': 'Married',
    'Married-civ-spouse': 'Married',
    'Married-spouse-absent': 'Married',
    'Divorced': 'Not-married',
    'Never-married': 'Not-married',
    'Separated': 'Not-married',
    'Widowed': 'Not-married',
})

# Education label is rewritten from education_num; rows matching no rule
# keep their existing label.
education_levels = [
    (data['education_num'] <= 8, 'Less than High School'),
    (data['education_num'].isin([9, 10]), 'High School'),
    (data['education_num'].isin([11, 12]), 'Associates'),
    (data['education_num'].isin([13]), 'Bachelors'),
    (data['education_num'].isin([14]), 'Masters'),
    (data['education_num'].isin([15, 16]), 'PhD/Professional'),
]
for level_rows, level_name in education_levels:
    data.loc[level_rows, 'education'] = level_name

# Remove the sampling weight, de-duplicate, drop education_num, then discard
# any row that still has a missing value.
data = (
    data
    .drop('final_weight', axis=1)
    .drop_duplicates()
    .drop('education_num', axis=1)
    .dropna(how='any', axis=0)
)
data['capital_gain'] = data['capital_gain'].astype(int)

# One-hot encode the nominal columns, keeping every level.
to_replace = ['workclass', 'education', 'marital_status', 'occupation','relationship', 'race', 'native_country']
data = pd.get_dummies(data, columns=to_replace, drop_first = False)

# Two-level columns become integer category codes.
for binary_col in ('sex', 'income_class'):
    data[binary_col] = data[binary_col].astype('category').cat.codes

# Z-score the continuous columns (sample standard deviation).
for numeric_col in ('age', 'hours_per_week', 'capital_gain', 'capital_loss'):
    centred = data[numeric_col] - data[numeric_col].mean()
    data[numeric_col] = centred / data[numeric_col].std()

data = data.drop('native_country_Holand-Netherlands', axis=1)

32561


In [65]:
# Features = every column except the target. A list is used instead of a set:
# indexing a DataFrame with a set is deprecated (an error on pandas >= 2.0)
# and gives an arbitrary column order.
feature_set = [column for column in data.columns if column != 'income_class']
X = data[feature_set]
y = data['income_class']
print(len(X))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# L2-regularised logistic regression.
logit = LogisticRegression(C=1.0, fit_intercept=True, max_iter=1000, penalty='l2', solver='lbfgs')
logit.fit(X_train, y_train)

logit_pred = logit.predict(X_test)

print("Accuracy is {0:.2f}".format(accuracy_score(y_test, logit_pred)))
print("Precision is {0:.2f}".format(precision_score(y_test, logit_pred, zero_division=0)))
print("Recall is {0:.2f}".format(recall_score(y_test, logit_pred)))

# Confusion table. Plain arrays are passed so rows pair up by position; the
# original wrapped y_test (shuffled index) and logit_pred (0..n-1 index) in
# Series, which crosstab aligns by index label. Placed last so it is displayed.
pd.crosstab(np.asarray(y_test), logit_pred, rownames=['Actual'], colnames=['Predict'], margins=True)

  X = data[feature_set]


26223
Accuracy is 0.84
Precision is 0.73
Recall is 0.61


### Law School

In [49]:
# Load the law school admissions file and report how many columns it has.
data = pd.read_csv('data/lawschs1_1.csv')
print(data.shape[1])

15


In [50]:
# Keep rows whose MissingRace flag is not 1, remove the columns that are not
# used as features, then drop any row with a missing value.
data = (
    data[data['MissingRace'] != 1]
    .drop(columns=['Race', 'MissingRace', 'college', 'Year'])
    .dropna(how='any', axis=0)
)

In [51]:
# One-hot encode LSAT, GPA and Gender: every distinct value gets its own
# indicator column, which is why the column count grows so much.
to_replace = ['LSAT', 'GPA', 'Gender']
data = pd.get_dummies(data=data, columns=to_replace, drop_first=False)
print(data.shape[1])

420


In [52]:
from matplotlib.colors import ListedColormap

# 50-step light palette (HUSL colour space) turned into a discrete colormap.
husl_shades = sns.light_palette((250, 100, 50), input="husl", n_colors=50)
my_cmap = ListedColormap(husl_shades.as_hex())

# Pearson correlation between every pair of columns, shaded column-wise.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt;
# consider restricting it to a subset of columns.
table = data.corr(method='pearson')
table.style.background_gradient(cmap=my_cmap, axis=0)

KeyboardInterrupt: 

In [53]:
# Features = every column except the target. A list is used instead of a set:
# indexing a DataFrame with a set is deprecated (an error on pandas >= 2.0)
# and gives an arbitrary column order.
feature_set = [column for column in data.columns if column != 'admit']
X = data[feature_set]
y = data['admit']
print(len(X))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# L2-regularised logistic regression.
logit = LogisticRegression(C=1.0, fit_intercept=True, max_iter=1000, penalty='l2', solver='lbfgs')
logit.fit(X_train, y_train)

logit_pred = logit.predict(X_test)

print("Accuracy is {0:.2f}".format(accuracy_score(y_test, logit_pred)))
print("Precision is {0:.2f}".format(precision_score(y_test, logit_pred, zero_division=0)))
print("Recall is {0:.2f}".format(recall_score(y_test, logit_pred)))

# Confusion table. Plain arrays are passed so rows pair up by position; the
# original wrapped y_test (shuffled index) and logit_pred (0..n-1 index) in
# Series, which crosstab aligns by index label. Placed last so it is displayed.
pd.crosstab(np.asarray(y_test), logit_pred, rownames=['Actual'], colnames=['Predict'], margins=True)

  X = data[feature_set]


24754
Accuracy is 0.73
Precision is 0.73
Recall is 0.82


### Ricci

In [144]:
# Load the Ricci data and drop the 'Oral' and 'Combine' columns; the
# '*_round' columns are the ones binned in the following cells.
data = pd.read_csv('data/ricci.csv').drop(columns=['Oral', 'Combine'])

In [145]:
# Bucket Oral_round into ten-point bands: '0-9', '10-19', ..., '80-89', '90-100'.
# Values above 100 match no band and are left unset.
oral_score = data['Oral_round']
data.loc[oral_score <= 9, 'Oral_bin'] = '0-9'
for band_low in range(10, 90, 10):
    band_high = band_low + 9
    in_band = (oral_score <= band_high) & (oral_score > band_low - 1)
    data.loc[in_band, 'Oral_bin'] = f'{band_low}-{band_high}'
data.loc[(oral_score <= 100) & (oral_score > 89), 'Oral_bin'] = '90-100'

In [146]:
# Bucket Combine_round into ten-point bands: '0-9', '10-19', ..., '80-89', '90-100'.
# Values above 100 match no band and are left unset.
combine_score = data['Combine_round']
data.loc[combine_score <= 9, 'Combine_bin'] = '0-9'
for band_low in range(10, 90, 10):
    band_high = band_low + 9
    in_band = (combine_score <= band_high) & (combine_score > band_low - 1)
    data.loc[in_band, 'Combine_bin'] = f'{band_low}-{band_high}'
data.loc[(combine_score <= 100) & (combine_score > 89), 'Combine_bin'] = '90-100'

In [147]:
# Bucket Written into ten-point bands: '0-9', '10-19', ..., '80-89', '90-100'.
# Values above 100 match no band and are left unset.
written_score = data['Written']
data.loc[written_score <= 9, 'Written_bin'] = '0-9'
for band_low in range(10, 90, 10):
    band_high = band_low + 9
    in_band = (written_score <= band_high) & (written_score > band_low - 1)
    data.loc[in_band, 'Written_bin'] = f'{band_low}-{band_high}'
data.loc[(written_score <= 100) & (written_score > 89), 'Written_bin'] = '90-100'

In [148]:
# Recode the promote label in place: 'y' -> 1, 'n' -> 0.
for promote_label, promote_flag in (('y', 1), ('n', 0)):
    data.loc[data['promote'] == promote_label, 'promote'] = promote_flag

In [149]:
# The raw score columns are no longer needed once the *_bin columns exist.
data = data.drop(columns=['Oral_round', 'Combine_round', 'Written'])
data

Unnamed: 0,Race,Position,promote,Oral_bin,Combine_bin,Written_bin
0,W,Captain,1,90-100,90-100,90-100
1,W,Captain,1,80-89,80-89,90-100
2,W,Captain,1,80-89,80-89,80-89
3,W,Captain,1,80-89,80-89,70-79
4,W,Captain,1,70-79,80-89,80-89
...,...,...,...,...,...,...
113,H,Lieutenant,0,40-49,50-59,60-69
114,B,Lieutenant,0,50-59,50-59,50-59
115,H,Lieutenant,0,40-49,50-59,50-59
116,B,Lieutenant,0,50-59,50-59,40-49


In [150]:
# One-hot encode race and the three binned scores, keeping every level.
to_replace = ['Race', 'Oral_bin', 'Combine_bin', 'Written_bin']
data = pd.get_dummies(data=data, columns=to_replace, drop_first=False)

# Position becomes an integer category code; promote a plain int64.
data['Position'] = pd.Categorical(data['Position']).codes
data = data.astype({'promote': 'int64'})

In [153]:
# Number of columns after encoding.
data.shape[1]

23

In [157]:
# Shuffle the rows. The seed is fixed so that re-running the cell gives the
# same split and the same metrics (the original shuffle was unseeded).
data = data.sample(frac=1, random_state=42)

# Features = every column except the target. A list is used instead of a set:
# indexing a DataFrame with a set is deprecated (an error on pandas >= 2.0)
# and gives an arbitrary column order.
feature_set = [column for column in data.columns if column != 'promote']
X = data[feature_set]
y = data['promote']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# L2-regularised logistic regression.
# NOTE(review): the scores recorded for this cell are all 1.00, which suggests
# some feature (possibly the Combine_bin indicators) fully determines
# 'promote' -- check for target leakage.
logit = LogisticRegression(C=1.0, fit_intercept=True, max_iter=1000, penalty='l2', solver='lbfgs')
logit.fit(X_train, y_train)

logit_pred = logit.predict(X_test)

print("Accuracy is {0:.2f}".format(accuracy_score(y_test, logit_pred)))
print("Precision is {0:.2f}".format(precision_score(y_test, logit_pred, zero_division=0)))
print("Recall is {0:.2f}".format(recall_score(y_test, logit_pred)))

# Confusion table. Plain arrays are passed so rows pair up by position; the
# original wrapped y_test (shuffled index) and logit_pred (0..n-1 index) in
# Series, which crosstab aligns by index label. Placed last so it is displayed.
pd.crosstab(np.asarray(y_test), logit_pred, rownames=['Actual'], colnames=['Predict'], margins=True)

Accuracy is 1.00
Precision is 1.00
Recall is 1.00


  X = data[feature_set]
