In [283]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, cross_val_score
from sklearn.feature_selection import RFECV, SelectKBest, chi2, f_classif
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier, BaggingClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PowerTransformer, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer, SimpleImputer, MissingIndicator
from sklearn.naive_bayes import GaussianNB
from tqdm import tqdm
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from xgboost import XGBClassifier

- Importing files:

In [284]:
# Load the labelled training data from the notebook's working directory.
df_train = pd.read_csv(filepath_or_buffer='train.csv')

In [285]:
# Load the test data (no `Deceased` column) from the notebook's working directory.
df_test = pd.read_csv(filepath_or_buffer='test.csv')

In [286]:
# Show the raw training frame as loaded; the rendering is truncated to the
# first and last rows.
df_train

Unnamed: 0,Patient_ID,Family_Case_ID,Severity,Name,Birthday_year,Parents or siblings infected,Wife/Husband or children infected,Medical_Expenses_Family,Medical_Tent,City,Deceased
0,1,4696,3,Miss Linda Betty,,0,0,225,,Santa Fe,1
1,2,21436,1,Ms. Ramona Elvira,1966.0,0,1,1663,,Albuquerque,0
2,3,7273,3,Mr. Mario Vernon,1982.0,0,0,221,,Santa Fe,1
3,4,8226,3,Mr. Hector Joe,1997.0,0,0,220,,Santa Fe,1
4,5,19689,3,Ms. Jennie Debra,1994.0,0,0,222,,Santa Fe,0
...,...,...,...,...,...,...,...,...,...,...,...
895,896,6253,3,Ms. Linda Wilcox,1998.0,1,1,344,,Santa Fe,0
896,897,6483,3,Mr. Haiden Vance,2006.0,0,0,258,,Santa Fe,0
897,898,981,3,Miss Anaiya Love,1990.0,0,0,214,,Taos,1
898,899,16418,2,Mr. Robert Williams,1994.0,1,1,812,,Santa Fe,0


- Splitting title, first and last names:

In [287]:
# Split "<Title> <First_Name> <Last_Name...>" in a single vectorised pass:
# the first two whitespace-separated tokens become Title and First_Name, and
# everything after the second whitespace becomes Last_Name.
name_parts = df_train['Name'].str.extract(r"(.+?)\s(.+?)\s(.+)")
name_parts.columns = ['Title', 'First_Name', 'Last_Name']

# Fail with a readable message (listing the offending names) instead of an
# opaque "'NoneType' object has no attribute 'group'" when a name does not
# have at least three tokens.
unparsed = name_parts.isna().any(axis=1)
if unparsed.any():
    raise ValueError(
        "Cannot split Name into title/first/last for: {}".format(
            df_train.loc[unparsed, 'Name'].tolist()
        )
    )

for column in name_parts.columns:
    df_train[column] = name_parts[column]

In [288]:
# Split "<Title> <First_Name> <Last_Name...>" in a single vectorised pass:
# the first two whitespace-separated tokens become Title and First_Name, and
# everything after the second whitespace becomes Last_Name.
name_parts = df_test['Name'].str.extract(r"(.+?)\s(.+?)\s(.+)")
name_parts.columns = ['Title', 'First_Name', 'Last_Name']

# Fail with a readable message (listing the offending names) instead of an
# opaque "'NoneType' object has no attribute 'group'" when a name does not
# have at least three tokens.
unparsed = name_parts.isna().any(axis=1)
if unparsed.any():
    raise ValueError(
        "Cannot split Name into title/first/last for: {}".format(
            df_test.loc[unparsed, 'Name'].tolist()
        )
    )

for column in name_parts.columns:
    df_test[column] = name_parts[column]

- Creating gender:

In [289]:
# Title -> gender lookup. It lists both 'Ms.' and 'Mrs.' so that the same
# mapping is valid for the training and the test file; a title missing from
# the dictionary would be mapped to NaN by Series.map.
gender_dictionary = {'Miss': 'F', 'Ms.': 'F', 'Mrs.': 'F', 'Mr.': 'M', 'Master': 'M'}
df_train.loc[:, 'gender'] = df_train.loc[:, 'Title'].map(gender_dictionary)

In [290]:
# Title -> gender lookup. It lists both 'Ms.' and 'Mrs.' so that the same
# mapping is valid for the training and the test file; a title missing from
# the dictionary would be mapped to NaN by Series.map.
gender_dictionary = {'Miss': 'F', 'Ms.': 'F', 'Mrs.': 'F', 'Mr.': 'M', 'Master': 'M'}
df_test.loc[:, 'gender'] = df_test.loc[:, 'Title'].map(gender_dictionary)

- Filling missing values:

__Medical_Tent__

In [291]:
# Rows without a recorded tent get the explicit category 'No_tent'.
df_train['Medical_Tent'] = df_train['Medical_Tent'].fillna('No_tent')

In [292]:
# Rows without a recorded tent get the explicit category 'No_tent'.
df_test['Medical_Tent'] = df_test['Medical_Tent'].fillna('No_tent')

__Birthday_year__

In [293]:
# Learn the mean birth year from the training rows only, then fill the gaps
# in the training frame; the result is truncated to whole years.
birth_year_columns = ['Birthday_year']
imputer = SimpleImputer(strategy='mean').fit(df_train[birth_year_columns])
train_birth_year = imputer.transform(df_train[birth_year_columns])
df_train.loc[:, 'Birthday_year'] = train_birth_year.astype(int)

In [294]:
# Reuse the imputer fitted on the training data so the test set is filled
# with the training mean, then truncate to whole years.
test_birth_year = imputer.transform(df_test[['Birthday_year']])
df_test.loc[:, 'Birthday_year'] = test_birth_year.astype(int)

__City__

In [295]:
# Rows with no City recorded default to 'Santa Fe'. The same fill is applied
# to the test frame so that the label encoder fitted on the training cities
# is never asked to transform a missing value.
df_train.loc[:, 'City'] = df_train.loc[:, 'City'].fillna('Santa Fe')
df_test.loc[:, 'City'] = df_test.loc[:, 'City'].fillna('Santa Fe')

- Encoding:

In [296]:
# Integer-encode each categorical column. The encoder is fitted on the
# training values only and then applied to both frames, so a category that
# appears only in the test set raises an error here.
for feature in ('Medical_Tent', 'City', 'gender'):
    encoder = LabelEncoder().fit(df_train.loc[:, feature])
    for frame in (df_train, df_test):
        frame.loc[:, feature] = encoder.transform(frame.loc[:, feature])

- Creating expenses per capita from Medical_Expenses_Family (family expenses divided by the number of infected relatives plus the patient):

In [297]:
# Household size = the patient plus every infected relative on record.
household_size_train = (
    df_train['Parents or siblings infected']
    + df_train['Wife/Husband or children infected']
    + 1
)
df_train.loc[:, 'Expenses_per_capita'] = df_train['Medical_Expenses_Family'] / household_size_train

In [298]:
# Household size = the patient plus every infected relative on record.
household_size_test = (
    df_test['Parents or siblings infected']
    + df_test['Wife/Husband or children infected']
    + 1
)
df_test.loc[:, 'Expenses_per_capita'] = df_test['Medical_Expenses_Family'] / household_size_test

In [299]:
# Inspect the training frame after imputation, label encoding and the new
# Title / First_Name / Last_Name / gender / Expenses_per_capita columns.
df_train

Unnamed: 0,Patient_ID,Family_Case_ID,Severity,Name,Birthday_year,Parents or siblings infected,Wife/Husband or children infected,Medical_Expenses_Family,Medical_Tent,City,Deceased,Title,First_Name,Last_Name,gender,Expenses_per_capita
0,1,4696,3,Miss Linda Betty,1990,0,0,225,7,1,1,Miss,Linda,Betty,0,225.000000
1,2,21436,1,Ms. Ramona Elvira,1966,0,1,1663,7,0,0,Ms.,Ramona,Elvira,0,831.500000
2,3,7273,3,Mr. Mario Vernon,1982,0,0,221,7,1,1,Mr.,Mario,Vernon,1,221.000000
3,4,8226,3,Mr. Hector Joe,1997,0,0,220,7,1,1,Mr.,Hector,Joe,1,220.000000
4,5,19689,3,Ms. Jennie Debra,1994,0,0,222,7,1,0,Ms.,Jennie,Debra,0,222.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,896,6253,3,Ms. Linda Wilcox,1998,1,1,344,7,1,0,Ms.,Linda,Wilcox,0,114.666667
896,897,6483,3,Mr. Haiden Vance,2006,0,0,258,7,1,0,Mr.,Haiden,Vance,1,258.000000
897,898,981,3,Miss Anaiya Love,1990,0,0,214,7,2,1,Miss,Anaiya,Love,0,214.000000
898,899,16418,2,Mr. Robert Williams,1994,1,1,812,7,1,0,Mr.,Robert,Williams,1,270.666667


In [300]:
# Inspect the test frame after the same preprocessing steps (it has no
# `Deceased` column).
df_test

Unnamed: 0,Patient_ID,Family_Case_ID,Severity,Name,Birthday_year,Parents or siblings infected,Wife/Husband or children infected,Medical_Expenses_Family,Medical_Tent,City,Title,First_Name,Last_Name,gender,Expenses_per_capita
0,901,49242,3,Mr. Jody Pedro,1990,0,0,203,7,1,Mr.,Jody,Pedro,1,203.0
1,902,10400,3,Mr. Kevin Brent,1988,0,0,631,7,1,Mr.,Kevin,Brent,1,631.0
2,903,10795,3,Mr. Frankie Cary,1981,1,0,376,7,0,Mr.,Frankie,Cary,1,188.0
3,904,62440,3,Mr. Rick Pete,1990,0,1,405,7,0,Mr.,Rick,Pete,1,202.5
4,905,81311,2,Mr. Matthew Erick,1996,0,0,378,7,1,Mr.,Matthew,Erick,1,378.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,1296,110522,3,Mr. Luther Rogelio,1990,0,0,221,7,1,Mr.,Luther,Rogelio,1,221.0
396,1297,118768,3,Mr. Emanuel Ruben,1990,0,0,202,7,0,Mr.,Emanuel,Ruben,1,202.0
397,1298,86158,1,Mrs. Misty Camille,1994,0,1,3830,2,0,Mrs.,Misty,Camille,0,1915.0
398,1299,18523,3,Master Gustavo Jordan,2007,2,0,567,7,1,Master,Gustavo,Jordan,1,189.0


- Dropping some variables to train the model:

In [301]:
# Target vector.
y = df_train['Deceased']

# Identifiers, name-derived text columns, the target itself and the raw family
# expenses (replaced by Expenses_per_capita) are excluded from the inputs.
train_columns_to_drop = [
    'Patient_ID', 'Family_Case_ID',
    'Name', 'Title', 'First_Name', 'Last_Name',
    'Deceased',
    'Medical_Expenses_Family',
]
X = df_train.drop(columns=train_columns_to_drop)

In [302]:
# Same exclusions as for the training inputs, minus the target, which the
# test frame does not contain.
test_columns_to_drop = [
    'Patient_ID', 'Family_Case_ID',
    'Name', 'Title', 'First_Name', 'Last_Name',
    'Medical_Expenses_Family',
]
X_final = df_test.drop(columns=test_columns_to_drop)

In [303]:
# Inspect the feature frame before scaling.
X

Unnamed: 0,Severity,Birthday_year,Parents or siblings infected,Wife/Husband or children infected,Medical_Tent,City,gender,Expenses_per_capita
0,3,1990,0,0,7,1,0,225.000000
1,1,1966,0,1,7,0,0,831.500000
2,3,1982,0,0,7,1,1,221.000000
3,3,1997,0,0,7,1,1,220.000000
4,3,1994,0,0,7,1,0,222.000000
...,...,...,...,...,...,...,...,...
895,3,1998,1,1,7,1,0,114.666667
896,3,2006,0,0,7,1,1,258.000000
897,3,1990,0,0,7,2,0,214.000000
898,2,1994,1,1,7,1,1,270.666667


In [304]:
# Min-max scale every feature using ranges learned from the training rows,
# then apply the same ranges to the submission inputs. From here on X and
# X_final are NumPy arrays, not DataFrames.
# NOTE(review): the scaler is fitted on all of X before the train/val/test
# split below, so held-out rows contribute to the scaling ranges.
scaler = MinMaxScaler().fit(X)
X = scaler.transform(X)
X_final = scaler.transform(X_final)

In [334]:
# First row of the scaled feature matrix (one value per feature column).
X[0]

array([1.        , 0.63291139, 0.        , 0.        , 0.875     ,
       0.5       , 0.        , 0.01568491])

In [306]:

from copy import deepcopy
from sklearn.model_selection import train_test_split

import keras

from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding, Bidirectional
from keras.optimizers import Adam

In [352]:
# Hold out 10% of the labelled rows as a final test set. The split is
# stratified on the target and seeded, so re-running the notebook yields the
# same partition and therefore comparable evaluation scores.
X2, X_test, y2, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=55,
    shuffle=True,
    stratify=y,
)

In [353]:
# Carve a validation set (10% of the remaining rows) out of the non-test
# data. Stratified on the target and seeded for a reproducible partition.
X_train, X_val, y_train, y_val = train_test_split(
    X2,
    y2,
    test_size=0.1,
    random_state=55,
    shuffle=True,
    stratify=y2,
)

In [385]:
def build_model(input_size, output_size, hidden_units=4, n_hidden_layers=4):
    """Build and compile a small fully connected binary classifier.

    The network is a stack of ``n_hidden_layers`` ReLU ``Dense`` layers of
    ``hidden_units`` units each, followed by a sigmoid ``Dense`` output layer.
    With the default arguments the architecture is the original 4 x 4 network.

    Parameters
    ----------
    input_size : int
        Number of input features.
    output_size : int
        Number of units in the sigmoid output layer.
    hidden_units : int, default 4
        Width of every hidden layer.
    n_hidden_layers : int, default 4
        Number of hidden layers; must be at least 1.

    Returns
    -------
    keras.models.Sequential
        Model compiled with the Adam optimizer, binary cross-entropy loss and
        accuracy as the reported metric.

    Raises
    ------
    ValueError
        If ``n_hidden_layers`` is smaller than 1.
    """
    if n_hidden_layers < 1:
        raise ValueError("n_hidden_layers must be at least 1")

    model = Sequential()
    # Only the first layer declares the input dimension; the following layers
    # take their input shape from the layer before them.
    model.add(Dense(hidden_units, input_dim=input_size, activation='relu'))
    for _ in range(n_hidden_layers - 1):
        model.add(Dense(hidden_units, activation='relu'))
    model.add(Dense(output_size, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [390]:
# One input per feature column, a single sigmoid output.
model = build_model(input_size=X_train.shape[1], output_size=1)

# Train one sample at a time for 12 epochs, reporting validation metrics
# after every epoch.
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=12,
    batch_size=1,
    verbose=1,
)


Train on 729 samples, validate on 81 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.callbacks.History at 0x15686a490>

In [392]:
# Score the network on the held-out test split; returns [loss, accuracy].
model.evaluate(x=X_test, y=y_test)



[0.5422683583365546, 0.7333333492279053]

In [345]:

# Refit a fresh network on every labelled row before predicting the
# submission set. The input width is read from X itself, so this cell does
# not depend on the train/validation split having been run.
model = build_model(X.shape[1], 1)
model.fit(X, y, batch_size=8, epochs=50, verbose=1)

# The model has one sigmoid output per row; flatten the (n, 1) result to 1-D.
pred = model.predict(X_final).flatten()


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [346]:
# Turn the sigmoid outputs into hard labels in place: below 0.5 -> 0,
# otherwise 1. Vectorised equivalent of looping over every element.
pred[:] = np.where(pred < .5, 0, 1)

In [347]:
# The labels are stored as floats at this point; convert them to integers.
pred = pred.astype(int)

In [348]:
# One row per test patient: predictions as the data, Patient_ID as the index.
version3 = pd.DataFrame(data=pred, index=df_test['Patient_ID'].values)

In [349]:
# Name the single prediction column and the index so the CSV gets the
# header "Patient_ID,Deceased".
version3 = (
    version3
    .rename(columns={0: "Deceased"})
    .rename_axis("Patient_ID")
)

In [350]:
# Inspect the submission frame: predicted `Deceased` label indexed by Patient_ID.
version3

Unnamed: 0_level_0,Deceased
Patient_ID,Unnamed: 1_level_1
901,1
902,1
903,1
904,1
905,1
...,...
1296,1
1297,1
1298,0
1299,1


In [351]:
import os

# Create the output folder if it is missing so that the export does not fail
# on a fresh checkout, then write the submission with Patient_ID as the
# index column.
os.makedirs('versions', exist_ok=True)
version3.to_csv('versions/Group6__version_alex.csv', index=True)

In [49]:
# Shallow boosted trees used as the base learner of the bagging ensemble.
xgb_base_learner = XGBClassifier(colsample_bytree=1.0, eta=0.63, max_depth=2)

# Bag ten copies of the base learner; n_estimators, max_features and
# bootstrap are the values the grid search overrides.
bagged_xgb = BaggingClassifier(
    base_estimator=xgb_base_learner,
    n_estimators=10,
    max_features=1.0,
    bootstrap=True,
    random_state=42,
)

# Scaling step followed by the classifier; the scaler is itself a tunable step.
pipe = Pipeline(steps=[('scaler', MinMaxScaler()), ('classifier', bagged_xgb)])

In [50]:
# Candidate replacements for the pipeline's 'scaler' step.
scaler_candidates = [
    MinMaxScaler(),
    MinMaxScaler(feature_range=(-1, 1)),
    RobustScaler(),
    StandardScaler(),
    PowerTransformer(),
]

# Full grid: 5 scalers x 45 ensemble sizes (5..49) x 7 feature counts (2..8)
# x 2 bootstrap settings.
parameter_space = {
    'scaler': scaler_candidates,
    'classifier__n_estimators': list(np.arange(5, 50)),
    'classifier__max_features': list(np.arange(2, 9)),
    'classifier__bootstrap': [True, False],
}

In [51]:
# Exhaustive search over parameter_space, scored by 5-fold cross-validated
# accuracy and run on all available cores; training-fold scores are kept too.
gridsearch = GridSearchCV(
    estimator=pipe,
    param_grid=parameter_space,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
)

In [52]:
# Run the search on all labelled rows. `fit` returns the search object
# itself, so `best_model` is the fitted GridSearchCV.
gridsearch.fit(X, y)
best_model = gridsearch

Fitting 5 folds for each of 3150 candidates, totalling 15750 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   30.8s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed: 14.8min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 17.9min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed: 22.3min
[Parallel(n_jobs=-1)]: Done 5816 tasks      | elapsed: 27.7min
[Parallel(n_jobs=-1)]: Done 6829 tasks      | elapsed: 33.9min
[Parallel(n_jobs=-1)]: Done 7922 tasks      | elapsed: 40.5min
[Parallel(n_jobs=-1)]: Done 9097 tasks      | e

In [55]:
# Parameter combination selected by the grid search (scored on accuracy).
best_model.best_params_

{'classifier__bootstrap': False,
 'classifier__max_features': 7,
 'classifier__n_estimators': 9,
 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}

In [58]:
# Report the cross-validated accuracy of the selected candidate: the mean
# first, then one line per fold. The fold count is read from the fitted
# search so this stays correct if `cv` changes.
cv_results = best_model.cv_results_
best_idx = best_model.best_index_

print('Mean: ', cv_results['mean_test_score'][best_idx])
for split in range(best_model.n_splits_):
    print(
        'Split {}: '.format(split + 1),
        cv_results['split{}_test_score'.format(split)][best_idx],
    )

Mean:  0.8400000000000001
Split 1:  0.8666666666666667
Split 2:  0.8333333333333334
Split 3:  0.8444444444444444
Split 4:  0.8333333333333334
Split 5:  0.8222222222222222


In [60]:
# Predict the submission rows with the fitted grid-search object.
pred_final_model_3 = best_model.predict(X=X_final)

In [61]:
# Two-column submission frame; this rebinds `version3`, replacing the
# neural-network submission built earlier in the notebook.
submission_columns = {
    'Patient_ID': list(df_test['Patient_ID'].values),
    'Deceased': list(pred_final_model_3),
}
version3 = pd.DataFrame(submission_columns)

In [62]:
# Write the submission to the working directory without the row index.
version3.to_csv(path_or_buf='m20190922_version3.csv', index=False)