In [746]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, cross_val_score
from sklearn.feature_selection import RFECV, SelectKBest, chi2, f_classif
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier, BaggingClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PowerTransformer, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer, SimpleImputer, MissingIndicator
from sklearn.naive_bayes import GaussianNB
from tqdm import tqdm
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from xgboost import XGBClassifier

- Importing files:

In [771]:
df_train = pd.read_csv(r'train.csv')

In [772]:
df_test = pd.read_csv(r'test.csv')

In [773]:
df_train

Unnamed: 0,Patient_ID,Family_Case_ID,Severity,Name,Birthday_year,Parents or siblings infected,Wife/Husband or children infected,Medical_Expenses_Family,Medical_Tent,City,Deceased
0,1,4696,3,Miss Linda Betty,,0,0,225,,Santa Fe,1
1,2,21436,1,Ms. Ramona Elvira,1966.0,0,1,1663,,Albuquerque,0
2,3,7273,3,Mr. Mario Vernon,1982.0,0,0,221,,Santa Fe,1
3,4,8226,3,Mr. Hector Joe,1997.0,0,0,220,,Santa Fe,1
4,5,19689,3,Ms. Jennie Debra,1994.0,0,0,222,,Santa Fe,0
...,...,...,...,...,...,...,...,...,...,...,...
895,896,6253,3,Ms. Linda Wilcox,1998.0,1,1,344,,Santa Fe,0
896,897,6483,3,Mr. Haiden Vance,2006.0,0,0,258,,Santa Fe,0
897,898,981,3,Miss Anaiya Love,1990.0,0,0,214,,Taos,1
898,899,16418,2,Mr. Robert Williams,1994.0,1,1,812,,Santa Fe,0


- Splitting title, first and last names:

In [774]:
df_train.loc[:, 'Title'] = df_train.loc[:,'Name'].apply(lambda x: re.search(r"(.+?)\s(.+?)\s(.+)", x).group(1))
df_train.loc[:, 'First_Name'] = df_train.loc[:,'Name'].apply(lambda x: re.search(r"(.+?)\s(.+?)\s(.+)", x).group(2))
df_train.loc[:, 'Last_Name'] = df_train.loc[:,'Name'].apply(lambda x: re.search(r"(.+?)\s(.+?)\s(.+)", x).group(3))

In [775]:
df_test.loc[:, 'Title'] = df_test.loc[:,'Name'].apply(lambda x: re.search(r"(.+?)\s(.+?)\s(.+)", x).group(1))
df_test.loc[:, 'First_Name'] = df_test.loc[:,'Name'].apply(lambda x: re.search(r"(.+?)\s(.+?)\s(.+)", x).group(2))
df_test.loc[:, 'Last_Name'] = df_test.loc[:,'Name'].apply(lambda x: re.search(r"(.+?)\s(.+?)\s(.+)", x).group(3))

- Creating gender:

In [776]:
gender_dictionary ={'Miss' : 'F', 'Ms.' : 'F', 'Mr.' : 'M', 'Master' : 'M'} 
df_train.loc[:,'gender'] = df_train.loc[:,'Title'].map(gender_dictionary) 

In [777]:
gender_dictionary ={'Miss' : 'F', 'Mrs.' : 'F', 'Mr.' : 'M', 'Master' : 'M'} 
df_test.loc[:,'gender'] = df_test.loc[:,'Title'].map(gender_dictionary) 

- Filling missing values:

__Medical_tent__

In [778]:
df_train.Medical_Tent = df_train.Medical_Tent.fillna('No_tent')

In [779]:
df_test.Medical_Tent = df_test.Medical_Tent.fillna('No_tent')

__Birthday_year__

In [780]:
imputer = SimpleImputer(strategy='mean')
imputer.fit(df_train.loc[:,['Birthday_year']])
df_train.loc[:,'Birthday_year'] = imputer.transform(df_train.loc[:,['Birthday_year']]).astype(int)

In [781]:
df_test.loc[:,'Birthday_year'] = imputer.transform(df_test.loc[:,['Birthday_year']]).astype(int)

__City__

In [782]:
df_train.loc[:,'City'] = df_train.loc[:,'City'].fillna('Santa Fe')

- Encoding:

In [783]:
for feature in ['Medical_Tent', 'City', 'gender']:
    encoder = LabelEncoder()
    encoder.fit(df_train.loc[:, feature])
    df_train.loc[:, feature] = encoder.transform(df_train.loc[:, feature])
    df_test.loc[:, feature] = encoder.transform(df_test.loc[:, feature])

- Creating expenses per capita from medical_expenses_family:

In [784]:
df_train.loc[:,'Expenses_per_capita'] = df_train.loc[:,'Medical_Expenses_Family']/(df_train.loc[:,'Parents or siblings infected']+df_train.loc[:,'Wife/Husband or children infected']+1)

In [785]:
df_test.loc[:,'Expenses_per_capita'] = df_test.loc[:,'Medical_Expenses_Family']/(df_test.loc[:,'Parents or siblings infected']+df_test.loc[:,'Wife/Husband or children infected']+1)

In [786]:
df_train

Unnamed: 0,Patient_ID,Family_Case_ID,Severity,Name,Birthday_year,Parents or siblings infected,Wife/Husband or children infected,Medical_Expenses_Family,Medical_Tent,City,Deceased,Title,First_Name,Last_Name,gender,Expenses_per_capita
0,1,4696,3,Miss Linda Betty,1990,0,0,225,7,1,1,Miss,Linda,Betty,0,225.000000
1,2,21436,1,Ms. Ramona Elvira,1966,0,1,1663,7,0,0,Ms.,Ramona,Elvira,0,831.500000
2,3,7273,3,Mr. Mario Vernon,1982,0,0,221,7,1,1,Mr.,Mario,Vernon,1,221.000000
3,4,8226,3,Mr. Hector Joe,1997,0,0,220,7,1,1,Mr.,Hector,Joe,1,220.000000
4,5,19689,3,Ms. Jennie Debra,1994,0,0,222,7,1,0,Ms.,Jennie,Debra,0,222.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,896,6253,3,Ms. Linda Wilcox,1998,1,1,344,7,1,0,Ms.,Linda,Wilcox,0,114.666667
896,897,6483,3,Mr. Haiden Vance,2006,0,0,258,7,1,0,Mr.,Haiden,Vance,1,258.000000
897,898,981,3,Miss Anaiya Love,1990,0,0,214,7,2,1,Miss,Anaiya,Love,0,214.000000
898,899,16418,2,Mr. Robert Williams,1994,1,1,812,7,1,0,Mr.,Robert,Williams,1,270.666667


In [787]:
df_test

Unnamed: 0,Patient_ID,Family_Case_ID,Severity,Name,Birthday_year,Parents or siblings infected,Wife/Husband or children infected,Medical_Expenses_Family,Medical_Tent,City,Title,First_Name,Last_Name,gender,Expenses_per_capita
0,901,49242,3,Mr. Jody Pedro,1990,0,0,203,7,1,Mr.,Jody,Pedro,1,203.0
1,902,10400,3,Mr. Kevin Brent,1988,0,0,631,7,1,Mr.,Kevin,Brent,1,631.0
2,903,10795,3,Mr. Frankie Cary,1981,1,0,376,7,0,Mr.,Frankie,Cary,1,188.0
3,904,62440,3,Mr. Rick Pete,1990,0,1,405,7,0,Mr.,Rick,Pete,1,202.5
4,905,81311,2,Mr. Matthew Erick,1996,0,0,378,7,1,Mr.,Matthew,Erick,1,378.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,1296,110522,3,Mr. Luther Rogelio,1990,0,0,221,7,1,Mr.,Luther,Rogelio,1,221.0
396,1297,118768,3,Mr. Emanuel Ruben,1990,0,0,202,7,0,Mr.,Emanuel,Ruben,1,202.0
397,1298,86158,1,Mrs. Misty Camille,1994,0,1,3830,2,0,Mrs.,Misty,Camille,0,1915.0
398,1299,18523,3,Master Gustavo Jordan,2007,2,0,567,7,1,Master,Gustavo,Jordan,1,189.0


- Dropping some variables to train the model:

In [788]:
X = df_train.drop(columns=['Patient_ID','Family_Case_ID','Name','Title','First_Name','Last_Name','Deceased',
                           'Medical_Expenses_Family'])
y = df_train['Deceased']

In [789]:
X_final = df_test.drop(columns=['Patient_ID','Family_Case_ID','Name','Title','First_Name','Last_Name',
                                'Medical_Expenses_Family'])

In [790]:
X

Unnamed: 0,Severity,Birthday_year,Parents or siblings infected,Wife/Husband or children infected,Medical_Tent,City,gender,Expenses_per_capita
0,3,1990,0,0,7,1,0,225.000000
1,1,1966,0,1,7,0,0,831.500000
2,3,1982,0,0,7,1,1,221.000000
3,3,1997,0,0,7,1,1,220.000000
4,3,1994,0,0,7,1,0,222.000000
...,...,...,...,...,...,...,...,...
895,3,1998,1,1,7,1,0,114.666667
896,3,2006,0,0,7,1,1,258.000000
897,3,1990,0,0,7,2,0,214.000000
898,2,1994,1,1,7,1,1,270.666667


In [791]:
scaler=MinMaxScaler()
columns=X.columns
X=pd.DataFrame(scaler.fit_transform(X))
X.columns=columns
X_final=pd.DataFrame(scaler.transform(X_final))
X_final.columns=columns

In [792]:
X

Unnamed: 0,Severity,Birthday_year,Parents or siblings infected,Wife/Husband or children infected,Medical_Tent,City,gender,Expenses_per_capita
0,1.0,0.632911,0.000000,0.000,0.875,0.5,0.0,0.015685
1,0.0,0.329114,0.000000,0.125,0.875,0.0,0.0,0.057964
2,1.0,0.531646,0.000000,0.000,0.875,0.5,1.0,0.015406
3,1.0,0.721519,0.000000,0.000,0.875,0.5,1.0,0.015336
4,1.0,0.683544,0.000000,0.000,0.875,0.5,0.0,0.015476
...,...,...,...,...,...,...,...,...
895,1.0,0.734177,0.166667,0.125,0.875,0.5,0.0,0.007993
896,1.0,0.835443,0.000000,0.000,0.875,0.5,1.0,0.017985
897,1.0,0.632911,0.000000,0.000,0.875,1.0,0.0,0.014918
898,0.5,0.683544,0.166667,0.125,0.875,0.5,1.0,0.018868


In [670]:
X[0]

array([1.        , 0.63291139, 0.        , 0.        , 0.875     ,
       0.5       , 0.        , 0.01568491])

In [671]:

from copy import deepcopy
from sklearn.model_selection import train_test_split

import keras

from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding, Bidirectional
from keras.optimizers import Adam

In [672]:
X2, X_test, y2, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.1, 
                                                    #random_state=55, 
                                                    shuffle=True, 
                                                    stratify=y
                                                   )

In [673]:
X_train, X_val, y_train, y_val = train_test_split(X2, 
                                                    y2, 
                                                    test_size=0.1, 
                                                   # random_state, 
                                                    shuffle=True, 
                                                    stratify=y2
                                                   )

In [690]:
def build_model(input_size, output_size):
    model = Sequential()
    model.add(Dense(16, input_dim=input_size,activation='relu'))
    model.add(Dense(16, input_dim=input_size,activation='relu'))
   # model.add(Dropout(0.2))
#     model.add(Dense(16,activation='relu'))
#     model.add(Dropout(0.2))
    model.add(Dense(output_size, activation='sigmoid') )
    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [691]:
model=build_model(len(X_train[0]),1)
model.fit(X_train, y_train, validation_data=(X_val,y_val), batch_size=16,epochs=40,verbose=1)


Train on 729 samples, validate on 81 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.callbacks.History at 0x1639ef310>

In [684]:
model.evaluate(X_test,y_test)



[0.5146363178888956, 0.7666666507720947]

In [345]:

model=build_model(len(X_train[0]),1)
model.fit(X, y, batch_size=8,epochs=50,verbose=1)
pred=model.predict(X_final)
pred=pred.flatten()


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [346]:
for i in range(len(pred)):
    if pred[i]<.5:
        pred[i]=0
    else:
        pred[i]=1

In [347]:
pred=pred.astype("int")

In [348]:
version3 = pd.DataFrame( pred,df_test.Patient_ID.values )

In [349]:
version3.rename(columns={0: "Deceased"},inplace=True)
version3.rename_axis("Patient_ID",inplace=True)

In [350]:
version3

Unnamed: 0_level_0,Deceased
Patient_ID,Unnamed: 1_level_1
901,1
902,1
903,1
904,1
905,1
...,...
1296,1
1297,1
1298,0
1299,1


In [351]:
version3.to_csv('versions/Group6__version_alex.csv', index=True)

%%%%

In [417]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

In [550]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

modelKNN = KNeighborsClassifier()
bagging_KNN = BaggingClassifier(base_estimator = modelKNN, random_state = 5)
NN=MLPClassifier(hidden_layer_sizes=(16,16))
bagging_NN = BaggingClassifier(base_estimator = NN, random_state = 5)
AB=AdaBoostClassifier(random_state = 5)

In [473]:
X_train, X_val, y_train, y_val = train_test_split(X, 
                                                    y, 
                                                    test_size=0.1, 
                                                   # random_state, 
                                                    shuffle=True, 
                                                    stratify=y
                                                   )

In [490]:
modelKNN.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [491]:
modelKNN.score(X_val,y_val)

0.8333333333333334

In [None]:
predknn=modelKNN.predict(X_final)

In [492]:
NN.fit(X_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(16, 16), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [493]:
NN.score(X_val,y_val)

0.8555555555555555

In [509]:
prednn=NN.predict(X_final)

In [494]:
AB.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=5)

In [495]:
AB.score(X_val,y_val)

0.8666666666666667

In [510]:
predab=AB.predict(X_final)

In [548]:
bagging_KNN.fit(X_train,y_train)

BaggingClassifier(base_estimator=KNeighborsClassifier(algorithm='auto',
                                                      leaf_size=30,
                                                      metric='minkowski',
                                                      metric_params=None,
                                                      n_jobs=None,
                                                      n_neighbors=5, p=2,
                                                      weights='uniform'),
                  bootstrap=True, bootstrap_features=False, max_features=1.0,
                  max_samples=1.0, n_estimators=1000, n_jobs=None,
                  oob_score=False, random_state=5, verbose=0, warm_start=False)

In [549]:
bagging_KNN.score(X_val,y_val)

0.8444444444444444

In [511]:
predbknn=bagging_KNN.predict(X_final)

In [551]:
bagging_NN.fit(X_train,y_train)

BaggingClassifier(base_estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                               batch_size='auto', beta_1=0.9,
                                               beta_2=0.999,
                                               early_stopping=False,
                                               epsilon=1e-08,
                                               hidden_layer_sizes=(16, 16),
                                               learning_rate='constant',
                                               learning_rate_init=0.001,
                                               max_fun=15000, max_iter=200,
                                               momentum=0.9,
                                               n_iter_no_change=10,
                                               nesterovs_momentum=True,
                                               power_t=0.5, random_state=None,
                                               shuffle=True, solver='ada

In [552]:
bagging_NN.score(X_val,y_val)

0.8666666666666667

In [512]:
predbnn=bagging_NN.predict(X_final)

In [611]:
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier

In [612]:
estimators = [#('rf', RandomForestClassifier(random_state=5)),
    ('ab',AdaBoostClassifier(random_state = 5)),
    ('xgb',XGBClassifier(
                    colsample_bytree = 1.0, 
                    eta = 0.63, 
                    max_depth = 2
                )),
              ('nb', GaussianNB()),('knn',KNeighborsClassifier()),
              ('nn',MLPClassifier(hidden_layer_sizes=(16,16))),
              ]

In [613]:
SC = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

# modelKNN = KNeighborsClassifier()
# bagging_KNN = BaggingClassifier(base_estimator = modelKNN, random_state = 5)
# NN=MLPClassifier(hidden_layer_sizes=(16,16))
# bagging_NN = BaggingClassifier(base_estimator = NN, random_state = 5)
# AB=AdaBoostClassifier(random_state = 5)

In [614]:
SC.fit(X_train, y_train)

StackingClassifier(cv=None,
                   estimators=[('ab',
                                AdaBoostClassifier(algorithm='SAMME.R',
                                                   base_estimator=None,
                                                   learning_rate=1.0,
                                                   n_estimators=50,
                                                   random_state=5)),
                               ('xgb',
                                XGBClassifier(base_score=None, booster=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=1.0, eta=0.63,
                                              gamma=None, gpu_id=None,
                                              importance_type='gain',
                                              interaction_constraints=...
                                   

In [615]:
SC.score(X_val,y_val)

0.8666666666666667

In [616]:
sc=SC.predict(X_final)

In [696]:
from sklearn.ensemble import GradientBoostingClassifier

In [697]:
GB = GradientBoostingClassifier(random_state = 5)

In [698]:
GB.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=5, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [699]:
GB.score(X_val,y_val)

0.8271604938271605

# CV

# LogRe

In [880]:
parameter_space = {
    'penalty':['l1', 'l2'],
    #'dual': [False],
    #'tol':[1e-4,],
   # 'C':[1.0],
    'fit_intercept':[True,False],
    'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'multi_class':['ovr', 'multinomial']
}
model=LogisticRegression()
clf = GridSearchCV(model, parameter_space,n_jobs=-1,verbose=5,cv=5)

In [881]:
clf.fit(X,y)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    0.3s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'fit_intercept': [True, False],
                         'multi_class': ['ovr', 'multinomial'],
                         'penalty': ['l1', 'l2'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                    'saga']},
             pre_dispatch='2*n_jobs', refit=True, return_tr

In [882]:
# Best parameter set
print('------------------------------------------------------------------------------------------------------------------------')
print('Best parameters found:\n', clf.best_params_)
print('------------------------------------------------------------------------------------------------------------------------')

# All results
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    if mean>.786:
        print("%0.3f (+/-%0.03f) for %r" % (mean, std , params))
        

------------------------------------------------------------------------------------------------------------------------
Best parameters found:
 {'fit_intercept': False, 'multi_class': 'ovr', 'penalty': 'l1', 'solver': 'liblinear'}
------------------------------------------------------------------------------------------------------------------------
0.787 (+/-0.035) for {'fit_intercept': True, 'multi_class': 'ovr', 'penalty': 'l1', 'solver': 'saga'}
0.787 (+/-0.035) for {'fit_intercept': True, 'multi_class': 'multinomial', 'penalty': 'l1', 'solver': 'saga'}
0.787 (+/-0.037) for {'fit_intercept': True, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'newton-cg'}
0.787 (+/-0.037) for {'fit_intercept': True, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'lbfgs'}
0.787 (+/-0.037) for {'fit_intercept': True, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'sag'}
0.787 (+/-0.037) for {'fit_intercept': True, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver'

# RanFor

In [892]:
parameter_space = {
    'n_estimators':[60,80,100,120],
    'criterion':['entropy'],
    'max_depth':[7,8,9,10,11,12,13],
    'oob_score':[True],
    'bootstrap':[True],
    'min_samples_leaf':[1,2,3,4,5,10,20],
    'max_features':[None]
    
}
model=RandomForestClassifier()
clf = GridSearchCV(model, parameter_space,n_jobs=-1,verbose=5,cv=5)
clf.fit(X,y)

Fitting 5 folds for each of 196 candidates, totalling 980 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 616 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done 850 tasks      | elapsed:   26.3s
[Parallel(n_jobs=-1)]: Done 980 out of 980 | elapsed:   30.3s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,...
                                              random_state=None, verbose=0,
                                   

In [894]:
# Best parameter set
print('------------------------------------------------------------------------------------------------------------------------')
print('Best parameters found:\n', clf.best_params_)
print('------------------------------------------------------------------------------------------------------------------------')

# All results
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    if mean>.833:
        print("%0.3f (+/-%0.03f) for %r" % (mean, std , params))

------------------------------------------------------------------------------------------------------------------------
Best parameters found:
 {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 2, 'n_estimators': 120, 'oob_score': True}
------------------------------------------------------------------------------------------------------------------------
0.838 (+/-0.029) for {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 2, 'n_estimators': 120, 'oob_score': True}
0.833 (+/-0.022) for {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 13, 'max_features': None, 'min_samples_leaf': 2, 'n_estimators': 80, 'oob_score': True}


# GradBoost

In [899]:
parameter_space = {
    'loss':[ 'exponential'],
    'n_estimators':[60],
    'max_depth':[7,9,11,13],
    'min_samples_leaf':[10,20,50,100],
    'max_features':[None]
    
    
    
}
model=GradientBoostingClassifier()
clf = GridSearchCV(model, parameter_space,n_jobs=-1,verbose=5,cv=5)
clf.fit(X,y)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  66 out of  80 | elapsed:    3.3s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    3.4s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_c...
                 

In [900]:
# Best parameter set
print('------------------------------------------------------------------------------------------------------------------------')
print('Best parameters found:\n', clf.best_params_)
print('------------------------------------------------------------------------------------------------------------------------')

# All results
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    if mean>.833:
        print("%0.3f (+/-%0.03f) for %r" % (mean, std , params))

------------------------------------------------------------------------------------------------------------------------
Best parameters found:
 {'loss': 'exponential', 'max_depth': 7, 'max_features': None, 'min_samples_leaf': 10, 'n_estimators': 60}
------------------------------------------------------------------------------------------------------------------------
0.836 (+/-0.024) for {'loss': 'exponential', 'max_depth': 7, 'max_features': None, 'min_samples_leaf': 10, 'n_estimators': 60}
0.833 (+/-0.019) for {'loss': 'exponential', 'max_depth': 11, 'max_features': None, 'min_samples_leaf': 20, 'n_estimators': 60}


# AdaBoost

In [907]:
parameter_space = {
    'n_estimators':[100,150,200,250],
    'learning_rate':[1.0,0.9,0.8],
    'algorithm':['SAMME.R']
}
model=AdaBoostClassifier()
clf = GridSearchCV(model, parameter_space,n_jobs=-1,verbose=5,cv=5)
clf.fit(X,y)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 out of  60 | elapsed:    2.1s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  55 out of  60 | elapsed:    2.5s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    2.8s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          learning_rate=1.0, n_estimators=50,
                                          random_state=None),
             iid='deprecated', n_jobs=-1,
             param_grid={'algorithm': ['SAMME.R'],
                         'learning_rate': [1.0, 0.9, 0.8],
                         'n_estimators': [100, 150, 200, 250]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=5)

In [908]:
# Best parameter set
print('------------------------------------------------------------------------------------------------------------------------')
print('Best parameters found:\n', clf.best_params_)
print('------------------------------------------------------------------------------------------------------------------------')

# All results
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    if mean>.8:
        print("%0.3f (+/-%0.03f) for %r" % (mean, std , params))

------------------------------------------------------------------------------------------------------------------------
Best parameters found:
 {'algorithm': 'SAMME.R', 'learning_rate': 0.9, 'n_estimators': 150}
------------------------------------------------------------------------------------------------------------------------
0.802 (+/-0.019) for {'algorithm': 'SAMME.R', 'learning_rate': 1.0, 'n_estimators': 100}
0.808 (+/-0.017) for {'algorithm': 'SAMME.R', 'learning_rate': 1.0, 'n_estimators': 150}
0.801 (+/-0.019) for {'algorithm': 'SAMME.R', 'learning_rate': 1.0, 'n_estimators': 200}
0.808 (+/-0.025) for {'algorithm': 'SAMME.R', 'learning_rate': 1.0, 'n_estimators': 250}
0.808 (+/-0.019) for {'algorithm': 'SAMME.R', 'learning_rate': 0.9, 'n_estimators': 100}
0.810 (+/-0.020) for {'algorithm': 'SAMME.R', 'learning_rate': 0.9, 'n_estimators': 150}
0.807 (+/-0.023) for {'algorithm': 'SAMME.R', 'learning_rate': 0.9, 'n_estimators': 200}
0.807 (+/-0.021) for {'algorithm': 'SAMME.R

# MLP

In [914]:
parameter_space = {
    'hidden_layer_sizes':[(16,16),(8,8),(12,12),(16,8)],
    'activation':[ 'relu'],
    'solver':['lbfgs'],
    'max_iter':[50,75,100,125,150,175,200],
    'learning_rate':['constant'],
    

}
model=MLPClassifier()
clf = GridSearchCV(model, parameter_space,n_jobs=-1,verbose=5,cv=5)
clf.fit(X,y)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 138 out of 140 | elapsed:    2.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:    2.4s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=False,
                                     epsilon=1e-08, hidden_layer_sizes=(100,),
                                     learning_rate='constant',
                                     learning_rate_init=0.001, max_fun=15000,
                                     max_iter=200, momentum=0.9,
                                     n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     random_state...
                                     solver='adam', tol=0.0001,
                                     validation_fraction=0.1, verbose=False,
                                     warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'activation': ['re

In [916]:
# Best parameter set
print('------------------------------------------------------------------------------------------------------------------------')
print('Best parameters found:\n', clf.best_params_)
print('------------------------------------------------------------------------------------------------------------------------')

# All results
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    if mean>.818:
        print("%0.3f (+/-%0.03f) for %r" % (mean, std , params))

------------------------------------------------------------------------------------------------------------------------
Best parameters found:
 {'activation': 'relu', 'hidden_layer_sizes': (16, 8), 'learning_rate': 'constant', 'max_iter': 125, 'solver': 'lbfgs'}
------------------------------------------------------------------------------------------------------------------------
0.822 (+/-0.027) for {'activation': 'relu', 'hidden_layer_sizes': (16, 8), 'learning_rate': 'constant', 'max_iter': 125, 'solver': 'lbfgs'}


# KNN

In [924]:
parameter_space = {
    'n_neighbors':[21,23,25,27,29,31],
    'weights':['uniform'],
    'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size':[10,20,30,40,50,60,70,80,90,100],
    'p':[1]
    

}
model=KNeighborsClassifier()
clf = GridSearchCV(model, parameter_space,n_jobs=-1,verbose=5,cv=5)
clf.fit(X,y)

Fitting 5 folds for each of 240 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 1168 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:    2.3s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'leaf_size': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                         'n_neighbors': [21, 23, 25, 27, 29, 31], 'p': [1],
                         'weights': ['uniform']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=5)

In [927]:
# Best parameter set
print('------------------------------------------------------------------------------------------------------------------------')
print('Best parameters found:\n', clf.best_params_)
print('------------------------------------------------------------------------------------------------------------------------')

# All results
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    if mean>.807:
        print("%0.3f (+/-%0.03f) for %r" % (mean, std , params))

------------------------------------------------------------------------------------------------------------------------
Best parameters found:
 {'algorithm': 'auto', 'leaf_size': 10, 'n_neighbors': 29, 'p': 1, 'weights': 'uniform'}
------------------------------------------------------------------------------------------------------------------------
0.808 (+/-0.029) for {'algorithm': 'auto', 'leaf_size': 10, 'n_neighbors': 29, 'p': 1, 'weights': 'uniform'}
0.808 (+/-0.029) for {'algorithm': 'auto', 'leaf_size': 20, 'n_neighbors': 29, 'p': 1, 'weights': 'uniform'}
0.808 (+/-0.029) for {'algorithm': 'auto', 'leaf_size': 30, 'n_neighbors': 29, 'p': 1, 'weights': 'uniform'}
0.808 (+/-0.029) for {'algorithm': 'auto', 'leaf_size': 40, 'n_neighbors': 29, 'p': 1, 'weights': 'uniform'}
0.808 (+/-0.029) for {'algorithm': 'auto', 'leaf_size': 50, 'n_neighbors': 29, 'p': 1, 'weights': 'uniform'}
0.808 (+/-0.029) for {'algorithm': 'auto', 'leaf_size': 60, 'n_neighbors': 29, 'p': 1, 'weights': 'un

In [928]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

In [929]:
clf1 = LogisticRegression(fit_intercept= False, multi_class= 'ovr', penalty='l1', solver= 'saga', random_state=1)


In [930]:

clf2 = RandomForestClassifier(bootstrap= True, criterion= 'entropy', max_depth= 10, max_features= None, 
                              min_samples_leaf= 2, n_estimators= 120, oob_score= True, random_state=1)


In [931]:
clf3 = GradientBoostingClassifier(loss= 'exponential', max_depth= 7, max_features= None,
                                  min_samples_leaf= 10, n_estimators= 60,random_state = 1)


In [932]:
clf4 = XGBClassifier(colsample_bytree = 1.0, eta = 0.63, max_depth = 2)

In [933]:
clf5 = AdaBoostClassifier(algorithm= 'SAMME.R', learning_rate= 0.9, n_estimators= 150,random_state = 5)

In [934]:
clf6 = MLPClassifier(activation= 'relu', hidden_layer_sizes= (16, 8), learning_rate= 'constant',
                      max_iter= 125, solver= 'lbfgs')

In [937]:
clf7 = KNeighborsClassifier(algorithm= 'auto', leaf_size = 60, n_neighbors= 29, 
                             p= 1, weights= 'uniform')

In [952]:
VC = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gb', clf3), ('xgb',clf4), 
                                  ('ab',clf5), ('mlp',clf6), ('knn',clf7)], voting='hard')

In [953]:
X.columns

Index(['Severity', 'Birthday_year', 'Parents or siblings infected',
       'Wife/Husband or children infected', 'Medical_Tent', 'City', 'gender',
       'Expenses_per_capita'],
      dtype='object')

In [976]:
to_drop=['City']

In [981]:
X_train, X_val, y_train, y_val = train_test_split(X.drop(to_drop,axis=1), 
                                                    y, 
                                                    test_size=0.1, 
                                                   # random_state, 
                                                    shuffle=True, 
                                                    stratify=y
                                                   )

In [985]:
VC.fit(X_train,y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False,
                                                 fit_intercept=False,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='ovr', n_jobs=None,
                                                 penalty='l1', random_state=1,
                                                 solver='saga', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                 

In [986]:
VC.score(X_train,y_train)

0.8777777777777778

In [987]:
VC.score(X_val,y_val)

0.8555555555555555

In [959]:
VC.classes_

array([0, 1])

In [960]:
vc=VC.predict(X_final)

In [988]:
to_drop=['City']

In [995]:
VC.fit(X.drop(to_drop,axis=1),y)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False,
                                                 fit_intercept=False,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='ovr', n_jobs=None,
                                                 penalty='l1', random_state=1,
                                                 solver='saga', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                 

In [996]:
VC.score(X_train,y_train)

0.8716049382716049

In [997]:
vc=VC.predict(X_final.drop(to_drop,axis=1))

In [998]:
vers = pd.DataFrame( vc,df_test.Patient_ID.values )
vers.rename(columns={0: "Deceased"},inplace=True)
vers.rename_axis("Patient_ID",inplace=True)

In [999]:
vers.to_csv('versions/Group6_version__alex.csv', index=True)

In [506]:
version3 = pd.DataFrame( predknn,df_test.Patient_ID.values )
version3.rename(columns={0: "Deceased"},inplace=True)
version3.rename_axis("Patient_ID",inplace=True)

In [513]:
version3 = pd.DataFrame( prednn,df_test.Patient_ID.values )
version3.rename(columns={0: "Deceased"},inplace=True)
version3.rename_axis("Patient_ID",inplace=True)

In [515]:
version3 = pd.DataFrame( predab,df_test.Patient_ID.values )
version3.rename(columns={0: "Deceased"},inplace=True)
version3.rename_axis("Patient_ID",inplace=True)

In [517]:
version3 = pd.DataFrame( predbknn,df_test.Patient_ID.values )
version3.rename(columns={0: "Deceased"},inplace=True)
version3.rename_axis("Patient_ID",inplace=True)

In [519]:
version3 = pd.DataFrame( predbnn,df_test.Patient_ID.values )
version3.rename(columns={0: "Deceased"},inplace=True)
version3.rename_axis("Patient_ID",inplace=True)

In [507]:
version3

Unnamed: 0_level_0,Deceased
Patient_ID,Unnamed: 1_level_1
901,1
902,1
903,1
904,1
905,1
...,...
1296,1
1297,1
1298,0
1299,1


In [520]:
version3.to_csv('versions/Group6_version__alex.csv', index=True)