In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, cross_val_score
from sklearn.feature_selection import RFECV, SelectKBest, chi2, f_classif
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier, BaggingClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PowerTransformer, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer, SimpleImputer, MissingIndicator
from sklearn.naive_bayes import GaussianNB
from tqdm import tqdm
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from xgboost import XGBClassifier

- Importing files:

In [None]:
# Load the training set from train.csv (path is relative to the notebook's working directory).
df_train = pd.read_csv(r'train.csv')

In [3]:
# Load the test set from test.csv (path is relative to the notebook's working directory).
df_test = pd.read_csv(r'test.csv')

- Splitting title, first and last names:

In [4]:
# Split each full name into three parts: the first whitespace-delimited token
# (Title), the second token (First_Name) and everything after it (Last_Name).
# Series.str.extract applies the regex once per row and produces all three
# capture groups together. Names that do not contain at least three
# whitespace-separated parts yield NaN in all three columns instead of raising
# AttributeError from `re.search(...).group(...)` on a failed match.
train_name_parts = df_train['Name'].str.extract(r"(.+?)\s(.+?)\s(.+)")
for part_position, part_column in enumerate(['Title', 'First_Name', 'Last_Name']):
    # Columns of the extract result are numbered 0, 1, 2 in capture-group order.
    df_train.loc[:, part_column] = train_name_parts[part_position]

In [5]:
# Split each full name into three parts: the first whitespace-delimited token
# (Title), the second token (First_Name) and everything after it (Last_Name).
# Series.str.extract applies the regex once per row and produces all three
# capture groups together. Names that do not contain at least three
# whitespace-separated parts yield NaN in all three columns instead of raising
# AttributeError from `re.search(...).group(...)` on a failed match.
test_name_parts = df_test['Name'].str.extract(r"(.+?)\s(.+?)\s(.+)")
for part_position, part_column in enumerate(['Title', 'First_Name', 'Last_Name']):
    # Columns of the extract result are numbered 0, 1, 2 in capture-group order.
    df_test.loc[:, part_column] = test_name_parts[part_position]

- Creating gender:

In [6]:
# Title -> gender lookup used to derive the `gender` column.
# Both 'Ms.' and 'Mrs.' are listed so that either spelling found in Title maps
# to 'F' rather than silently becoming NaN. Titles that are not keys of this
# dictionary still map to NaN.
gender_dictionary = {'Miss': 'F', 'Ms.': 'F', 'Mrs.': 'F', 'Mr.': 'M', 'Master': 'M'}
df_train.loc[:, 'gender'] = df_train.loc[:, 'Title'].map(gender_dictionary)

In [7]:
# Title -> gender lookup used to derive the `gender` column.
# Both 'Ms.' and 'Mrs.' are listed so that either spelling found in Title maps
# to 'F' rather than silently becoming NaN. Titles that are not keys of this
# dictionary still map to NaN.
gender_dictionary = {'Miss': 'F', 'Ms.': 'F', 'Mrs.': 'F', 'Mr.': 'M', 'Master': 'M'}
df_test.loc[:, 'gender'] = df_test.loc[:, 'Title'].map(gender_dictionary)

- Filling missing values:

__Medical_Tent__

In [9]:
# Missing tent entries become their own explicit category, 'No_tent'.
df_train['Medical_Tent'] = df_train['Medical_Tent'].fillna('No_tent')

In [10]:
# Missing tent entries become their own explicit category, 'No_tent'.
df_test['Medical_Tent'] = df_test['Medical_Tent'].fillna('No_tent')

__Birthday_year__

In [11]:
# Mean-impute missing birth years. The imputer is fitted on the training
# column only; `fit` returns the imputer itself, so it can be chained.
imputer = SimpleImputer(strategy='mean').fit(df_train.loc[:, ['Birthday_year']])

# The imputed values are cast to int (truncating any fractional mean) before
# being written back into the column.
train_birth_years_filled = imputer.transform(df_train.loc[:, ['Birthday_year']])
df_train.loc[:, 'Birthday_year'] = train_birth_years_filled.astype(int)

In [12]:
# Fill missing test birth years with the already-fitted `imputer` (not re-fitted here),
# then cast to int like the training column.
df_test.loc[:,'Birthday_year'] = imputer.transform(df_test.loc[:,['Birthday_year']]).astype(int)

__City__

In [13]:
# Fill missing cities with a fixed value.
# NOTE(review): 'Santa Fe' is hard-coded — presumably the most frequent city in
# the training data; confirm against df_train['City'].value_counts().
# The same fill is applied to the test set so that both frames are treated
# consistently and the later label-encoding step does not meet NaN in
# df_test['City']. fillna is a no-op when there are no missing values.
for city_frame in (df_train, df_test):
    city_frame.loc[:, 'City'] = city_frame.loc[:, 'City'].fillna('Santa Fe')

- Encoding:

In [17]:
# Replace each categorical column with integer codes. A fresh encoder is fitted
# per column on the training values only, and that same fitted encoder is then
# used to transform the training frame first and the test frame second.
for feature in ('Medical_Tent', 'City', 'gender'):
    encoder = LabelEncoder().fit(df_train.loc[:, feature])
    for frame in (df_train, df_test):
        frame.loc[:, feature] = encoder.transform(frame.loc[:, feature])

- Creating expenses per capita from Medical_Expenses_Family (family expenses divided by the number of infected relatives plus one):

In [18]:
# Divisor: both infected-relative counts plus one
# (the +1 presumably counts the patient themself — TODO confirm).
train_family_size = (
    df_train.loc[:, 'Parents or siblings infected']
    + df_train.loc[:, 'Wife/Husband or children infected']
    + 1
)
# Family medical expenses divided by that family size.
df_train.loc[:, 'Expenses_per_capita'] = df_train.loc[:, 'Medical_Expenses_Family'] / train_family_size

In [19]:
# Divisor: both infected-relative counts plus one
# (the +1 presumably counts the patient themself — TODO confirm).
test_family_size = (
    df_test.loc[:, 'Parents or siblings infected']
    + df_test.loc[:, 'Wife/Husband or children infected']
    + 1
)
# Family medical expenses divided by that family size.
df_test.loc[:, 'Expenses_per_capita'] = df_test.loc[:, 'Medical_Expenses_Family'] / test_family_size

- Dropping identifier, name-derived and raw family-expense columns, and separating the target (Deceased), to train the model:

In [26]:
# Target vector: the 'Deceased' column.
y = df_train['Deceased']

# Feature matrix: everything except identifiers, the raw and split name
# columns, the target itself, and the raw family expense column.
train_columns_to_drop = [
    'Patient_ID',
    'Family_Case_ID',
    'Name',
    'Title',
    'First_Name',
    'Last_Name',
    'Deceased',
    'Medical_Expenses_Family',
]
X = df_train.drop(columns=train_columns_to_drop)

In [27]:
# Test-set feature matrix: drop identifiers, the raw and split name columns,
# and the raw family expense column (no 'Deceased' column is dropped here).
test_columns_to_drop = [
    'Patient_ID',
    'Family_Case_ID',
    'Name',
    'Title',
    'First_Name',
    'Last_Name',
    'Medical_Expenses_Family',
]
X_final = df_test.drop(columns=test_columns_to_drop)

In [49]:
# Base learner for the ensemble: XGBoost with depth-2 trees.
xgb_base_learner = XGBClassifier(colsample_bytree=1.0, eta=0.63, max_depth=2)

# Bagging ensemble built on the XGBoost base learner. n_estimators,
# max_features and bootstrap are starting values only: the grid in
# `parameter_space` searches over all three.
bagged_xgb = BaggingClassifier(
    base_estimator=xgb_base_learner,
    n_estimators=10,
    max_features=1.0,
    bootstrap=True,
    random_state=42,
)

# Two-step pipeline: feature scaling, then the bagged classifier. The step
# names 'scaler' and 'classifier' are the keys referenced by the grid search.
# Note: `Pipeline` is imported from both sklearn and imblearn at the top of the
# notebook; the later (imblearn) import is the one in effect here.
pipe = Pipeline([('scaler', MinMaxScaler()), ('classifier', bagged_xgb)])

In [50]:
# Candidate transformers for the pipeline's 'scaler' step.
scaler_candidates = [
    MinMaxScaler(),
    MinMaxScaler(feature_range=(-1, 1)),
    RobustScaler(),
    StandardScaler(),
    PowerTransformer(),
]

# Exhaustive grid: 5 scalers x 45 ensemble sizes (5..49) x 7 feature counts
# (2..8) x 2 bootstrap settings = 3150 parameter combinations.
parameter_space = dict(
    scaler=scaler_candidates,
    classifier__n_estimators=list(np.arange(5, 50)),
    classifier__max_features=list(np.arange(2, 9)),
    classifier__bootstrap=[True, False],
)

In [51]:
# Exhaustive search over `parameter_space` for `pipe`, scored by accuracy with
# 5-fold cross-validation. All CPU cores are used (n_jobs=-1), progress is
# logged (verbose=2) and training-fold scores are also recorded.
gridsearch = GridSearchCV(
    estimator=pipe,
    param_grid=parameter_space,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
)

In [52]:
# Run the full grid search (long-running: 3150 candidates x 5 folds).
gridsearch.fit(X, y)
# `fit` returns the search object itself, so `best_model` is the fitted
# GridSearchCV (exposing best_params_, cv_results_, predict, ...).
best_model = gridsearch

Fitting 5 folds for each of 3150 candidates, totalling 15750 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   30.8s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed: 14.8min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 17.9min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed: 22.3min
[Parallel(n_jobs=-1)]: Done 5816 tasks      | elapsed: 27.7min
[Parallel(n_jobs=-1)]: Done 6829 tasks      | elapsed: 33.9min
[Parallel(n_jobs=-1)]: Done 7922 tasks      | elapsed: 40.5min
[Parallel(n_jobs=-1)]: Done 9097 tasks      | e

In [55]:
# Display the parameter combination selected by the grid search.
best_model.best_params_

{'classifier__bootstrap': False,
 'classifier__max_features': 7,
 'classifier__n_estimators': 9,
 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}

In [58]:
# Report the cross-validated score of the selected parameter combination:
# first the mean over folds, then each fold's own score.
# The number of folds is read from the fitted search (n_splits_) rather than
# hard-coded, so this cell stays correct if `cv` is changed.
best_row = best_model.best_index_
print('Mean: ', best_model.cv_results_['mean_test_score'][best_row])
for fold_number in range(1, best_model.n_splits_ + 1):
    # cv_results_ keys are zero-based (split0_test_score, ...); labels are one-based.
    fold_key = 'split{}_test_score'.format(fold_number - 1)
    print('Split {}: '.format(fold_number), best_model.cv_results_[fold_key][best_row])

Mean:  0.8400000000000001
Split 1:  0.8666666666666667
Split 2:  0.8333333333333334
Split 3:  0.8444444444444444
Split 4:  0.8333333333333334
Split 5:  0.8222222222222222


In [60]:
# Predict the target for the test-set features using the fitted grid search.
pred_final_model_3 = best_model.predict(X_final)

In [61]:
# Assemble the submission table: one row per test patient, pairing each
# Patient_ID with its predicted 'Deceased' label (same row order as df_test).
submission_columns = {
    'Patient_ID': list(df_test['Patient_ID'].values),
    'Deceased': list(pred_final_model_3),
}
version3 = pd.DataFrame(submission_columns)

In [62]:
# Write the submission table to CSV in the working directory, without the index column.
version3.to_csv('m20190922_version3.csv', index=False)