In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [None]:
# Load the training CSV from the notebook's working directory.
train_csv_path = "./train.csv"
df = pd.read_csv(train_csv_path)

In [None]:
# Preview the first rows of the frame as loaded, before any feature engineering.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Feature engineering on the training frame. `df` is mutated in place and the
# source columns are dropped at the end, so this cell cannot be re-run without
# reloading the CSV first.

# One-hot encode by explicit comparison rather than by assigning the output of
# pd.get_dummies to a fixed list of names: the positional assignment breaks (or
# silently mislabels columns) whenever a category is absent from the data.
# Missing values compare False, so they encode as all zeros exactly as before.
for sex in ('female', 'male'):
    df[sex] = (df['Sex'] == sex).astype(int)
for port in ("C", "Q", "S"):
    df[port] = (df["Embarked"] == port).astype(int)

# Family size including the passenger, and an age/class interaction term.
df['TotalFamilyMembers'] = df['SibSp'] + df['Parch'] + 1
df['AgeTimesPclass'] = df['Age'] * df['Pclass']

# Forward-fill remaining gaps. `fillna(method=...)` is deprecated in recent
# pandas; `ffill` is the equivalent call available in old and new versions.
# NOTE(review): the fill runs after AgeTimesPclass is computed, so a row whose
# Age is missing inherits the previous row's Age*Pclass product rather than a
# value derived from its own Pclass - confirm this is intended.
df.ffill(inplace=True)

# Drop the raw columns that have been encoded or folded into derived features.
drop_features = ["Sex", 'Ticket', 'Name', 'Cabin', "Embarked", 'Age', 'SibSp', 'Parch']
df.drop(columns=drop_features, inplace=True)

In [None]:
# Preview the frame after the feature-engineering cell has run.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S
0,1,0,3,22.0,1,0,7.25,0,1,0,0,1
1,2,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,1,3,26.0,0,0,7.925,1,0,0,0,1
3,4,1,1,35.0,1,0,53.1,1,0,0,0,1
4,5,0,3,35.0,0,0,8.05,0,1,0,0,1


In [None]:
# Hold out 20% of the rows for evaluation. The feature matrix is every column
# from 'Pclass' onward; the target is the 'Survived' column.
features = df.loc[:, 'Pclass':]
target = df.Survived
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

In [None]:
# Baseline: a seeded decision tree with default hyper-parameters.
# `fit` returns the estimator itself, so it can be bound and displayed directly.
dt_model = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)
dt_model

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [None]:
# Predict labels for the held-out split with the baseline tree.
dt_model_predictions = dt_model.predict(x_test)

In [None]:
# Per-class precision / recall / F1 for the baseline tree on the hold-out split.
dt_report = classification_report(y_true=y_test, y_pred=dt_model_predictions)
print(dt_report)

              precision    recall  f1-score   support

           0       0.82      0.82      0.82       105
           1       0.74      0.74      0.74        74

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179



In [None]:
# Search space for the decision-tree grid search.
# 'auto' has been dropped from max_features: for classifiers it was an alias of
# 'sqrt' (so it only duplicated work) and it was removed in scikit-learn 1.3,
# where passing it raises a parameter-validation error. 'sqrt' is listed first
# so it takes the tie-breaking position 'auto' used to hold.
parameter_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_leaf_nodes': [2, 4, 5, 7, 10],
    'max_features': ['sqrt', 'log2'],
}

In [None]:
# Exhaustive 5-fold search over `parameter_grid` for a seeded decision tree.
dt_model_gridsearched = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=parameter_grid,
    cv=5,
)

In [None]:
# Run the decision-tree grid search on the training split.
dt_model_gridsearched.fit(x_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=42,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'criterion': ['gini', 'entropy'],
                     

In [None]:
# Parameter combination selected by the decision-tree grid search.
dt_model_gridsearched.best_params_

{'criterion': 'gini',
 'max_features': 'auto',
 'max_leaf_nodes': 10,
 'splitter': 'random'}

In [None]:
# Hold-out report for the grid-searched decision tree.
dt_gridsearched_predictions = dt_model_gridsearched.predict(x_test)
print(classification_report(y_true=y_test, y_pred=dt_gridsearched_predictions))

              precision    recall  f1-score   support

           0       0.80      0.88      0.84       105
           1       0.80      0.69      0.74        74

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179



In [None]:
# Baseline: a seeded random forest with default hyper-parameters.
# `fit` returns the estimator itself, so it can be bound and displayed directly.
rf_model = RandomForestClassifier(random_state=42).fit(x_train, y_train)
rf_model

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [None]:
# Predict labels for the held-out split with the baseline random forest.
rf_model_predictions = rf_model.predict(x_test)

In [None]:
# Per-class precision / recall / F1 for the baseline forest on the hold-out split.
rf_report = classification_report(y_true=y_test, y_pred=rf_model_predictions)
print(rf_report)

              precision    recall  f1-score   support

           0       0.84      0.89      0.86       105
           1       0.82      0.76      0.79        74

    accuracy                           0.83       179
   macro avg       0.83      0.82      0.82       179
weighted avg       0.83      0.83      0.83       179



In [None]:
# Score of the baseline random forest on the held-out split.
rf_model.score(x_test,y_test)

0.8324022346368715

In [None]:
# Search space for the random-forest grid search.
# 'auto' has been dropped from max_features: for classifiers it was an alias of
# 'sqrt' (so it only duplicated work) and it was removed in scikit-learn 1.3,
# where passing it raises a parameter-validation error.
# NOTE(review): this is still a very large grid (2*2*5*5*2*4*4*5 = 16,000
# combinations, each fitted once per CV fold); consider narrowing it or
# switching to a randomized search.
parameter_grid = {
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
    'n_estimators': [10, 20, 50, 100, 500],
    'max_leaf_nodes': [2, 4, 5, 7, 10],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 3, 4, 5],
    'min_samples_leaf': [1, 2, 3, 4],
    'min_weight_fraction_leaf': [0.0, 0.05, 0.1, 0.3, 0.5],
}

In [None]:
# 5-fold grid search around a pre-configured random forest.
# max_features is spelled 'sqrt' instead of 'auto': the two are equivalent for
# classifiers in older scikit-learn, and 'auto' was removed in 1.3.
# Any key that also appears in `parameter_grid` is overridden per candidate, so
# the values set here only matter for parameters the grid does not vary.
rf_model_gridsearched = GridSearchCV(
    cv=5,
    estimator=RandomForestClassifier(
        bootstrap=False,
        criterion='entropy',
        max_features='sqrt',
        max_leaf_nodes=10,
        n_estimators=500,
        # NOTE(review): hand-picked class weights; their origin is not shown here.
        class_weight={1: 0.62, 0: 0.39},
        random_state=42,
    ),
    param_grid=parameter_grid,
)

In [None]:
# Run the random-forest grid search on the training split.
rf_model_gridsearched.fit(x_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=False, ccp_alpha=0.0,
                                              class_weight={0: 0.39, 1: 0.62},
                                              criterion='entropy',
                                              max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=10,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=500, n_jobs=None,
                                              oob_sc

In [None]:
# Parameter combination selected by the random-forest grid search.
rf_model_gridsearched.best_params_

{'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0}

In [None]:
# Hold-out report for the grid-searched random forest.
rf_gridsearched_predictions = rf_model_gridsearched.predict(x_test)
print(classification_report(y_true=y_test, y_pred=rf_gridsearched_predictions))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85       105
           1       0.81      0.73      0.77        74

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.81       179
weighted avg       0.82      0.82      0.81       179



In [None]:
# Baseline: seeded gradient boosting with learning_rate=0.5.
# `fit` returns the estimator itself, so it can be bound and displayed directly.
gb_model = GradientBoostingClassifier(
    learning_rate=0.5,
    random_state=42,
).fit(x_train, y_train)
gb_model

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.5, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=42, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
# Predict labels for the held-out split with the baseline gradient-boosting model.
gb_model_predictions = gb_model.predict(x_test)

In [None]:
# Per-class precision / recall / F1 for baseline gradient boosting on the hold-out split.
gb_report = classification_report(y_true=y_test, y_pred=gb_model_predictions)
print(gb_report)

              precision    recall  f1-score   support

           0       0.83      0.82      0.83       105
           1       0.75      0.77      0.76        74

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179



In [None]:
# Search space for the gradient-boosting grid search.
# NOTE(review): the 'deviance' loss name is specific to older scikit-learn
# releases; confirm against the installed version before re-running.
parameter_grid = dict(
    loss=['deviance', 'exponential'],
    learning_rate=[0.1, 0.3, 0.5, 0.7],
    n_estimators=[10, 20, 50, 100],
    max_leaf_nodes=[2, 3, 5, 7, 10],
)

In [None]:
# Exhaustive 5-fold search over `parameter_grid` for seeded gradient boosting.
gb_model_gridsearched = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=parameter_grid,
    cv=5,
)

In [None]:
# Run the gradient-boosting grid search on the training split.
gb_model_gridsearched.fit(x_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_c...
                 

In [None]:
# Parameter combination selected by the gradient-boosting grid search.
gb_model_gridsearched.best_params_

{'learning_rate': 0.1,
 'loss': 'exponential',
 'max_leaf_nodes': 7,
 'n_estimators': 100}

In [None]:
# Hold-out report for the grid-searched gradient-boosting model.
gb_gridsearched_predictions = gb_model_gridsearched.predict(x_test)
print(classification_report(y_true=y_test, y_pred=gb_gridsearched_predictions))

              precision    recall  f1-score   support

           0       0.79      0.88      0.83       105
           1       0.79      0.68      0.73        74

    accuracy                           0.79       179
   macro avg       0.79      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179



In [None]:
# Baseline: seeded AdaBoost with default hyper-parameters.
# `fit` returns the estimator itself, so it can be bound and displayed directly.
ab_model = AdaBoostClassifier(random_state=42).fit(x_train, y_train)
ab_model

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=42)

In [None]:
# Predict labels for the held-out split with the baseline AdaBoost model.
ab_model_predictions = ab_model.predict(x_test)

In [None]:
# Per-class precision / recall / F1 for baseline AdaBoost on the hold-out split.
ab_report = classification_report(y_true=y_test, y_pred=ab_model_predictions)
print(ab_report)

              precision    recall  f1-score   support

           0       0.82      0.79      0.81       105
           1       0.72      0.76      0.74        74

    accuracy                           0.78       179
   macro avg       0.77      0.77      0.77       179
weighted avg       0.78      0.78      0.78       179



In [None]:
# Search space for the AdaBoost grid search.
parameter_grid = dict(
    algorithm=['SAMME', 'SAMME.R'],
    n_estimators=[5, 10, 20, 50, 100],
    learning_rate=[0.5, 1.0, 1.5],
)

In [None]:
# Exhaustive 5-fold search over `parameter_grid` for seeded AdaBoost.
ab_model_gridsearched = GridSearchCV(
    estimator=AdaBoostClassifier(random_state=42),
    param_grid=parameter_grid,
    cv=5,
)

In [None]:
# Run the AdaBoost grid search on the training split.
ab_model_gridsearched.fit(x_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          learning_rate=1.0, n_estimators=50,
                                          random_state=42),
             iid='deprecated', n_jobs=None,
             param_grid={'algorithm': ['SAMME', 'SAMME.R'],
                         'learning_rate': [0.5, 1.0, 1.5],
                         'n_estimators': [5, 10, 20, 50, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [None]:
# Parameter combination selected by the AdaBoost grid search.
ab_model_gridsearched.best_params_

{'algorithm': 'SAMME', 'learning_rate': 1.5, 'n_estimators': 50}

In [None]:
# Hold-out report for the grid-searched AdaBoost model.
ab_gridsearched_predictions = ab_model_gridsearched.predict(x_test)
print(classification_report(y_true=y_test, y_pred=ab_gridsearched_predictions))

              precision    recall  f1-score   support

           0       0.83      0.84      0.83       105
           1       0.77      0.76      0.76        74

    accuracy                           0.80       179
   macro avg       0.80      0.80      0.80       179
weighted avg       0.80      0.80      0.80       179



In [None]:
# Baseline XGBoost classifier with hand-set sampling / regularisation values.
# The deprecated `silent=True` and `nthread=4` arguments are replaced by their
# documented successors `verbosity=0` and `n_jobs=4`; newer XGBoost releases no
# longer recognise the old names.
xgb_model = XGBClassifier(
    colsample_bylevel=0.9,
    colsample_bytree=0.8,
    gamma=0.99,
    max_depth=5,
    min_child_weight=1,
    n_estimators=100,
    n_jobs=4,
    random_state=42,
    verbosity=0,
)
xgb_model.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0.99,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=4, objective='binary:logistic', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=True, subsample=1, verbosity=1)

In [None]:
# Score of the baseline XGBoost model on the held-out split.
xgb_model.score(x_test,y_test)

0.8156424581005587

In [None]:
# Predict labels for the held-out split with the baseline XGBoost model.
xgb_model_predictions = xgb_model.predict(x_test)

In [None]:
# Per-class precision / recall / F1 for baseline XGBoost on the hold-out split.
xgb_report = classification_report(y_true=y_test, y_pred=xgb_model_predictions)
print(xgb_report)

              precision    recall  f1-score   support

           0       0.81      0.90      0.85       105
           1       0.83      0.70      0.76        74

    accuracy                           0.82       179
   macro avg       0.82      0.80      0.80       179
weighted avg       0.82      0.82      0.81       179



In [None]:
# Search space for the XGBoost grid search.
parameter_grid = dict(
    max_depth=[2, 3, 4, 5, 6],
    learning_rate=[0.1, 0.2, 0.3],
    n_estimators=[10, 20, 50, 100],
    min_child_weight=[1, 2, 5, 10],
    max_delta_step=[0, 1, 2],
)

In [None]:
# Exhaustive 5-fold search over `parameter_grid` for a seeded XGBoost classifier.
xgb_model_gridsearched = GridSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_grid=parameter_grid,
    cv=5,
)

In [None]:
# Run the XGBoost grid search on the training split.
xgb_model_gridsearched.fit(x_train,y_train)

In [None]:
# Parameter combination selected by the XGBoost grid search.
xgb_model_gridsearched.best_params_

{'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 5,
 'min_child_weight': 1,
 'n_estimators': 50}

In [None]:
# Hold-out report for the grid-searched XGBoost model.
xgb_gridsearched_predictions = xgb_model_gridsearched.predict(x_test)
print(classification_report(y_true=y_test, y_pred=xgb_gridsearched_predictions))

              precision    recall  f1-score   support

           0       0.81      0.90      0.86       105
           1       0.84      0.70      0.76        74

    accuracy                           0.82       179
   macro avg       0.83      0.80      0.81       179
weighted avg       0.82      0.82      0.82       179



In [None]:
# Build the submission file: load the test CSV, apply the same feature
# engineering steps as the training frame, predict with the baseline random
# forest, and write PassengerId / Survived to disk.
df_test = pd.read_csv("./test.csv")

# One-hot encode by explicit comparison rather than by assigning the output of
# pd.get_dummies to a fixed list of names: the positional assignment breaks (or
# silently mislabels columns) if a category is absent from this file.
# Missing values compare False, so they encode as all zeros exactly as before.
for sex in ('female', 'male'):
    df_test[sex] = (df_test['Sex'] == sex).astype(int)
for port in ("C", "Q", "S"):
    df_test[port] = (df_test['Embarked'] == port).astype(int)

df_test['TotalFamilyMembers'] = df_test['SibSp'] + df_test['Parch'] + 1
df_test['AgeTimesPclass'] = df_test['Age'] * df_test['Pclass']

# `fillna(method=...)` is deprecated in recent pandas; `ffill` is equivalent.
# NOTE(review): as in the training cell, the fill runs after AgeTimesPclass is
# computed, so gaps in that column inherit the previous row's product.
df_test.ffill(inplace=True)

drop_features = ["Sex", 'Ticket', 'Name', 'Cabin', "Embarked", 'Age', 'SibSp', 'Parch']
df_test.drop(columns=drop_features, inplace=True)

# NOTE(review): this uses the default-parameter `rf_model`, not one of the
# tuned estimators - confirm that is the intended submission model.
predictions_for_submission = rf_model.predict(df_test.loc[:, "Pclass":])
df_submission = df_test[['PassengerId']].copy()
df_submission['Survived'] = predictions_for_submission

# index=False: without it pandas writes the row index as an extra unnamed
# leading column, so the file would not be the two-column
# PassengerId,Survived table built above.
df_submission.to_csv("submission_v3.csv", index=False)

In [None]:
# Shuffled, seeded 10-fold splitter shared by the cross-validation cells.
kf = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [None]:
# Random forest with hand-set hyper-parameters, used for the cross-validation
# and final hold-out checks.
# max_features is spelled 'sqrt' instead of 'auto': the two are equivalent for
# classifiers in older scikit-learn, and 'auto' was removed in 1.3.
clf = RandomForestClassifier(
    bootstrap=False,
    criterion='entropy',
    max_features='sqrt',
    max_leaf_nodes=10,
    n_estimators=500,
    random_state=42,
)

In [None]:
# 10-fold cross-validated accuracy of `clf` over every labelled row
# (features: all columns from 'Pclass' onward; target: 'Survived').
X = df.loc[:, 'Pclass':]
y = df.loc[:, 'Survived']
scoring = 'accuracy'
results = cross_val_score(
    estimator=clf,
    X=X.values,
    y=y.values,
    scoring=scoring,
    cv=kf,
    n_jobs=1,
)
results

array([0.82222222, 0.7752809 , 0.84269663, 0.7752809 , 0.87640449,
       0.87640449, 0.78651685, 0.7752809 , 0.78651685, 0.88764045])

In [None]:
# Mean accuracy across the folds of the preceding cross-validation run.
mean_accuracy = results.mean()
print(mean_accuracy)

0.8204244694132333


In [None]:
# Same 10-fold accuracy check, restricted to the training split only.
scoring = 'accuracy'
results = cross_val_score(
    estimator=clf,
    X=x_train,
    y=y_train,
    scoring=scoring,
    cv=kf,
    n_jobs=1,
)
results

array([0.86111111, 0.80555556, 0.85915493, 0.73239437, 0.77464789,
       0.78873239, 0.84507042, 0.83098592, 0.83098592, 0.90140845])

In [None]:
# Mean accuracy across the folds of the preceding cross-validation run.
mean_accuracy = results.mean()
print(mean_accuracy)

0.8230046948356808


In [None]:
# Fit the hand-configured random forest on the training split.
clf.fit(x_train,y_train)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=10, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [None]:
# Predict labels for the held-out split with the hand-configured random forest.
clf_predictions = clf.predict(x_test)

In [None]:
# Per-class precision / recall / F1 for the hand-configured forest on the hold-out split.
clf_report = classification_report(y_true=y_test, y_pred=clf_predictions)
print(clf_report)

              precision    recall  f1-score   support

           0       0.80      0.88      0.84       105
           1       0.80      0.69      0.74        74

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179

