In [146]:
# import pandas to handle data manipulation
import pandas as pd
# import numpy to handle array computations efficiently
import numpy as np
# import xgboost for data modelling
import xgboost as xgb
# import matplotlib to plot graphs
import matplotlib.pyplot as plt
import seaborn as sns
# import from sklearn to complement data modelling
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder,StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC

In [147]:
# load in data from csv
df1=pd.read_csv('train.csv')
df2=pd.read_csv('test.csv')

In [148]:
# used Data Wrangler to check if there is any missing data
incl_columns=['Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']

df_train=df1[incl_columns]
# fill in for Sex
df_train=df_train.fillna("NA")
print(df_train.dtypes)
# used Data Wrangler to check if there is any missing data - now filled with NA
df_test=df2[incl_columns[1:]]
df_test=df_test.fillna("NA")
df_test["Fare"]=df_test["Fare"].replace("NA",df_train["Fare"].mean())
print(df_test.dtypes)

Survived      int64
Pclass        int64
Sex          object
Age          object
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object
Pclass        int64
Sex          object
Age          object
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object


  df_test["Fare"]=df_test["Fare"].replace("NA",df_train["Fare"].mean())


In [149]:
#for each feature, rank the categories in the order of increasing loss
cat_col=["Sex","Embarked"]

# Age, Embarked has NA
# Age needs replace by mean
# Sex, Embarked needs ordinal encoding
# Sex sorted by Survived for ordinal encoding


# Ordinal Encoder for Sex and Embarked
df_train_ord=df_train.copy()
sorted_cat_list=list()
for col in cat_col:
    #reset index to convert the Series back to a DataFrame with index as column, use df_train
    sorted_cat=df_train.loc[df_train.index,[col,"Survived"]].reset_index().pivot_table(index=col,values="Survived",aggfunc=np.mean).sort_values(by="Survived").index
    sorted_cat_list.append(list(sorted_cat))

#create Ordinal Encoder with the sorted categories for each features,set unknown to -1, setting it to a category with lowest loss( only for )
oe=OrdinalEncoder(categories=sorted_cat_list,handle_unknown="use_encoded_value",unknown_value=np.nan)
#fit on train set
oe.fit(df_train_ord[cat_col])

#transform the categorical features and save as dataframe
df_train_ord_temp=oe.transform(df_train_ord[cat_col])
df_train_ord_temp=pd.DataFrame(df_train_ord_temp,columns=cat_col,index=df_train.index)

#update original dataframe
df_train_ord.update(df_train_ord_temp)
for col in cat_col:
    df_train_ord[col]=df_train_ord[col].astype(float)

# replace NA with mean for Age 
NA_replacement=df_train_ord.loc[df_train_ord["Age"]!="NA","Age"].mean()
df_train_ord["Age"]=df_train_ord["Age"].replace("NA",NA_replacement)

  sorted_cat=df_train.loc[df_train.index,[col,"Survived"]].reset_index().pivot_table(index=col,values="Survived",aggfunc=np.mean).sort_values(by="Survived").index
  sorted_cat=df_train.loc[df_train.index,[col,"Survived"]].reset_index().pivot_table(index=col,values="Survived",aggfunc=np.mean).sort_values(by="Survived").index
  df_train_ord["Age"]=df_train_ord["Age"].replace("NA",NA_replacement)


In [150]:
# create new column with first letter of name of cabin/ NA if value missing
cabin=df1["Cabin"]
cabin.replace(np.nan,"NA",inplace=True)

# extract cabin type
def extract_cabin(cabin):
    if cabin[:2]=="NA":
        return "NA"
    else:
        return cabin[0]

cabin=cabin.apply(extract_cabin)

# create new columns in train_ord with new feature
df_train_ord["Cabin"]=cabin
sorted_cabin=df_train_ord.loc[df_train_ord.index,["Cabin","Survived"]].reset_index().pivot_table(index="Cabin",values="Survived",aggfunc=np.mean).sort_values(by="Survived").index
sorted_cabin=sorted_cabin.tolist()
#create Ordinal Encoder with the sorted categories for each features,set unknown to -1, setting it to a category with lowest loss( only for )
oe2=OrdinalEncoder(categories=[sorted_cabin],handle_unknown="use_encoded_value",unknown_value=np.nan)
#fit on train set
oe2.fit(df_train_ord[["Cabin"]])

# #transform the categorical features and save as dataframe
df_train_ord_temp=oe2.transform(df_train_ord[["Cabin"]])
df_train_ord_temp=pd.DataFrame(df_train_ord_temp,columns=["Cabin"],index=df_train.index)

#update original dataframe
df_train_ord.update(df_train_ord_temp)
df_train_ord["Cabin"]=df_train_ord["Cabin"].astype(float)

  sorted_cabin=df_train_ord.loc[df_train_ord.index,["Cabin","Survived"]].reset_index().pivot_table(index="Cabin",values="Survived",aggfunc=np.mean).sort_values(by="Survived").index


In [151]:
# Further feature engineering 
# look at the titanic graphs file to see how to engineer features

# group age into bins
df_train_ord["Age_bin"]=pd.qcut(df_train_ord["Age"],10,duplicates="drop").apply(lambda x: x.right)

# group Sibsp into bins
df_train_ord["SibSp_bin"]=df_train_ord.loc[df_train_ord["SibSp"]<=0,"SibSp"]=0
df_train_ord["SibSp_bin"]=df_train_ord.loc[(df_train_ord["SibSp"]>0) & (df_train_ord["SibSp"]<=2),"SibSp"]=1
df_train_ord["SibSp_bin"]=df_train_ord.loc[(df_train_ord["SibSp"]>2) & (df_train_ord["SibSp"]<=4),"SibSp"]=2

# fare bin
df_train_ord["Fare_bin"]=pd.qcut(df_train_ord["Fare"],10,duplicates="drop").apply(lambda x: x.right)

# group Parch into bins
df_train_ord["Parch_bin"]=df_train_ord.loc[df_train_ord["Parch"]<=0,"Parch"]=0
df_train_ord["Parch_bin"]=df_train_ord.loc[(df_train_ord["Parch"]>0) & (df_train_ord["Parch"]<=3),"Parch"]=1
df_train_ord["Parch_bin"]=df_train_ord.loc[(df_train_ord["Parch"]>3),"Parch"]=2

# drop original features
df_train_ord.drop(["Age","SibSp","Fare","Parch"],axis=1,inplace=True)


In [152]:
# train - test for hyperparameter tuning later
X_train, X_test, y_train, y_test=train_test_split(df_train_ord[df_train_ord.columns[1:]],df_train_ord[df_train_ord.columns[0]],test_size=0.2,random_state=42)

In [153]:
# normalize the data - used by SVM later
std_scaler=StandardScaler()
std_scaler.fit(X_train)

X_train_std=std_scaler.transform(X_train)
X_test_std=std_scaler.transform(X_test)
df_train_ord_std=std_scaler.transform(df_train_ord[df_train_ord.columns[1:]])

In [154]:
# note that we do not perform dimension reduction on the data
# this helps us to exploit all data to generate insights
# unnessary dimension reduction can lead weaker model performance

In [155]:
# train on train set only and test on test set to check performance

xgboost=xgb.XGBClassifier(device="cuda",colsample_bynode=0.5,objective="reg:logistic",enable_categorical=True)

# we use GridSearchCV to find the best hyperparameters
# set max depth to 6 default, sample half of features at each split
# we set the range of lambda, L2 reg 0-60 to prevent overfitting, alpha 0-70 to prevent overfitting, min_child_weight 3-15

params={"lambda":np.arange(0,60,10),"alpha":np.arange(0,70,10),"min_child_weight":np.arange(3,15,2)}
xgboost_tune=GridSearchCV(xgboost,params,scoring="accuracy",cv=5)
xgboost_tune.fit(X_train,y_train)

# extract the best estimator
best_xgboost=xgboost_tune.best_estimator_
# generate predictions on the test set
best_xgboost_prob=best_xgboost.predict(X_test)

# save cv results
xgboost_tune_results=pd.DataFrame(xgboost_tune.cv_results_)
display(xgboost_tune_results.sort_values(by="rank_test_score").head(10))
best_xgboost_score=accuracy_score(y_test,best_xgboost_prob)
print(f"Accuracy of best XGBoost: {best_xgboost_score}")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_lambda,param_min_child_weight,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
18,0.040583,0.000809,0.00509,0.000341,0,30,3,"{'alpha': 0, 'lambda': 30, 'min_child_weight': 3}",0.811189,0.804196,0.852113,0.78169,0.880282,0.825894,0.035458,1
12,0.042418,0.001891,0.005012,0.000253,0,20,3,"{'alpha': 0, 'lambda': 20, 'min_child_weight': 3}",0.804196,0.811189,0.838028,0.767606,0.873239,0.818852,0.035286,2
24,0.040213,0.00143,0.005341,0.000681,0,40,3,"{'alpha': 0, 'lambda': 40, 'min_child_weight': 3}",0.811189,0.804196,0.830986,0.774648,0.866197,0.817443,0.030353,3
30,0.039711,0.001259,0.005147,0.00035,0,50,3,"{'alpha': 0, 'lambda': 50, 'min_child_weight': 3}",0.818182,0.804196,0.816901,0.774648,0.873239,0.817433,0.032003,4
6,0.042393,0.001584,0.005595,0.000394,0,10,3,"{'alpha': 0, 'lambda': 10, 'min_child_weight': 3}",0.797203,0.797203,0.830986,0.760563,0.887324,0.814656,0.042621,5
14,0.035118,0.001195,0.005321,0.000286,0,20,7,"{'alpha': 0, 'lambda': 20, 'min_child_weight': 7}",0.797203,0.825175,0.823944,0.774648,0.852113,0.814616,0.026477,6
31,0.036817,0.001564,0.005448,0.000694,0,50,5,"{'alpha': 0, 'lambda': 50, 'min_child_weight': 5}",0.804196,0.811189,0.823944,0.774648,0.852113,0.813218,0.025302,7
10,0.043162,0.001469,0.006143,0.000565,0,10,11,"{'alpha': 0, 'lambda': 10, 'min_child_weight':...",0.811189,0.825175,0.816901,0.78169,0.830986,0.813188,0.017148,8
1,0.041827,0.003539,0.005784,0.000507,0,0,5,"{'alpha': 0, 'lambda': 0, 'min_child_weight': 5}",0.804196,0.804196,0.823944,0.767606,0.859155,0.811819,0.029862,9
4,0.036083,0.005095,0.004993,0.000122,0,0,11,"{'alpha': 0, 'lambda': 0, 'min_child_weight': 11}",0.804196,0.811189,0.838028,0.788732,0.816901,0.811809,0.016151,10


Accuracy of best XGBoost: 0.8379888268156425


In [156]:
# svm
from sklearn.svm import SVC
svm=SVC()
param_svm={"C":np.arange(0.1,2,0.2),"gamma":np.arange(0.1,1,0.1)}
svm_tune=GridSearchCV(svm,param_svm,scoring="accuracy",cv=5)
svm_tune.fit(X_train_std,y_train)

best_svm=svm_tune.best_estimator_
svm_prob=best_svm.predict(X_test_std)
svm_score=accuracy_score(y_test,svm_prob)
print(f"Accuracy of svm: {svm_score}")
svm_tune_results=pd.DataFrame(svm_tune.cv_results_)
display(svm_tune_results.sort_values(by="rank_test_score").head(10))

Accuracy of svm: 0.7597765363128491


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
12,0.007476,0.000594,0.003997,7e-06,0.3,0.4,"{'C': 0.30000000000000004, 'gamma': 0.4}",0.811189,0.846154,0.84507,0.753521,0.838028,0.818792,0.035007,1
13,0.007725,0.000409,0.004001,1e-06,0.3,0.5,"{'C': 0.30000000000000004, 'gamma': 0.5}",0.811189,0.846154,0.84507,0.746479,0.838028,0.817384,0.037647,2
82,0.008233,0.00045,0.004373,0.000513,1.9,0.2,"{'C': 1.9000000000000004, 'gamma': 0.2}",0.79021,0.839161,0.84507,0.746479,0.852113,0.814607,0.040433,3
64,0.007538,0.000468,0.003103,0.000202,1.5,0.2,"{'C': 1.5000000000000004, 'gamma': 0.2}",0.797203,0.839161,0.838028,0.746479,0.852113,0.814597,0.038736,4
73,0.007694,0.000404,0.003722,0.000391,1.7,0.2,"{'C': 1.7000000000000004, 'gamma': 0.2}",0.79021,0.839161,0.838028,0.746479,0.852113,0.813198,0.039458,5
22,0.007299,0.000331,0.003802,0.000482,0.5,0.5,"{'C': 0.5000000000000001, 'gamma': 0.5}",0.797203,0.839161,0.838028,0.753521,0.838028,0.813188,0.033836,6
21,0.008003,6e-06,0.003479,0.000447,0.5,0.4,"{'C': 0.5000000000000001, 'gamma': 0.4}",0.797203,0.839161,0.838028,0.753521,0.838028,0.813188,0.033836,6
14,0.007319,0.00041,0.004401,0.000492,0.3,0.6,"{'C': 0.30000000000000004, 'gamma': 0.6}",0.811189,0.853147,0.830986,0.746479,0.823944,0.813149,0.036009,8
81,0.008525,0.000542,0.004123,0.000402,1.9,0.1,"{'C': 1.9000000000000004, 'gamma': 0.1}",0.79021,0.832168,0.84507,0.746479,0.84507,0.811799,0.038371,9
30,0.007307,0.000589,0.003501,0.000449,0.7,0.4,"{'C': 0.7000000000000001, 'gamma': 0.4}",0.79021,0.839161,0.838028,0.746479,0.84507,0.81179,0.038148,10


In [157]:
# stacking models 
# stack 1st layer:xgboost + (normalization -> svm), 2nd layer:logistic regression

estimators=[("xgboost",xgb.XGBClassifier(device="cuda",colsample_bynode=0.5,objective="reg:logistic",enable_categorical=True,reg_lambda=40,reg_alpha=0,min_child_weight=3))
           ,("svm",make_pipeline(StandardScaler(),SVC(C=0.3,gamma=0.2)))]

stack=StackingClassifier(estimators=estimators,final_estimator=LogisticRegression())
# param_stack={"xgboost__reg_lambda":np.arange(0,30,10),"xgboost__reg_alpha":np.arange(0,30,10),"xgboost__min_child_weight":np.arange(3,8,2)
#              ,"svm__svc__C":np.arange(0.2,1,0.2),"svm__svc__gamma":np.arange(0.1,1,0.2)
#              ,"final_estimator__C":np.arange(0.1,1,0.2)}
param_stack={"final_estimator__C":np.arange(0.1,2,0.2)}
stack_tune=GridSearchCV(stack,param_stack,scoring="accuracy",cv=5)
stack_tune.fit(X_train,y_train)

# extract the best estimator
best_stack=stack_tune.best_estimator_
# generate predictions on the test set
best_stack_prob=best_stack.predict(X_test)

# save cv results
stack_tune_results=pd.DataFrame(stack_tune.cv_results_)
display(stack_tune_results.sort_values(by="rank_test_score").head(10))
best_stack_score=accuracy_score(y_test,best_stack_prob)
print(f"Accuracy of best model stack: {best_stack_score}")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_final_estimator__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,0.473135,0.060198,0.014637,0.001193,0.7,{'final_estimator__C': 0.7000000000000001},0.811189,0.839161,0.816901,0.746479,0.859155,0.814577,0.038068,1
5,0.448152,0.015019,0.014755,0.001388,1.1,{'final_estimator__C': 1.1000000000000003},0.804196,0.846154,0.816901,0.746479,0.859155,0.814577,0.039331,1
4,0.452075,0.027492,0.01411,0.000836,0.9,{'final_estimator__C': 0.9000000000000001},0.804196,0.839161,0.816901,0.746479,0.859155,0.813178,0.038294,3
6,0.460443,0.037928,0.014541,0.000402,1.3,{'final_estimator__C': 1.3000000000000003},0.797203,0.846154,0.816901,0.746479,0.859155,0.813178,0.039797,3
8,0.458884,0.029842,0.014164,0.000732,1.7,{'final_estimator__C': 1.7000000000000004},0.804196,0.839161,0.816901,0.746479,0.859155,0.813178,0.038294,3
2,0.496237,0.061229,0.014504,0.00165,0.5,{'final_estimator__C': 0.5000000000000001},0.811189,0.846154,0.802817,0.746479,0.859155,0.813159,0.039399,6
7,0.457481,0.040313,0.014641,0.000388,1.5,{'final_estimator__C': 1.5000000000000004},0.797203,0.839161,0.816901,0.746479,0.859155,0.81178,0.038722,7
9,0.463077,0.042782,0.01466,0.000747,1.9,{'final_estimator__C': 1.9000000000000004},0.804196,0.825175,0.816901,0.746479,0.859155,0.810381,0.036775,8
1,0.47344,0.033019,0.015293,0.000948,0.3,{'final_estimator__C': 0.30000000000000004},0.811189,0.839161,0.795775,0.746479,0.859155,0.810352,0.038742,9
0,0.455648,0.031583,0.014919,0.001001,0.1,{'final_estimator__C': 0.1},0.797203,0.846154,0.788732,0.739437,0.859155,0.806136,0.042998,10


Accuracy of best model stack: 0.8044692737430168


In [158]:
# print out parameters (used to extract parameters that can be changed in the GridSearchCV)
stack.get_params()

{'cv': None,
 'estimators': [('xgboost',
   XGBClassifier(base_score=None, booster=None, callbacks=None,
                 colsample_bylevel=None, colsample_bynode=0.5,
                 colsample_bytree=None, device='cuda', early_stopping_rounds=None,
                 enable_categorical=True, eval_metric=None, feature_types=None,
                 gamma=None, grow_policy=None, importance_type=None,
                 interaction_constraints=None, learning_rate=None, max_bin=None,
                 max_cat_threshold=None, max_cat_to_onehot=None,
                 max_delta_step=None, max_depth=None, max_leaves=None,
                 min_child_weight=3, missing=nan, monotone_constraints=None,
                 multi_strategy=None, n_estimators=None, n_jobs=None,
                 num_parallel_tree=None, objective='reg:logistic', ...)),
  ('svm',
   Pipeline(steps=[('standardscaler', StandardScaler()),
                   ('svc', SVC(C=0.3, gamma=0.2))]))],
 'final_estimator__C': 1.0,
 'final_esti

In [159]:
# xgboost random forest
xgboost_rf=xgb.XGBRFClassifier(device="cuda",colsample_bynode=0.5,objective="reg:logistic",enable_categorical=True,n_estimators=500)

# we use GridSearchCV to find the best hyperparameters
# set max depth to 6 default, sample half of features at each split
# we set the range of lambda, L2 reg 0-40 to prevent overfitting, alpha 0-70 to prevent overfitting, min_child_weight 3-15
# we fit random forest later after getting best parameters

params={"lambda":np.arange(0,40,10),"alpha":np.arange(0,70,10),"min_child_weight":np.arange(3,15,2)}
xgboost_rf_tune=GridSearchCV(xgboost_rf,params,scoring="accuracy",cv=5)
xgboost_rf_tune.fit(X_train,y_train)

# extract the best estimator
best_xgboost_rf=xgboost_rf_tune.best_estimator_
# generate predictions on the test set
best_xgboost_rf_prob=best_xgboost_rf.predict(X_test)

# save cv results
xgboost_rf_tune_results=pd.DataFrame(xgboost_rf_tune.cv_results_)
display(xgboost_rf_tune_results.sort_values(by="rank_test_score").head(10))
best_xgboost_rf_score=accuracy_score(y_test,best_xgboost_rf_prob)
print(f"Accuracy of best XGBoost Random Forest: {best_xgboost_rf_score}")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_lambda,param_min_child_weight,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.224343,0.038931,0.007472,0.000589,0,0,3,"{'alpha': 0, 'lambda': 0, 'min_child_weight': 3}",0.811189,0.839161,0.774648,0.753521,0.838028,0.803309,0.034214,1
6,0.21578,0.038196,0.007341,0.000426,0,10,3,"{'alpha': 0, 'lambda': 10, 'min_child_weight': 3}",0.811189,0.839161,0.774648,0.753521,0.838028,0.803309,0.034214,1
18,0.17542,0.002967,0.007194,0.000404,0,30,3,"{'alpha': 0, 'lambda': 30, 'min_child_weight': 3}",0.811189,0.839161,0.774648,0.753521,0.838028,0.803309,0.034214,1
12,0.179518,0.005937,0.007441,0.000483,0,20,3,"{'alpha': 0, 'lambda': 20, 'min_child_weight': 3}",0.811189,0.839161,0.774648,0.753521,0.838028,0.803309,0.034214,1
31,0.08987,0.003176,0.006015,2.9e-05,10,10,5,"{'alpha': 10, 'lambda': 10, 'min_child_weight'...",0.776224,0.825175,0.802817,0.767606,0.802817,0.794928,0.020667,5
37,0.112211,0.024043,0.006092,0.00067,10,20,5,"{'alpha': 10, 'lambda': 20, 'min_child_weight'...",0.776224,0.825175,0.802817,0.767606,0.802817,0.794928,0.020667,5
43,0.109378,0.02218,0.005934,0.000296,10,30,5,"{'alpha': 10, 'lambda': 30, 'min_child_weight'...",0.776224,0.825175,0.802817,0.767606,0.802817,0.794928,0.020667,5
25,0.089101,0.003723,0.005891,0.000455,10,0,5,"{'alpha': 10, 'lambda': 0, 'min_child_weight': 5}",0.776224,0.825175,0.802817,0.767606,0.802817,0.794928,0.020667,5
24,0.092173,0.002723,0.006003,3e-06,10,0,3,"{'alpha': 10, 'lambda': 0, 'min_child_weight': 3}",0.776224,0.825175,0.802817,0.767606,0.795775,0.793519,0.020319,9
30,0.089293,0.001978,0.006002,4e-06,10,10,3,"{'alpha': 10, 'lambda': 10, 'min_child_weight'...",0.776224,0.825175,0.802817,0.767606,0.795775,0.793519,0.020319,9


Accuracy of best XGBoost Random Forest: 0.8156424581005587


In [160]:
# data manipulation on unseen validation set for competition

# Ordinal Encoder for Sex and Embarked
#create Ordinal Encoder with the sorted categories for each features,set unknown to -1, setting it to a category with lowest loss( only for )

df_test_ord=df_test.copy()

#transform the categorical features and save as dataframe
df_test_ord_temp=oe.transform(df_test_ord[cat_col])
df_test_ord_temp=pd.DataFrame(df_test_ord_temp,columns=cat_col,index=df_test.index)

#update original dataframe
df_test_ord.update(df_test_ord_temp)
for col in cat_col:
    df_test_ord[col]=df_test_ord[col].astype(float)

# replace NA with mean for Age 
df_test_ord["Age"]=df_test_ord["Age"].replace("NA",NA_replacement)


# handle new "Cabin" column
cabin=df2["Cabin"]
cabin.replace(np.nan,"NA",inplace=True)

cabin=cabin.apply(extract_cabin)

# create new columns in train_ord with new feature
df_test_ord["Cabin"]=cabin
# # #transform the categorical features and save as dataframe
df_test_ord_temp=oe2.transform(df_test_ord[["Cabin"]])
df_test_ord_temp=pd.DataFrame(df_test_ord_temp,columns=["Cabin"],index=df_test.index)

#update original dataframe
df_test_ord.update(df_test_ord_temp)
df_test_ord["Cabin"]=df_test_ord["Cabin"].astype(float)

  df_test_ord["Age"]=df_test_ord["Age"].replace("NA",NA_replacement)


In [161]:
# Further feature engineering 
# look at the titanic graphs file to see how to engineer features

# group age into bins
df_test_ord["Age_bin"]=pd.qcut(df_test_ord["Age"],10,duplicates="drop").apply(lambda x: x.right)

# group Sibsp into bins
df_test_ord["SibSp_bin"]=df_test_ord.loc[df_test_ord["SibSp"]<=0,"SibSp"]=0
df_test_ord["SibSp_bin"]=df_test_ord.loc[(df_test_ord["SibSp"]>0) & (df_test_ord["SibSp"]<=2),"SibSp"]=1
df_test_ord["SibSp_bin"]=df_test_ord.loc[(df_test_ord["SibSp"]>2) & (df_test_ord["SibSp"]<=4),"SibSp"]=2

# fare bin
df_test_ord["Fare_bin"]=pd.qcut(df_test_ord["Fare"],10,duplicates="drop").apply(lambda x: x.right)

# group Parch into bins
df_test_ord["Parch_bin"]=df_test_ord.loc[df_test_ord["Parch"]<=0,"Parch"]=0
df_test_ord["Parch_bin"]=df_test_ord.loc[(df_test_ord["Parch"]>0) & (df_test_ord["Parch"]<=3),"Parch"]=1
df_test_ord["Parch_bin"]=df_test_ord.loc[(df_test_ord["Parch"]>3),"Parch"]=2

# drop original features
df_test_ord.drop(["Age","SibSp","Fare","Parch"],axis=1,inplace=True)

In [162]:
# train on train set only and test on test set to check performance
# use best parameters from grid search previously
xgboost=xgb.XGBClassifier(**xgboost_tune.best_params_,device="cuda",colsample_bynode=0.5,objective="reg:logistic",enable_categorical=True)
xgboost.fit(df_train_ord.iloc[:,1:],df_train_ord.iloc[:,0])

xgboost_prob=xgboost.predict(df_test_ord)

In [163]:
# xgb.XGBClassifier(device="cuda",colsample_bynode=0.5,objective="reg:logistic",enable_categorical=True,reg_lambda=20,reg_alpha=0,min_child_weight=5
estimators=[("xgboost",xgb.XGBRFClassifier(reg_alpha=0,reg_lambda=0,min_child_weight=3,device="cuda",colsample_bynode=0.5,objective="reg:logistic",enable_categorical=True,n_estimators=1000))
           ,("svm",make_pipeline(StandardScaler(),SVC(C=0.3,gamma=0.2)))]

stack=StackingClassifier(estimators=estimators,final_estimator=LogisticRegression(C=1.7))
stack.fit(df_train_ord.iloc[:,1:],df_train_ord.iloc[:,0])
stack_prob=stack.predict(df_test_ord)

In [164]:
# use best parameters from grid search previously (used the one with least regularization- random forest has regularization properties)
xgboost_rf=xgb.XGBRFClassifier(reg_alpha=0,reg_lambda=0,min_child_weight=3,device="cuda",colsample_bynode=0.5,objective="reg:logistic",enable_categorical=True,n_estimators=1000)

xgboost_rf.fit(df_train_ord.iloc[:,1:],df_train_ord.iloc[:,0])

xgboost_rf_prob=xgboost_rf.predict(df_test_ord)

In [165]:
submission=pd.DataFrame({"PassengerId":df2["PassengerId"],"Survived":xgboost_prob})
submission.to_csv("submission.csv",index=False)

In [166]:
submission=pd.DataFrame({"PassengerId":df2["PassengerId"],"Survived":stack_prob})
submission.to_csv("submission_stack.csv",index=False)

In [167]:
submission=pd.DataFrame({"PassengerId":df2["PassengerId"],"Survived":xgboost_rf_prob})
submission.to_csv("submission_rf.csv",index=False)