# **Step 1: Libraries**

In [392]:
from tqdm import tqdm

# Data Science Core
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# EDA
from sklearn.preprocessing import LabelEncoder

# Model
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.tree import DecisionTreeClassifier

# Tuning
from sklearn.model_selection import KFold

# Evaluation
from sklearn.metrics import accuracy_score

pd.set_option('display.max_columns', None)
sns.set_theme()

import warnings
from sklearn.exceptions import FitFailedWarning
warnings.filterwarnings("ignore", category=FitFailedWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# **Step 2: Loading data from train and test datasets**

In [393]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
fullset = pd.concat([df_train, df_test], axis=0)

In [394]:
print(df_train.shape)
print(df_test.shape)
print(fullset.shape)

(891, 12)
(418, 11)
(1309, 12)


In [395]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


- Data EDA
Delete irrelevant data
Incomplete data
Graphs


In [396]:
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [397]:
df_train.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


In [398]:
# plt.figure(figsize=(8, 6))
# sns.barplot(x='Pclass', y='Survived', data=df_train, palette='pink')


# plt.title('Survival Rate by Passenger Class')
# plt.xlabel('Passenger Class')
# plt.ylabel('Survival Rate')

# plt.show()

In [399]:
# plt.figure()
# sns.barplot(x = 'Sex', y = 'Survived', data = df_train, palette = 'pink')

# plt.title('Survival rate by gender')
# plt.xlabel('Gender')
# plt.ylabel('Survival Rate')

# plt.show()

In [400]:
#plt.scatter(x = 'Fare', y = 'Survived', data = df_train)

In [401]:
df_train.query('Fare > 500')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
258,259,1,1,"Ward, Miss. Anna",female,35.0,0,0,PC 17755,512.3292,,C
679,680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36.0,0,1,PC 17755,512.3292,B51 B53 B55,C
737,738,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C


### **Binning(Continous -> Discrete)**

In [402]:
# Bin by Age
df_train['Age'].fillna(df_train['Age'].median(), inplace = True)
df_train.loc[df_train['Age'] < 20 ,'Grouped_Age'] = 'T'
df_train.loc[(df_train['Age'] >= 20) & (df_train['Age'] < 50),'Grouped_Age'] = 'M'
df_train.loc[df_train['Age'] >= 50 ,'Grouped_Age'] = 'O'

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['Age'].fillna(df_train['Age'].median(), inplace = True)


In [403]:
# plt.figure()
# sns.barplot(x = 'Grouped_Age', y = 'Survived', data = df_train, palette = 'pink')

# plt.title('Survival rate by age')
# plt.xlabel('Age')
# plt.ylabel('Survival Rate')

# plt.show()

In [404]:
df_train['Age'].describe()

count    891.000000
mean      29.361582
std       13.019697
min        0.420000
25%       22.000000
50%       28.000000
75%       35.000000
max       80.000000
Name: Age, dtype: float64

In [405]:
df_train['Age_bin'] = pd.qcut(df_train['Age'], q=4, labels=False)


In [406]:
df_train = df_train.drop(['Age'], axis = 1)

In [407]:
df_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Grouped_Age', 'Age_bin'],
      dtype='object')

Feature columns so far we have: Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked, Grouped_Age
The Ones We have tested above: Sex, Age, Pclass.
In the following section, Sibsp, Parch, Fare, Cabin, Embarked will be tested out based on assumptions.

In [408]:
# SibSp
df_train[["SibSp", "Survived"]].groupby(['SibSp'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,SibSp,Survived
1,1,0.535885
2,2,0.464286
0,0,0.345395
3,3,0.25
4,4,0.166667
5,5,0.0
6,8,0.0


In [409]:
#Parch
df_train[['Parch', 'Survived']].groupby(['Parch'], as_index = False).mean().sort_values(by = 'Survived', ascending = False)

Unnamed: 0,Parch,Survived
3,3,0.6
1,1,0.550847
2,2,0.5
0,0,0.343658
5,5,0.2
4,4,0.0
6,6,0.0


In [410]:
#Fare
df_train[['Fare', 'Survived']].groupby(['Fare'], as_index = False).mean().sort_values(by = 'Survived', ascending = False)

Unnamed: 0,Fare,Survived
247,512.3292,1.0
196,57.9792,1.0
89,13.8583,1.0
88,13.7917,1.0
86,13.4167,1.0
...,...,...
103,15.5500,0.0
180,47.1000,0.0
179,46.9000,0.0
178,42.4000,0.0


In [411]:
# Embarked
df_train[['Embarked', 'Survived']].groupby(['Embarked'], as_index = False).mean().sort_values(by = 'Survived', ascending = False)

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.336957


In [412]:
# Cabin
df_train[['Cabin', 'Survived']].groupby(['Cabin'], as_index = False).mean().sort_values(by = 'Survived', ascending = False)

Unnamed: 0,Cabin,Survived
73,C62 C64,1.0
97,D21,1.0
94,D17,1.0
95,D19,1.0
60,C148,1.0
...,...,...
67,C46,0.0
64,C30,0.0
59,C128,0.0
56,C124,0.0


Higher Survival Rate:
Pclass: Class 1
Sex: Female
Age: T (Younger people) 
SibSp: Less Siblings 
Parch: 3 
Fare: Unknown  
Cabin: Unknown 
Embarked: C

In [413]:
df_train.info()
# Columns contain null values: Age, Cabin, Embarked

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
 11  Grouped_Age  891 non-null    object 
 12  Age_bin      891 non-null    int64  
dtypes: float64(1), int64(6), object(6)
memory usage: 90.6+ KB


In [414]:
df_train['Embarked'].fillna(df_train['Embarked'].mode()[0], inplace = True)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     891 non-null    object 
 11  Grouped_Age  891 non-null    object 
 12  Age_bin      891 non-null    int64  
dtypes: float64(1), int64(6), object(6)
memory usage: 90.6+ KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['Embarked'].fillna(df_train['Embarked'].mode()[0], inplace = True)


In [415]:
df_train = df_train.drop(['Cabin', 'Fare', 'Ticket', 'Grouped_Age'], axis = 'columns')

Extraxct people's title

In [416]:
df_train['Title'] = df_train['Name'].str.extract(r',\s*(\w+)\.\s*', expand=True)


In [417]:
df_train['Title'].value_counts()

Title
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Major         2
Mlle          2
Col           2
Don           1
Mme           1
Ms            1
Lady          1
Sir           1
Capt          1
Jonkheer      1
Name: count, dtype: int64

In [418]:
replace_titles = ['Dr', 'Rev', 'Major', 'Mlle', 'Col', 'Don', 'Mme', 'Ms', 'Lady', 'Sir', 'Capt', 'Jonkheer']

In [419]:
df_train['Title'] = df_train['Title'].replace(replace_titles, ['Uncategorized']*len(replace_titles))

In [420]:
df_train['Title'].value_counts()

Title
Mr               517
Miss             182
Mrs              125
Master            40
Uncategorized     26
Name: count, dtype: int64

In [421]:
map_titles = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Uncategorized': 5}
df_train['Title'] = df_train['Title'].map(map_titles)
df_train['Title'] = df_train['Title'].fillna(5)

In [422]:
df_train['Title'].value_counts()

Title
1.0    517
2.0    182
3.0    125
4.0     40
5.0     27
Name: count, dtype: int64

In [423]:
df_train = df_train.drop(['Name'], axis = 1)

Checking for outliers

In [424]:
df_train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Embarked,Age_bin,Title
886,887,0,2,male,0,0,S,1,5.0
887,888,1,1,female,0,0,S,0,2.0
888,889,0,3,female,1,2,S,1,2.0
889,890,1,1,male,0,0,C,1,1.0
890,891,0,3,male,0,0,Q,2,1.0


Possibly dropping passenger #631

In [425]:
#df_train.query('Age > 75 & Survived == 1')

In [426]:
#plt.scatter(x = 'Parch', y = 'Survived', data = df_train)

Possibily dropping #679

In [427]:
df_train.query('Parch == 6 & Survived == 0')

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Embarked,Age_bin,Title
678,679,0,3,female,1,6,S,3,3.0


In [428]:
vals = [679, 631]

In [429]:
df_train = df_train[df_train.PassengerId.isin(vals) == False]

# **Step 3: Feature Engineering**

### **Encoding**

* Numerical Value: Age, SibSp, Parch
* Categorical: Sex, Embarked

In [430]:
categorical_features_binary = ["Sex"]
categorical_features_onehot = ["Embarked"]

In [431]:
label_encoder = LabelEncoder()
for feature in categorical_features_binary:
    df_train[feature] = label_encoder.fit_transform(df_train[feature])
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Embarked,Age_bin,Title
0,1,0,3,1,1,0,S,0,1.0
1,2,1,1,0,1,0,C,3,3.0
2,3,1,3,0,0,0,S,1,2.0
3,4,1,1,0,1,0,S,2,3.0
4,5,0,3,1,0,0,S,2,1.0
...,...,...,...,...,...,...,...,...,...
886,887,0,2,1,0,0,S,1,5.0
887,888,1,1,0,0,0,S,0,2.0
888,889,0,3,0,1,2,S,1,2.0
889,890,1,1,1,0,0,C,1,1.0


In [432]:
df_train = pd.get_dummies(df_train, columns=categorical_features_onehot, dtype=int)
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Age_bin,Title,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,1,1,0,0,1.0,0,0,1
1,2,1,1,0,1,0,3,3.0,1,0,0
2,3,1,3,0,0,0,1,2.0,0,0,1
3,4,1,1,0,1,0,2,3.0,0,0,1
4,5,0,3,1,0,0,2,1.0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,1,0,0,1,5.0,0,0,1
887,888,1,1,0,0,0,0,2.0,0,0,1
888,889,0,3,0,1,2,1,2.0,0,0,1
889,890,1,1,1,0,0,1,1.0,1,0,0


In [433]:
## fixing test set
df_test = df_test.drop(['Ticket', 'Fare', 'Cabin', 'Name'], axis = 'columns')


In [434]:
df_test['Age'].fillna(df_test['Age'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['Age'].fillna(df_test['Age'].median(), inplace=True)


In [435]:
df_test['Age_bin'] = pd.qcut(df_test['Age'], q=4, labels=False)
df_test = df_test.drop(['Age'], axis = 1)

In [436]:
categorical_binary = ["Sex"]
categorical_dummy = ["Embarked"]

In [437]:
label = LabelEncoder()
for feature in categorical_binary:
    df_test[feature] = label.fit_transform(df_test[feature])
df_test

Unnamed: 0,PassengerId,Pclass,Sex,SibSp,Parch,Embarked,Age_bin
0,892,3,1,0,0,Q,2
1,893,3,0,1,0,S,3
2,894,2,1,0,0,Q,3
3,895,3,1,0,0,S,1
4,896,3,0,1,1,S,0
...,...,...,...,...,...,...,...
413,1305,3,1,0,0,S,1
414,1306,1,0,0,0,C,3
415,1307,3,1,0,0,S,3
416,1308,3,1,0,0,S,1


In [438]:
df_test = pd.get_dummies(df_test, columns = categorical_dummy, dtype=int)
df_test

Unnamed: 0,PassengerId,Pclass,Sex,SibSp,Parch,Age_bin,Embarked_C,Embarked_Q,Embarked_S
0,892,3,1,0,0,2,0,1,0
1,893,3,0,1,0,3,0,0,1
2,894,2,1,0,0,3,0,1,0
3,895,3,1,0,0,1,0,0,1
4,896,3,0,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...
413,1305,3,1,0,0,1,0,0,1
414,1306,1,0,0,0,3,1,0,0
415,1307,3,1,0,0,3,0,0,1
416,1308,3,1,0,0,1,0,0,1


# **Step 4: Model**

- Tree Model
    - Decision Tree
    - Bagging: RandomForest
    - Boosting(GBDT): XGBoost, LightGBM
- Linear Model
    - Ridge
- SVM
- KNN

Naive Bayes

Logistic Regression

Decision Tree

K Nearest Neighbor

Random Forest

SUpport Vector Classifier

Xtreme Gradient Boosting

Soft Voting Classifier

### Split

In [439]:
X = df_train.drop(['Survived'], axis=1)
y = df_train['Survived']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)
def baseline(model_name, X_train, X_valid, y_train, y_valid):
    model = model_name
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    model_score = accuracy_score(y_pred, y_valid)
    return model_score

In [440]:
list_of_models = {
    'xgb' : XGBClassifier(),
    'lr' : LogisticRegression(max_iter=1000),
    'rf' : RandomForestClassifier(),
    'knn' : KNeighborsClassifier(),
    'gnb' : GaussianNB(),
    'dt' : DecisionTreeClassifier(random_state=1),
    'gb' : GradientBoostingClassifier()}

report = pd.DataFrame(
    {
    'Model': ['XGBoost', 'Logistic Regression', 'Random Forest', 'KNN', 'Naive Bayes', 'Decision Tree', 'Grradient Boosting'],
    'Score': [None] * len(list_of_models),
    'Score after tuning':[None] * len(list_of_models),
    'Best params':[None] * len(list_of_models)
    }
).astype({'Score': 'float64'})                           

i = 0
for model in list_of_models.values():
    model_score = baseline(model, X_train, X_valid, y_train, y_valid)
    report.loc[i, 'Score'] = model_score
    i+=1

report

Unnamed: 0,Model,Score,Score after tuning,Best params
0,XGBoost,0.790262,,
1,Logistic Regression,0.808989,,
2,Random Forest,0.794007,,
3,KNN,0.573034,,
4,Naive Bayes,0.801498,,
5,Decision Tree,0.737828,,
6,Grradient Boosting,0.820225,,


# **Step 5: Tuning**

### Utils

In [441]:
def tuning(model_name, param_grid, kf, grid_search_cv = False, randomized_search_cv = False):
    model = model_name

    if grid_search_cv:
        model = GridSearchCV(model, param_grid = param_grid, cv = kf, n_jobs = -1)
    elif randomized_search_cv:
        model = RandomizedSearchCV(model, param_distributions=param_grid, cv=kf, n_iter=10)

    best_model = model.fit(X_train, y_train)
        
    return best_model.best_score_, best_model.best_params_, best_model

### Configuration

In [442]:
param_grid = {  
    "XGBoost": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7, 9],
        'min_child_weight': [1, 3, 5]
    },
    "Logistic Regression": {
        "max_iter": [2000],
        "penalty": ["l1", "l2"],
        "C": [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
        "solver": ["liblinear"]
    },
    "Random Forest": {
        "n_estimators": [100, 200, 300],
        "max_features": ["auto", "sqrt"],
        "max_depth": [10]
    },
    "K-Nearest Neighbors": {
        "n_neighbors": [3, 5, 10],
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan"]
    },
    "Gaussian Naive Bayes": {
        "var_smoothing": [1, 0.1, 0.01, 0.001, 0.0001, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9]
    },
    "Decision Tree": {
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 10, 20],
        "min_samples_leaf": [1, 5, 10]
    },
    "Gradient Boosting": {
    'n_estimators': (50, 200), 
    'learning_rate': (0.01, 0.2), 
    'max_depth': (3, 10),  
    'min_samples_split': (2, 10),  
    'min_samples_leaf': (1, 10), 
    'max_features': ('sqrt', 'log2', None),  
    'subsample': (0.5, 1.0) 
    }
    
}

### RUN!!

In [443]:
p = ["XGBoost",
            "Logistic Regression",
            "Random Forest",
            "K-Nearest Neighbors",
            "Gaussian Naive Bayes",
            "Decision Tree",
            "Gradient Boosting"
]

kf = KFold(n_splits=5, shuffle=True, random_state=42)

i = 0
model_list = []
for model in tqdm(list_of_models.values()):
    best_score, best_params, best_model = tuning(model, param_grid[p[i]], kf, randomized_search_cv = True)
    report.loc[i, 'Score after tuning'] = best_score
    report.loc[i, 'Best params'] = str(best_params)
    model_list.append(best_model)
    i+=1

report

100%|██████████| 7/7 [00:28<00:00,  4.09s/it]


Unnamed: 0,Model,Score,Score after tuning,Best params
0,XGBoost,0.790262,0.807032,"{'n_estimators': 100, 'min_child_weight': 1, '..."
1,Logistic Regression,0.808989,0.799045,"{'solver': 'liblinear', 'penalty': 'l2', 'max_..."
2,Random Forest,0.794007,0.784439,"{'n_estimators': 200, 'max_features': 'sqrt', ..."
3,KNN,0.573034,0.636684,"{'weights': 'distance', 'n_neighbors': 5, 'met..."
4,Naive Bayes,0.801498,0.797303,{'var_smoothing': 1e-06}
5,Decision Tree,0.737828,0.786142,"{'min_samples_split': 10, 'min_samples_leaf': ..."
6,Grradient Boosting,0.820225,0.808594,"{'subsample': 0.5, 'n_estimators': 200, 'min_s..."


### Optional Details

##### 1. Tuning for **Logistic Regression** model

In [444]:
# lr = LogisticRegression()


# tuning(lr, param_grid_lr, kf, grid_search_cv = True)

##### 2. Tuning for **Random Forest** model

In [445]:
# rf = RandomForestClassifier()
# param_grid_rf = {
#     'n_estimators': [100, 200, 300],
#     'max_features': ['auto', 'sqrt'],
#     'max_depth': [10]
# }
# tuning(rf, param_grid_rf, kf, grid_search_cv = True)


##### 3. Tuning for **Naive Bayes** model

In [446]:
# gnb = GaussianNB()
# param_grid_gnb = {
#     'var_smoothing': np.logspace(0, -9, num=100)
# }

# tuning(gnb, param_grid_gnb, kf, grid_search_cv = True)


##### 4. Tuning for **Decision Tree** model

In [447]:
# dt = DecisionTreeClassifier()
# param_grid_dt = {
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 10, 20],
#     'min_samples_leaf': [1, 5, 10]
# }

# tuning(dt, param_grid_dt, kf, grid_search_cv =True)

##### 5. Tuning for **K Nearest Neighbor** model

In [448]:
# knn = KNeighborsClassifier()
# param_grid_knn = {
#     'n_neighbors': [3, 5, 10],
#     'weights': ['uniform', 'distance'],
#     'metric': ['euclidean', 'manhattan']
# }

# tuning(knn, param_grid_knn, kf, grid_search_cv =True)

##### 6. Tuning for **XGB** model(Extreme Gradient Boosting)

In [449]:
# xgb = XGBClassifier()
# param_grid_xgb = {
#     'n_estimators': [100, 200],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'max_depth': [3, 5, 7]
# }

# best_score,best_params = tuning(xgb, param_grid_xgb, kf, randomized_search_cv=True)

# **Step 6: Inference**

In [450]:
best_index = 0 # observe by yourself

y_pred = model_list[best_index].predict(df_test)


ValueError: feature_names mismatch: ['PassengerId', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Age_bin', 'Title', 'Embarked_C', 'Embarked_Q', 'Embarked_S'] ['PassengerId', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Age_bin', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
expected Title in input data

In [None]:
y_pred # Your prediction

In [None]:
pd.Series(y_pred)

## **Step 7: Submit to Kaggle**

In [None]:
df = pd.read_csv('gender_submission.csv')

In [None]:
df["Survived"] = pd.Series(y_pred)

In [None]:
# Check
df.iloc[6,1]

In [None]:
df.to_csv("yiming_submission.csv", index=False)