# Titanic Survival Rate Predictions

### Importing packages

In [827]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder


### Explorating Data

In [828]:
# Importing data csv as pandas df.
raw_train = pd.read_csv('train.csv',index_col=0)
raw_test = pd.read_csv('test.csv',index_col=0)

In [829]:
# General overview of the data
raw_train.head(20)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [830]:
# General information.
raw_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [831]:
# Null values. Seems like we'll have some null values to handle in Age, Cabin, Fare and Embarked.
print(raw_train.isnull().sum())
print(raw_test.isnull().sum())

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64
Pclass        0
Name          0
Sex           0
Age          86
SibSp         0
Parch         0
Ticket        0
Fare          1
Cabin       327
Embarked      0
dtype: int64


In [832]:
raw_train.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [833]:
# There are no duplicates in out datasets.
dup_train = raw_train.duplicated().any()
dup_test = raw_test.duplicated().any()
print(dup_train)
print(dup_test)

False
False


# Splitting out master train file into train and valid 80/20 in order to be able to measure performance.
x = raw_train.drop(["Survived"],axis=1)
y = raw_train["Survived"]
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=6)

### Data cleaning

# We'll replace NaN values for Age, Fare and Embarked. Cabin has too many NaN to be useful in our models.
# Where applicable, we'll always use the same values calculated for train in test. To avoid leakage.

x_train["Age"] = x_train["Age"].fillna(x_train["Age"].median())
raw_train["Age"] = raw_train["Age"].fillna(raw_train["Age"].median())
raw_test["Age"] = raw_test["Age"].fillna(raw_train["Age"].median())

x_train["Embarked"] = x_train["Embarked"].fillna(x_train["Embarked"].mode()[0])
x_valid["Embarked"] = x_valid["Embarked"].fillna(x_train["Embarked"].mode()[0])
raw_test["Embarked"] = raw_test["Embarked"].fillna(x_train["Embarked"].mode()[0])

x_train["Fare"] = x_train["Fare"].fillna(x_train["Fare"].mean())
x_valid["Fare"] = x_valid["Fare"].fillna(x_train["Fare"].mean())
raw_test["Fare"] = raw_test["Fare"].fillna(x_train["Fare"].mean())

x_train['Cabin'].fillna('Missing', inplace=True)
x_valid['Cabin'].fillna('Missing', inplace=True)
raw_test['Cabin'].fillna('Missing', inplace=True)

In [834]:
# We'll replace NaN values for Age, Fare and Embarked. Cabin has too many NaN to be useful in our models.
# Where applicable, we'll always use the same values calculated for train in test. To avoid leakage.

raw_train["Age"] = raw_train["Age"].fillna(raw_train["Age"].median())
raw_test["Age"] = raw_test["Age"].fillna(raw_train["Age"].median())

raw_train["Embarked"] = raw_train["Embarked"].fillna(raw_train["Embarked"].mode()[0])
raw_test["Embarked"] = raw_test["Embarked"].fillna(raw_train["Embarked"].mode()[0])

raw_train["Fare"] = raw_train["Fare"].fillna(raw_train["Fare"].mean())
raw_test["Fare"] = raw_test["Fare"].fillna(raw_train["Fare"].mean())

raw_train['Cabin'].fillna('Missing', inplace=True)
raw_test['Cabin'].fillna('Missing', inplace=True)

In [835]:
print(raw_train.isnull().sum())
print(raw_test.isnull().sum())

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    0
dtype: int64
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    0
dtype: int64


In [836]:
#Creating the label encoder
encoder_c = LabelEncoder()
encoder_e = LabelEncoder()

#Fitting the encoder on the raw_train['Cabin'] column
encoder_c.fit(pd.concat([raw_train['Cabin'], raw_test['Cabin']]))
encoder_e.fit(raw_train['Embarked'])

#Transforming the x_train['Cabin'], x_valid['Cabin'] and raw_test['Cabin'] columns using the fitted encoder
raw_train['Cabin'] = encoder_c.transform(raw_train['Cabin'])
raw_test['Cabin'] = encoder_c.transform(raw_test['Cabin'])

raw_train['Embarked'] = encoder_e.transform(raw_train['Embarked'])
raw_test['Embarked'] = encoder_e.transform(raw_test['Embarked'])
raw_train.head(10)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,185,2
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,106,0
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,185,2
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,70,2
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,185,2
6,0,3,"Moran, Mr. James",male,28.0,0,0,330877,8.4583,185,1
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,163,2
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,185,2
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,185,2
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,185,0


In [837]:
# Dictionary for numerical conversion of column sex
factors = {'male': 0, 'female': 1}

# Applying the same conversion to all df
raw_train['Sex'] = raw_train['Sex'].map(factors)
raw_test['Sex'] = raw_test['Sex'].map(factors)

# Creating one variable for SibSp and Parch.
x_train['fsize'] = x_train['SibSp'] + x_train['Parch']
x_valid['fsize'] = x_valid['SibSp'] + x_valid['Parch']
raw_test['fsize'] = raw_test['SibSp'] + raw_test['Parch']

# 0 if alone and 1 if not alone
x_train['is_alone'] = x_train['fsize'].apply(lambda x: 0 if x == 0 else 1)
x_valid['is_alone'] = x_valid['fsize'].apply(lambda x: 0 if x == 0 else 1)
raw_test['is_alone'] = raw_test['fsize'].apply(lambda x: 0 if x == 0 else 1)


# Creating Fare per person
x_train['farepp'] = x_train['Fare'] / (x_train['fsize'] + 1)
x_valid['farepp'] = x_valid['Fare'] / (x_valid['fsize'] + 1)
raw_test['farepp'] = raw_test['Fare'] / (raw_test['fsize'] + 1)

# Extracting titles
x_train["Title"] = x_train["Name"].str.extract("([A-Za-z]+\.)", expand=False)     
x_valid["Title"] = x_valid["Name"].str.extract("([A-Za-z]+\.)", expand=False)  
raw_test["Title"] = raw_test["Name"].str.extract("([A-Za-z]+\.)", expand=False)  

# Getting the unique values of the column
unique_titles = pd.concat((x_train['Title'], x_valid['Title'], raw_test['Title'])).drop_duplicates()
unique_titles.head(10)

# Dictionary for numerical conversion of column sex
factors_title = {'Mr.': 0, 'Mrs.': 1, 'Miss.': 2, 'Rev.': 3, 'Master.': 4, 'Capt.': 5, 'Lady.': 6, 'Col.': 7, 'Dr.': 8, 'Don.': 9}

# Applying the same conversion to all df
x_train['Title'] = x_train['Title'].map(factors_title)
x_valid['Title'] = x_valid['Title'].map(factors_title)
raw_test['Title'] = raw_test['Title'].map(factors_title)

x_train['Title'].fillna(-1, inplace=True)
x_valid['Title'].fillna(-1, inplace=True)
raw_test['Title'].fillna(-1, inplace=True)

x_train.head()

In [838]:
# Creating one variable for SibSp and Parch.
raw_train['fsize'] = raw_train['SibSp'] + raw_train['Parch']
raw_test['fsize'] = raw_test['SibSp'] + raw_test['Parch']

# 0 if alone and 1 if not alone
raw_train['is_alone'] = raw_train['fsize'].apply(lambda x: 0 if x == 0 else 1)
raw_test['is_alone'] = raw_test['fsize'].apply(lambda x: 0 if x == 0 else 1)

# Creating Fare per person
raw_train['farepp'] = raw_train['Fare'] / (raw_train['fsize'] + 1)
raw_test['farepp'] = raw_test['Fare'] / (raw_test['fsize'] + 1)

In [839]:
# Define the bins
bins = [0, 16, 32, 48, 64, 80]

# Assign the age categories to both dataframes
raw_train['Age_cat'] = pd.cut(raw_train['Age'], bins=bins, labels=False)
raw_test['Age_cat'] = pd.cut(raw_test['Age'], bins=bins, labels=False)


raw_train['Age_cat'].unique()

array([1, 2, 3, 0, 4])

In [840]:
# Creating fare categories.
def create_fare_cat(df):
    df['Fare_cat'] = 0
    df.loc[ df['Fare'] <= 7.91, 'Fare_cat'] = 0
    df.loc[(df['Fare'] > 7.91) & (df['Fare'] <= 14.454), 'Fare_cat'] = 1
    df.loc[(df['Fare'] > 14.454) & (df['Fare'] <= 31), 'Fare_cat'] = 2
    df.loc[ df['Fare'] > 31, 'Fare_cat'] = 3
    df['Fare_cat'] = df['Fare_cat'].astype(int)
    return df

raw_train = create_fare_cat(raw_train)
raw_test = create_fare_cat(raw_test)

raw_train['Fare_cat'].unique()
raw_train['Fare_cat'].info()

<class 'pandas.core.series.Series'>
Int64Index: 891 entries, 1 to 891
Series name: Fare_cat
Non-Null Count  Dtype
--------------  -----
891 non-null    int64
dtypes: int64(1)
memory usage: 13.9 KB


In [841]:
# Extracting titles
raw_train["Title"] = raw_train["Name"].str.extract("([A-Za-z]+\.)", expand=False)     
raw_test["Title"] = raw_test["Name"].str.extract("([A-Za-z]+\.)", expand=False)  

# Getting the unique values of the column
unique_titles = pd.concat((raw_train['Title'], raw_test['Title'])).drop_duplicates()
unique_titles.head(10)

# Dictionary for numerical conversion of column sex
factors_title = {'Mr.': 0, 'Mrs.': 1, 'Miss.': 2, 'Rev.': 3, 'Master.': 4, 'Capt.': 5, 'Lady.': 6, 'Col.': 7, 'Dr.': 8, 'Don.': 9}

# Applying the same conversion to all df
raw_train['Title'] = raw_train['Title'].map(factors_title)
raw_test['Title'] = raw_test['Title'].map(factors_title)

raw_train['Title'].fillna(-1, inplace=True)
raw_test['Title'].fillna(-1, inplace=True)

raw_train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,fsize,is_alone,farepp,Age_cat,Fare_cat,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,185,2,1,1,3.625,1,0,0.0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,106,0,1,1,35.64165,2,3,1.0
3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,185,2,0,0,7.925,1,1,2.0
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,70,2,1,1,26.55,2,3,1.0
5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,185,2,0,0,8.05,2,1,0.0


In [842]:
# Preparing files before starting to fit ML models to the data
y_clean_train = raw_train['Survived']
x_clean_train = raw_train.drop(['Name', 'Ticket', 'Cabin','SibSp', 'Parch', 'fsize','Fare', 'Age', 'Survived'], axis=1)
clean_test = raw_test.drop(['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'fsize', 'Fare', 'Age'], axis=1)

def normalize(df):
    result = df.copy()
    for feature_name in df.columns:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result

x_train = normalize(x_train)
x_valid = normalize(x_valid)

print(x_train.head(30))

### Decision Tree

In [843]:
dtree = DecisionTreeClassifier(random_state=6)

# Define the parameter grid
param_grid = {'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
              'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10]}

gs_dtree = GridSearchCV(dtree, param_grid, cv=10)

# Fit the grid search to the training data
gs_dtree.fit(x_clean_train, y_clean_train)

# Print the best parameters and the best score
print("Best parameters: {}".format(gs_dtree.best_params_))
print("Best score: {:.2f}".format(gs_dtree.best_score_))

Best parameters: {'max_depth': 6, 'min_samples_split': 9}
Best score: 0.82


In [844]:
# Create an instance of the random forest classifier
dtree = RandomForestClassifier(random_state=6)

# Use k-fold cross-validation to evaluate the model
scores = cross_val_score(dtree, x_clean_train, y_clean_train, cv=10)

# Print the mean and standard deviation of the scores
print("Mean score: {:.2f}".format(scores.mean()))
print("Standard deviation: {:.2f}".format(scores.std()))

Mean score: 0.80
Standard deviation: 0.03


predictions_tree = gs_dtree.predict(x_valid)

# Comparaison performance train vs valid
print('Training set score: {:.4f}'.format(gs_dtree.score(x_train, y_train)))
print('Test set score: {:.4f}'.format(gs_dtree.score(x_valid, y_valid)))

# Classification report
print(classification_report(y_valid,predictions_tree))

# Learning curve Tree
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

X = pd.concat([x_train, x_valid], ignore_index=True)
Y = pd.concat([y_train, y_valid], ignore_index=True)

# Generate the training and validation set sizes
train_sizes, train_scores, valid_scores = learning_curve(gs_dtree, X, Y, cv=5, scoring='accuracy')

# Plot the learning curve
plt.plot(train_sizes, train_scores.mean(axis=1), label='Training Score')
plt.plot(train_sizes, valid_scores.mean(axis=1), label='Validation Score')
plt.xlabel('Training Set Size')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

### Random Forest

In [845]:
# Define the parameter grid
param_grid = {'n_estimators': [150, 200, 300, 350],
              'max_depth': [5, 7, 10, 15],
              'min_samples_split': [5, 10, 15, 20, 25]}

# Create an instance of the random forest classifier
rfc = RandomForestClassifier(random_state=6)

# Create a grid search object with the classifier and parameter grid
gs_rfc = GridSearchCV(rfc, param_grid, cv=10)

# Fit the grid search to the training data
gs_rfc.fit(x_clean_train, y_clean_train)

# Print the best parameters and the best score
print("Best parameters: {}".format(gs_rfc.best_params_))
print("Best score: {:.2f}".format(gs_rfc.best_score_))

Best parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 150}
Best score: 0.82


In [846]:
# Use k-fold cross-validation to evaluate the model
scores = cross_val_score(rfc, x_clean_train, y_clean_train, cv=10)

# Print the mean and standard deviation of the scores
print("Mean score: {:.2f}".format(scores.mean()))
print("Standard deviation: {:.2f}".format(scores.std()))

Mean score: 0.80
Standard deviation: 0.03


predictions_rfc = rfc.predict(x_valid)

# Classification report
print(classification_report(y_valid,predictions_rfc))

# Comparaison performance train vs valid
print('Training set score: {:.4f}'.format(rfc.score(x_train, y_train)))
print('Test set score: {:.4f}'.format(rfc.score(x_valid, y_valid)))

# Learning curve Random Forest

# Generate the training and validation set sizes
train_sizes, train_scores, valid_scores = learning_curve(rfc, X, Y, cv=5, scoring='accuracy')

# Plot the learning curve
plt.plot(train_sizes, train_scores.mean(axis=1), label='Training Score')
plt.plot(train_sizes, valid_scores.mean(axis=1), label='Validation Score')
plt.xlabel('Training Set Size')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Boosting

In [847]:
# Define the parameter grid
param_grid = {'learning_rate': [0.1, 0.5, 1.0],
              'n_estimators': [50, 100, 150, 200]}

# Create an instance of the boosting classifier
boost = AdaBoostClassifier(random_state=6)

# Create a grid search object with the classifier and parameter grid
gs_boost = GridSearchCV(boost, param_grid, cv=10)

# Fit the grid search to the training data
gs_boost.fit(x_clean_train, y_clean_train)

# Print the best parameters and the best score
print("Best parameters: {}".format(gs_boost.best_params_))
print("Best score: {:.2f}".format(gs_boost.best_score_))

Best parameters: {'learning_rate': 1.0, 'n_estimators': 200}
Best score: 0.82


# Use k-fold cross-validation to evaluate the model
scores = cross_val_score(gs_boost, x_clean_train, y_clean_train, cv=10)

# Print the mean and standard deviation of the scores
print("Mean score: {:.2f}".format(scores.mean()))
print("Standard deviation: {:.2f}".format(scores.std()))

predictions_boost = boost.predict(x_valid)

print('Training set score: {:.4f}'.format(boost.score(x_train, y_train)))
print('Test set score: {:.4f}'.format(boost.score(x_valid, y_valid)))

print(classification_report(y_valid,predictions_boost))

# Learning curve Tree
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

# Generate the training and validation set sizes
train_sizes, train_scores, valid_scores = learning_curve(boost, X, Y, cv=5, scoring='accuracy')

# Plot the learning curve
plt.plot(train_sizes, train_scores.mean(axis=1), label='Training Score')
plt.plot(train_sizes, valid_scores.mean(axis=1), label='Validation Score')
plt.xlabel('Training Set Size')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

### Light GBM

In [848]:
# Define the parameter grid
param_grid = {'n_estimators': [75, 100, 150, 200],
              'num_leaves': [10, 20, 30],
              'learning_rate': [0.1, 0.05, 0.01],
              'min_child_samples': [10, 15, 20, 25]
              }

ltgbm = lgb.LGBMClassifier(random_state=6)

# Create a grid search object with the classifier and parameter grid
gs_ltgbm = GridSearchCV(ltgbm, param_grid, cv=10)

# Fit the grid search to the training data
gs_ltgbm.fit(x_clean_train, y_clean_train)

# Print the best parameters and the best score
print("Best parameters: {}".format(gs_ltgbm.best_params_))
print("Best score: {:.2f}".format(gs_ltgbm.best_score_))


Best parameters: {'learning_rate': 0.05, 'min_child_samples': 15, 'n_estimators': 200, 'num_leaves': 10}
Best score: 0.84


# Use k-fold cross-validation to evaluate the model
scores = cross_val_score(gs_ltgbm, x_clean_train, y_clean_train, cv=10)

# Print the mean and standard deviation of the scores
print("Mean score: {:.2f}".format(scores.mean()))
print("Standard deviation: {:.2f}".format(scores.std()))

predictions_ltgbm = ltgbm.predict(x_valid)

print('Training set score: {:.4f}'.format(ltgbm.score(x_train, y_train)))
print('Test set score: {:.4f}'.format(ltgbm.score(x_valid, y_valid)))

print(classification_report(y_valid,predictions_ltgbm))

# Learning curve Tree

# Generate the training and validation set sizes
train_sizes, train_scores, valid_scores = learning_curve(ltgbm, X, Y, cv=5, scoring='accuracy')

# Plot the learning curve
plt.plot(train_sizes, train_scores.mean(axis=1), label='Training Score')
plt.plot(train_sizes, valid_scores.mean(axis=1), label='Validation Score')
plt.xlabel('Training Set Size')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

### Applying model on test dataset for submission

In [852]:
predictions_test_rfc = gs_rfc.predict(clean_test)

In [853]:
final_predictions = pd.DataFrame({'Survived': predictions_test_rfc}, index=clean_test.index)
final_predictions

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,0
896,0
...,...
1305,0
1306,1
1307,0
1308,0


In [854]:
final_predictions.to_csv('survival_submission.csv')