In [None]:
#pip installs and imports
%pip install scikit-learn 
%pip install pandas
%pip install optuna
%pip install xgboost
%pip install lightgbm
%pip install imbalanced-learn
%pip install plotly

In [150]:
import pandas as pd
import numpy as np
import optuna


import plotly.graph_objects as go
import plotly.express as px
import plotly.offline as py

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.feature_selection import mutual_info_classif


In [151]:
# Cap how many rows pandas shows when a frame is displayed.
pd.set_option('display.max_rows', 50)

# Read the training and test CSVs from the working directory.
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

# Keep the target aside and build a label-free copy of the training features.
y = train_df['Survived']
train_features_df = train_df.drop(columns=['Survived'])

# Stack train rows first, then test rows, renumbering the index from 0,
# so feature engineering is applied to both sets in one pass.
X = pd.concat([train_features_df, test_df], ignore_index=True)

                                                             Feature Engineering

In [152]:
# Encode 'Sex' numerically via a lookup table: male -> 0, female -> 1.
sex_to_code = {'male': 0, 'female': 1}
X['Sex'] = X['Sex'].map(sex_to_code)

In [153]:
# Box plot of Fare per Embarked value, with one box per passenger class.
#
# category_orders entries are matched against the values stored in the column.
# Pclass is presumably parsed from the CSV as integers (TODO confirm), in which
# case the string labels "1"/"2"/"3" would match nothing and the requested
# order would be ignored. Plot from a copy whose Pclass is cast to string so
# the labels and the data agree; X itself is left unchanged.
plot_df = X.assign(Pclass=X["Pclass"].astype(str))
fig = px.box(
    plot_df,
    x="Embarked",
    y="Fare",
    color="Pclass",
    title="Fare distribution across Embarked and Pclass categories",
    labels={"Fare": "Fare", "Embarked": "Embarked", "Pclass": "Pclass"},
    category_orders={"Pclass": ["1", "2", "3"]},  # draw the classes in 1, 2, 3 order
)

# Show the plot
fig.show()

In [154]:
# Manual fill-ins for the three gaps in Embarked / Fare:
# Passenger 62 (missing Embarked): Fare = 80$, Pclass = 1, therefore Embarked = 'C', because the median of 'C' is at 76.7$ and the others are much lower.
# Passenger 830 (missing Embarked): Fare = 80$, Pclass = 1, therefore Embarked = 'C'.
# Passenger 1044 (missing Fare): Pclass = 3, Embarked = S. Median of that group is 8.05, therefore Fare = 8.05.
#
# Rows are selected by PassengerId instead of by row position, so the fix does
# not silently hit the wrong passenger if the row order or index changes.
X.loc[X['PassengerId'].isin([62, 830]), 'Embarked'] = 'C'
X.loc[X['PassengerId'] == 1044, 'Fare'] = 8.05

In [155]:
# Remove the columns that are not used as model features.
X = X.drop(columns=['PassengerId', 'Ticket', 'Cabin', 'Name'])

In [156]:
# One-hot encode the categorical / count columns in a single pass.
# With no explicit prefix, each indicator column is named "<column>_<value>",
# and the indicator groups are appended in the order listed here.
one_hot_columns = ['Embarked', 'Pclass', 'Sex', 'SibSp', 'Parch']
X = pd.get_dummies(X, columns=one_hot_columns)

In [157]:
# Fill every remaining missing value with scikit-learn's IterativeImputer
# (seeded so repeated runs give the same imputations).
# NOTE(review): the imputer is fitted on train and test rows together, so test
# features influence the values imputed for training rows — confirm this is intended.
imputer = IterativeImputer(max_iter=10, random_state=0)
imputed_values = imputer.fit_transform(X)

# The transform result is wrapped back into a DataFrame; pass the original row
# index along with the column names so rows stay aligned with the frame they
# came from even if its index is not the default 0..n-1.
X_imputed = pd.DataFrame(imputed_values, columns=X.columns, index=X.index)

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

X = X_imputed

X

Unnamed: 0,Age,Fare,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3,Sex_0,Sex_1,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Parch_9
0,22.000000,7.2500,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,38.000000,71.2833,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,26.000000,7.9250,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,35.000000,53.1000,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,35.000000,8.0500,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,27.918938,8.0500,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1305,39.000000,108.9000,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1306,38.500000,7.2500,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1307,27.918938,8.0500,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [163]:
# Split X back into training and test sets.
# X was built with the training rows first, and y holds one label per training
# row, so the boundary is derived from y rather than hard-coding the row count.
n_train = len(y)
X_train = X.iloc[:n_train]
X_test = X.iloc[n_train:]
X_train.shape, y.shape, X_test.shape

((891, 25), (891,), (418, 25))

In [169]:
# Initialize a random forest classifier
rf = RandomForestClassifier(n_estimators=400, random_state=42)

# Fit on the full training set, then score on those same rows (as a percentage).
# This is accuracy on data the model has already seen, not an estimate of how
# well it generalizes. The score is computed once; the earlier duplicate call
# whose result was discarded has been removed.
rf.fit(X_train, y)
rf_accuracy = round(rf.score(X_train, y) * 100, 2)
rf_accuracy

98.2

In [178]:
# Random forest configuration to evaluate.
rf = RandomForestClassifier(random_state=42, n_estimators=400)

# Score the model with 5-fold cross-validation on the training rows.
scores = cross_val_score(estimator=rf, X=X_train, y=y, cv=5)

# Report the per-fold scores and their average.
print("Accuracy for each fold: ", scores)
print("Mean cross-validation accuracy: ", np.mean(scores))

# Fit on all training rows so `rf` can be used for prediction afterwards.
rf.fit(X_train, y)

Accuracy for each fold:  [0.75977654 0.79775281 0.85955056 0.78089888 0.83707865]
Mean cross-validation accuracy:  0.8070114870378507


In [179]:
# Make predictions on the test data
predictions = rf.predict(X_test)

# Build the submission frame: one 'Survived' column indexed by PassengerId.
# The ids are read from the test file itself instead of being regenerated from
# a hard-coded starting id, so each prediction stays paired with the id of the
# row it was made for.
passenger_ids = pd.Index(test_df["PassengerId"].to_numpy(), name="PassengerId")
submission = pd.DataFrame({"Survived": predictions}, index=passenger_ids)

# The named index is written as the first CSV column.
submission.to_csv('submission2.csv')