# Challenge 1: Titanic - Machine Learning From Disaster 

### Khai báo thư viện

In [1]:
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython import display
import joblib

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split

### Nạp dữ liệu

In [5]:
save_dir = "../exps/data"

df_train = pd.read_excel(f"{save_dir}/train.xlsx")
df_valid = pd.read_excel(f"{save_dir}/valid.xlsx")

In [4]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,332,0,1,"Partner, Mr. Austen",1,0.566474,0,0,113043,0.055628,56,2
1,734,0,2,"Berriman, Mr. William John",1,0.28374,0,0,28425,0.025374,147,2
2,383,0,3,"Tikkanen, Mr. Juho",1,0.396833,0,0,STON/O 2. 3101293,0.015469,147,2
3,705,0,3,"Hansen, Mr. Henrik Juul",1,0.321438,1,0,350025,0.01533,147,2
4,814,0,3,"Andersson, Miss. Ebba Iris Alfrida",0,0.070118,4,2,347082,0.061045,147,2


In [6]:
df_valid.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",1,0.346569,1,1,2661,0.029758,147,0
1,440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",1,0.384267,0,0,C.A. 18723,0.020495,147,2
2,841,0,3,"Alhomaki, Mr. Ilmari Rudolf",1,0.246042,0,0,SOTON/O2 3101287,0.015469,147,2
3,721,1,2,"Harper, Miss. Annie Jessie ""Nina""",0,0.070118,0,1,248727,0.064412,147,2
4,40,1,3,"Nicola-Yarred, Miss. Jamila",0,0.170646,1,0,2651,0.021942,147,0


### Tách X, y

In [18]:
X_train = df_train.drop("Survived", axis=1)
X_train = X_train.drop("Name",axis=1)
X_train = X_train.drop("Ticket",axis=1)
y_train = df_train["Survived"]

X_valid = df_valid.drop("Survived", axis=1)
X_valid = X_valid.drop("Name", axis=1)
X_valid = X_valid.drop("Ticket", axis=1)
y_valid = df_valid["Survived"]

print(f"Train shape: {X_train.shape}, Valid shape: {X_valid.shape}")

Train shape: (712, 9), Valid shape: (179, 9)


### Lựa chọn mô hình

In [22]:
models = {
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('SVM', SVC(probability=True)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('XGBoost', XGBClassifier(n_estimators=200,random_state=42,eval_metric='logloss'))
}

### Huấn luyện và đánh giá từng mô hình

In [23]:
results = []

for name,model in models:
    model.fit(X_train,y_train)
    y_pred = model.predict(X_valid)

    acc = accuracy_score(y_valid, y_pred)
    prec = precision_score(y_valid, y_pred)
    rec = recall_score(y_valid, y_pred)
    f1 = f1_score(y_valid, y_pred)

    results.append([name, acc, prec, rec, f1])
    print(classification_report(y_valid, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8182    0.8571    0.8372       105
           1     0.7826    0.7297    0.7552        74

    accuracy                         0.8045       179
   macro avg     0.8004    0.7934    0.7962       179
weighted avg     0.8035    0.8045    0.8033       179

              precision    recall  f1-score   support

           0     0.8364    0.8762    0.8558       105
           1     0.8116    0.7568    0.7832        74

    accuracy                         0.8268       179
   macro avg     0.8240    0.8165    0.8195       179
weighted avg     0.8261    0.8268    0.8258       179

              precision    recall  f1-score   support

           0     0.8142    0.8762    0.8440       105
           1     0.8030    0.7162    0.7571        74

    accuracy                         0.8101       179
   macro avg     0.8086    0.7962    0.8006       179
weighted avg     0.8096    0.8101    0.8081       179

              preci

In [24]:
df_results = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1"])
df_results = df_results.sort_values(by="F1", ascending=False)
display.display(df_results)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
1,Gradient Boosting,0.826816,0.811594,0.756757,0.783217
2,Random Forest,0.810056,0.80303,0.716216,0.757143
0,Logistic Regression,0.804469,0.782609,0.72973,0.755245
4,XGBoost,0.776536,0.736111,0.716216,0.726027
5,Decision Tree,0.743017,0.670732,0.743243,0.705128
3,SVM,0.631285,0.75,0.162162,0.266667


Nhận xét:
+ Gradient Boosting cho kết quả cao nhất