# **Step 1: lib**

In [2]:
# Data Science Core
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# EDA
from sklearn.preprocessing import LabelEncoder

# Model
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_validate
from xgboost import XGBClassifier

# Evaluation
from sklearn.metrics import accuracy_score

pd.set_option('display.max_columns', None)
sns.set_theme()

# **Step 2: Load Data**

In [3]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
fullset = pd.concat([df_train, df_test], axis=0)

In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# **Step 3: EDA**

In [5]:
# df_train[["SibSp", "Survived"]].groupby(['SibSp'], as_index=False).mean().sort_values(by='Survived', ascending=False)
# df_train[['Parch', 'Survived']].groupby(['Parch'], as_index = False).mean().sort_values(by = 'Survived', ascending = False)
# df_train[['Fare', 'Survived']].groupby(['Fare'], as_index = False).mean().sort_values(by = 'Survived', ascending = False)
df_train[['Embarked', 'Survived']].groupby(['Embarked'], as_index = False).mean().sort_values(by = 'Survived', ascending = False)
# df_train[['Cabin', 'Survived']].groupby(['Cabin'], as_index = False).mean().sort_values(by = 'Survived', ascending = False)

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.336957


In [6]:
df_train['Age'].fillna(df_train['Age'].median(), inplace = True)
df_train['Embarked'].fillna(df_train['Embarked'].mode()[0], inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['Age'].fillna(df_train['Age'].median(), inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['Embarked'].fillna(df_train['Embarked'].mode()[0], inplace = True)


In [7]:
df_train = df_train.drop(['Cabin', 'Fare', 'Ticket', 'Name'], axis = 'columns')

# **Step 4: Feature Engineering**

In [8]:
categorical_features_binary = ["Sex"]
categorical_features_onehot = ["Embarked"]

label_encoder = LabelEncoder()
for feature in categorical_features_binary:
    df_train[feature] = label_encoder.fit_transform(df_train[feature])

In [9]:
df_train = pd.get_dummies(df_train, columns=categorical_features_onehot, dtype=int)

# **Step 5: Modeling**

In [10]:
X = df_train.drop(['Survived'], axis=1)
y = df_train['Survived']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

clf = XGBClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_valid)
accuracy = accuracy_score(y_valid, y_pred)

print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.77
