In [36]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the training data and build the feature matrix X and target y.
df = pd.read_csv('data/train.csv')

# Keep only the first character of Cabin; rows with no Cabin get 'U'.
df['Cabin'] = df['Cabin'].str[0].fillna('U')
# Fill missing ages with the mean of the observed ages.
df['Age'] = df['Age'].fillna(df['Age'].mean())

# One encoder per column: refitting a single LabelEncoder would discard the
# mapping learned for the previous column, making it impossible to apply the
# same encoding to new data later.
sex_encoder = LabelEncoder()
df['Sex'] = sex_encoder.fit_transform(df['Sex'])
# `encoder` remains the Cabin encoder, so code that refers to `encoder`
# after this cell sees the same fitted state as before.
encoder = LabelEncoder()
df['Cabin'] = encoder.fit_transform(df['Cabin'])

# Drop identifier/free-text columns, the unused Embarked column and the target.
X = df.drop(['PassengerId', 'Name', 'Ticket', 'Survived', 'Embarked'], axis=1)
y = df['Survived']
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin
0,3,1,22.000000,1,0,7.2500,8
1,1,0,38.000000,1,0,71.2833,2
2,3,0,26.000000,0,0,7.9250,8
3,1,0,35.000000,1,0,53.1000,2
4,3,1,35.000000,0,0,8.0500,8
...,...,...,...,...,...,...,...
886,2,1,27.000000,0,0,13.0000,8
887,1,0,19.000000,0,0,30.0000,1
888,3,0,29.699118,1,2,23.4500,8
889,1,1,26.000000,0,0,30.0000,2


In [37]:
# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Hyper-parameters gathered in one place so they are easy to scan and tweak.
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'booster': 'gbtree',
    'max_depth': 5,
    'learning_rate': 0.01,
    'n_estimators': 100,
    'n_jobs': -1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'tree_method': 'hist',
    'random_state': 42,
}
model = XGBClassifier(**xgb_params)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report each metric on the held-out split, one per line.
for label, metric in (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1', f1_score),
):
    print(f'{label}:', metric(y_test, y_pred))

Accuracy: 0.8268156424581006
Precision: 0.8958333333333334
Recall: 0.6231884057971014
F1: 0.7350427350427351


In [38]:
# Score the test set with the trained model and write the submission file.
# Relies on `df`, `X`, `encoder` (fitted on the training Cabin column) and
# `model` from the earlier cells.
test_df = pd.read_csv('data/test.csv')

# Same Cabin rule as for training: first character, 'U' when missing.
test_df['Cabin'] = test_df['Cabin'].str[0].fillna('U')
# The train-fitted encoder cannot encode labels it never saw; fall back to
# 'U' for those (assumes 'U' occurred in the training data -- otherwise
# `transform` below raises a ValueError rather than silently mis-encoding).
test_df['Cabin'] = test_df['Cabin'].where(
    test_df['Cabin'].isin(encoder.classes_), 'U'
)
# Impute with the training mean so missing ages get the same value the model
# saw during fitting (df['Age'] is already mean-filled, which leaves its mean
# unchanged).
test_df['Age'] = test_df['Age'].fillna(df['Age'].mean())
# A throw-away encoder keeps `encoder` untouched. LabelEncoder assigns codes
# in sorted label order, so this matches the training codes as long as the
# test set contains the same set of Sex labels.
test_df['Sex'] = LabelEncoder().fit_transform(test_df['Sex'])
# transform, not fit_transform: refitting on the test data would renumber the
# Cabin codes whenever the test set has a different set of letters.
test_df['Cabin'] = encoder.transform(test_df['Cabin'])
PassengerId = test_df['PassengerId']
# Select exactly the training feature columns, in the training order.
t_train = test_df[X.columns]
t_test = model.predict(t_train)

output = pd.DataFrame({'PassengerId': PassengerId, 'Survived': t_test})
output.to_csv('Submission.csv', index=False)