In [74]:
import pandas as pd


df_train = pd.read_csv("final_training.csv")
df_test = pd.read_csv("final_testing.csv")


sex_map = {"male": 0, "female": 1}
df_train["Sex"] = df_train["Sex"].map(sex_map)
df_test["Sex"] = df_test["Sex"].map(sex_map)


print(df_train["Sex"].value_counts())
print(df_test["Sex"].value_counts())


Sex
0    577
1    314
Name: count, dtype: int64
Sex
0    266
1    152
Name: count, dtype: int64


In [75]:
embarked_train = pd.get_dummies(df_train["Embarked"], prefix="Embarked")
embarked_test = pd.get_dummies(df_test["Embarked"], prefix="Embarked")

embarked_train, embarked_test = embarked_train.align(embarked_test, join="outer", axis=1, fill_value=0)

df_train = pd.concat([df_train.drop("Embarked", axis=1), embarked_train], axis=1)
df_test = pd.concat([df_test.drop("Embarked", axis=1), embarked_test], axis=1)


In [76]:
df_train[["Embarked_C","Embarked_Q","Embarked_S"]] = df_train[["Embarked_C","Embarked_Q","Embarked_S"]].astype(int)
df_test[["Embarked_C","Embarked_Q","Embarked_S"]] = df_test[["Embarked_C","Embarked_Q","Embarked_S"]].astype(int)

In [77]:
df_train.drop(columns=["Ticket_Prefix"], inplace=True)
df_test.drop(columns=["Ticket_Prefix"], inplace=True)

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

y = df_train["Survived"]
X = df_train.drop("Survived", axis=1)

passenger_ids = df_test["PassengerId"] if "PassengerId" in df_test.columns else None
X_test = df_test.drop("PassengerId", axis=1, errors="ignore")

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize XGBoost
model = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric="logloss"
)

# Train
model.fit(X_train, y_train)

# Validate
y_pred = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

# Predict on test set
y_test_pred = model.predict(X_test)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Validation Accuracy: 0.8100558659217877
              precision    recall  f1-score   support

           0       0.83      0.86      0.85       110
           1       0.77      0.72      0.75        69

    accuracy                           0.81       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.81      0.81      0.81       179



In [83]:
submission = pd.DataFrame({
    "PassengerId": passenger_ids,
    "Survived": y_test_pred
})
submission.to_csv("xgb_submission_final.csv", index=False)
