In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Define paths to data
data_train_path = 'input/train.csv'
data_test_path = 'input/test.csv'

# Load train and test data
train_df = pd.read_csv(data_train_path)
test_df = pd.read_csv(data_test_path)

In [2]:
def clean_and_engineer_features(df_in):
    df = df_in.copy()
    # --- 1. Handle Missing Values ---
    df['Age'].fillna(df['Age'].median(), inplace=True)
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
    df['Fare'].fillna(df['Fare'].median(), inplace=True)

    # --- 2. Feature Engineering ---
    # Extract Title from Name
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')

    # Create FamilySize feature
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    # Create IsAlone feature
    df['IsAlone'] = 0
    df.loc[df['FamilySize'] == 1, 'IsAlone'] = 1

    # Extract Cabin Initial
    df['CabinInitial'] = df['Cabin'].str[0]
    df['CabinInitial'].fillna('U', inplace=True) # U for Unknown

    # --- 3. Create Dummy Variables for Categorical Features ---
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

    categorical_cols = ['Embarked', 'Pclass', 'Title', 'CabinInitial']
    df_dummies = pd.get_dummies(df[categorical_cols], drop_first=True, dtype=int)
    df = pd.concat([df, df_dummies], axis=1)
    df.drop(columns=categorical_cols, inplace=True)

    # --- 4. Normalize Numerical Columns ---
    numerical_cols = ['Age', 'Fare', 'SibSp', 'Parch', 'FamilySize']
    scaler = MinMaxScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    # --- 5. Final Data Preparation ---
    df.drop(columns=['Ticket', 'Cabin', 'Name'], inplace=True)
    return df

In [3]:
train_data_cleaned = clean_and_engineer_features(train_df)
test_data_cleaned = clean_and_engineer_features(test_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we

In [4]:
X = train_data_cleaned.drop(columns=['Survived', 'PassengerId'])
y = train_data_cleaned['Survived']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_test = test_data_cleaned.drop(columns=['PassengerId'])
X_test_aligned = X_test.reindex(columns=X_train.columns, fill_value=0)

xgb_classifier = XGBClassifier(random_state=42)
xgb_classifier.fit(X_train, y_train)

y_pred_val = xgb_classifier.predict(X_val)

accuracy = accuracy_score(y_val, y_pred_val)
print(f"Model Accuracy with XGBoost: {accuracy:.4f}")

Model Accuracy with XGBoost: 0.7989


In [5]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.05, 0.01],
    'subsample': [0.8, 0.9, 1.0]
}

grid_search = GridSearchCV(estimator=XGBClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=5,
                           n_jobs=-1,
                           verbose=2)

grid_search.fit(X_train, y_train)

print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best accuracy found: {grid_search.best_score_:.4f}")

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters found: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.9}
Best accuracy found: 0.8399


In [6]:
best_xgb = grid_search.best_estimator_
y_pred_test = best_xgb.predict(X_test_aligned)

submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': y_pred_test
})

submission.to_csv('submission_advanced.csv', index=False)

print('Submission file with tuned XGBoost model created successfully!')

Submission file with tuned XGBoost model created successfully!


In [8]:
!kaggle competitions submit -c titanic -f submission_advanced.csv -m "Submission using XGBoost tuned"

100%|██████████████████████████████████████| 2.77k/2.77k [00:00<00:00, 2.89kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster