# Import libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load datasets

In [None]:
# Read both Titanic CSVs and tag every row with the file it came from, so the
# two frames can be stacked into one and still be told apart afterwards.
train_data = pd.read_csv('/kaggle/input/titanic/train.csv').assign(source='train')
test_data = pd.read_csv('/kaggle/input/titanic/test.csv').assign(source='test')

# Single combined frame: train rows first, then test rows, with a fresh 0..n-1 index.
data = pd.concat([train_data, test_data], ignore_index=True)

# EDA

## Training set

In [None]:
# Quick textual overview of the training frame: column names, a sample of rows,
# numeric summary statistics, and the per-column count of missing values.
# Each call prints the heading and the summary on separate lines.
print("Columns in the dataset:", train_data.columns, sep="\n")

print("\nFirst few rows of the training data:", train_data.head(), sep="\n")

print("\nSummary statistics of the training data:", train_data.describe(), sep="\n")

print("\nMissing values in the dataset:", train_data.isnull().sum(), sep="\n")

## Test set

In [None]:
# Same overview as for the training frame, applied to the test frame: column
# names, a sample of rows, numeric summary statistics, and missing-value counts.
# Each call prints the heading and the summary on separate lines.
print("Columns in the dataset:", test_data.columns, sep="\n")

print("\nFirst few rows of the test data:", test_data.head(), sep="\n")

print("\nSummary statistics of the test data:", test_data.describe(), sep="\n")

print("\nMissing values in the dataset:", test_data.isnull().sum(), sep="\n")

# Feature Engineering

In [None]:
# Stack the train and test frames so every transformation below is applied
# identically to both; the 'source' column is what lets them be separated again
# at the end of this cell.
# NOTE: this cell overwrites train_data/test_data with the engineered frames, so
# it must be run once after the load cell. Running it a second time fails,
# because 'Name' (and the other raw columns) have already been dropped.
data = pd.concat([train_data, test_data], ignore_index=True)

# Extract Title from Name: the first run of letters immediately followed by a period.
# The pattern is a raw string so that `\.` is passed to the regex engine as-is
# instead of being treated as an (invalid) Python string escape.
data['Title'] = data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
# Collapse the listed uncommon titles into a single 'Rare' bucket.
data['Title'] = data['Title'].replace(['Lady', 'Countess', 'Capt', 'Col',
                                       'Don', 'Dr', 'Major', 'Rev', 'Sir',
                                       'Jonkheer', 'Dona'], 'Rare')
# Map alternative spellings onto the corresponding common title.
data['Title'] = data['Title'].replace(['Mlle', 'Ms'], 'Miss')
data['Title'] = data['Title'].replace('Mme', 'Mrs')

# Create Family Size feature: siblings/spouses + parents/children + the passenger.
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

# Create IsAlone feature: 1 when the passenger has no family aboard, else 0.
data['IsAlone'] = (data['FamilySize'] == 1).astype(int)

# Fare binning: four quantile-based buckets. Any row with a missing Fare gets a
# missing FareBin, which get_dummies(dummy_na=True) below encodes as its own column.
data['FareBin'] = pd.qcut(data['Fare'], 4, labels=['Low', 'Medium', 'High', 'Very_High'])

# Drop identifiers and the raw columns that the engineered features replace.
data = data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'Fare'])

# One-hot encode the categorical columns. drop_first removes one level per column;
# dummy_na adds an explicit indicator column for missing values.
categorical_cols = ['Sex', 'Embarked', 'Title', 'FareBin']
data = pd.get_dummies(data, columns=categorical_cols, dummy_na=True, drop_first=True)

# Split back into the two original sets. 'Survived' is dropped from the test frame
# because it is the target to be predicted there.
train_data = data[data['source'] == 'train'].drop(columns=['source'])
test_data = data[data['source'] == 'test'].drop(columns=['source', 'Survived'])


## Print features

In [None]:
# Show the engineered feature columns (plus a sample of rows) for the training
# frame, then the columns of the test frame for comparison.
print("Columns in the training dataset:", train_data.columns, train_data.head(), sep="\n")

print("Columns in the testing dataset:", test_data.columns, sep="\n")

# Model Building

## Prepare training and validation data

In [None]:
from sklearn.model_selection import train_test_split

# Separate the engineered features from the target column.
X = train_data.drop('Survived', axis=1)
# Use integer class labels for the classifier and the metrics.
# NOTE(review): 'Survived' is presumably float here because the train rows were
# concatenated with test rows that carry no label — confirm; the cast assumes
# no training row has a missing label.
y = train_data['Survived'].astype(int)

# Hold out 20% of the labelled rows for validation. stratify=y keeps the class
# proportions the same in both parts, matching the StratifiedKFold used for
# cross-validation; random_state fixes the shuffle for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## Train an XGBoost classifier

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score

# Columns that are standardised; every other column is passed through unchanged.
numerical_cols = ['Age', 'FamilySize']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols)
    ],
    remainder='passthrough'
)

# The target is binary, so the matching evaluation metric is 'logloss';
# 'mlogloss' is the multiclass variant.
# NOTE(review): use_label_encoder is deprecated/ignored in recent XGBoost
# releases — confirm the installed version before removing it.
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Scaling lives inside the pipeline so that it is re-fitted on the training part
# of every cross-validation fold.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb_clf)
])

# Hyper-parameter grid; the 'classifier__' prefix routes each entry to the
# 'classifier' step of the pipeline. 2*2*3*2*2*2 = 96 combinations.
param_grid = {
    'classifier__n_estimators': [100, 300],
    'classifier__max_depth': [3, 5],
    'classifier__learning_rate': [0.01, 0.05, 0.1],
    'classifier__subsample': [0.8, 1.0],
    'classifier__min_child_weight': [1, 3],
    'classifier__gamma': [0, 0.1]
}

# Shuffled, class-stratified 5-fold CV with a fixed seed for reproducibility.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search scored by accuracy; n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=cv,
                           verbose=1,
                           n_jobs=-1)

# Only the training split is used here; X_val/y_val stay untouched for evaluation.
grid_search.fit(X_train, y_train)

# Winning hyper-parameter combination and the corresponding fitted pipeline.
best_params = grid_search.best_params_
final_model = grid_search.best_estimator_
print(f"Best Parameters: {best_params}")

# Predict on the data the model was trained on and on the held-out validation rows.
train_predictions = final_model.predict(X_train)
val_predictions = final_model.predict(X_val)

# Training accuracy is measured on rows the model has already seen, so the
# validation figures are the ones that estimate generalisation.
train_accuracy = accuracy_score(y_train, train_predictions)
val_accuracy = accuracy_score(y_val, val_predictions)
val_f1_score = f1_score(y_val, val_predictions)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Validation F1 Score: {val_f1_score:.4f}")



# Submission

In [None]:
# PassengerId was dropped during feature engineering, so re-read the raw test
# file to recover it for the submission.
original_test = pd.read_csv('/kaggle/input/titanic/test.csv')
predictions = final_model.predict(test_data)

# Prepare submission DataFrame: one row per test passenger, with the predicted
# class written as an integer.
submission = original_test[['PassengerId']].assign(Survived=predictions.astype(int))
submission.to_csv('submission.csv', index=False)