# Import libraries

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


# Load datasets

In [2]:
train_data = pd.read_csv('/kaggle/input/titanic/train.csv')
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')
train_data['source'] = 'train'
test_data['source'] = 'test'
data = pd.concat([train_data, test_data], ignore_index=True)

# EDA

## Training set

In [3]:
print("Columns in the dataset:")
print(train_data.columns)

print("\nFirst few rows of the training data:")
print(train_data.head())

print("\nSummary statistics of the training data:")
print(train_data.describe())

print("\nMissing values in the dataset:")
print(train_data.isnull().sum())

Columns in the dataset:
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'source'],
      dtype='object')

First few rows of the training data:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked source  
0      0         A/5 21171   7.2500

## Test data

In [4]:
print("Columns in the dataset:")
print(test_data.columns)

print("\nFirst few rows of the test data:")
print(test_data.head())

print("\nSummary statistics of the test data:")
print(test_data.describe())

print("\nMissing values in the dataset:")
print(test_data.isnull().sum())

Columns in the dataset:
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'source'],
      dtype='object')

First few rows of the test data:
   PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles, Mr. Thomas Francis    male   
3          895       3                              Wirz, Mr. Albert    male   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   

    Age  SibSp  Parch   Ticket     Fare Cabin Embarked source  
0  34.5      0      0   330911   7.8292   NaN        Q   test  
1  47.0      1      0   363272   7.0000   NaN        S   test  
2  62.0      0      0   240276   9.6875   NaN        Q   test  
3  27.0      0      0   315154   8.6625   NaN        S

# Feature Engineering

In [5]:
data = pd.concat([train_data, test_data], ignore_index=True)

# Extract Title from Name
data['Title'] = data['Name'].str.extract('([A-Za-z]+)\.', expand=False)
data['Title'] = data['Title'].replace(['Lady', 'Countess', 'Capt', 'Col',
                                       'Don', 'Dr', 'Major', 'Rev', 'Sir',
                                       'Jonkheer', 'Dona'], 'Rare')
data['Title'] = data['Title'].replace(['Mlle', 'Ms'], 'Miss')
data['Title'] = data['Title'].replace('Mme', 'Mrs')

# Create Family Size Feature
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

# Create IsAlone Feature
data['IsAlone'] = 0 
data.loc[data['FamilySize'] == 1, 'IsAlone'] = 1

#### Age Binning
data['AgeBin'] = pd.cut(data['Age'], bins=[0, 12, 20, 40, 120],
                        labels=['Child', 'Teenager', 'Adult', 'Elder'])

#### Fare Binning
data['FareBin'] = pd.qcut(data['Fare'], 4, labels=['Low', 'Medium', 'High', 'Very_High'])

# Drop unnecessary features
data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch'], axis=1, inplace=True)

categorical_cols = ['Sex', 'Embarked', 'Title', 'AgeBin', 'FareBin']
data = pd.get_dummies(data, columns=categorical_cols, dummy_na=True)

train_data = data[data['source'] == 'train'].drop(['source'], axis=1)
test_data = data[data['source'] == 'test'].drop(['source', 'Survived'], axis=1)

# Model Building

## Prepare test data and training data

In [6]:
from sklearn.model_selection import train_test_split

X = train_data.drop('Survived', axis=1)
y = train_data['Survived']

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Use XGBoosting

In [7]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.7, 0.8, 1.0],
}

grid_search = GridSearchCV(estimator=xgb_clf, 
                           param_grid=param_grid, 
                           scoring='accuracy',
                           cv=5, 
                           verbose=1, 
                           n_jobs=-1) 

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

print(f"Best Parameters: {best_params}")

final_model = xgb.XGBClassifier(
    **best_params,
    use_label_encoder=False,
    eval_metric='logloss'
)

final_model.fit(X, y)

train_predictions = final_model.predict(X)

train_accuracy = accuracy_score(y, train_predictions)
print(f"Training Accuracy: {train_accuracy:.4f}")


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.8}
Training Accuracy: 0.8620


# Submission

In [8]:
predictions = final_model.predict(test_data)
original_test = pd.read_csv('/kaggle/input/titanic/test.csv')

# Prepare submission DataFrame
submission = pd.DataFrame({
    'PassengerId': original_test['PassengerId'],
    'Survived': predictions.astype(int)
})
submission.to_csv('submission.csv', index=False)