# Import libraries

In [25]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


# Load datasets

In [26]:
train_data = pd.read_csv('/kaggle/input/titanic/train.csv')
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')

passenger_id = test_data['PassengerId']
full_data = [train_data, test_data]

# Feature Engineering

In [27]:
for dataset in full_data:
    # Extract Titles from Names
    dataset['Title'] = dataset['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    
    # Replace rare titles with 'Rare' and standardize common titles
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
     'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    
    # Map titles to numerical values
    title_mapping = {'Mr':1, 'Miss':2, 'Mrs':3, 'Master':4, 'Rare':5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)
    
    # Create FamilySize feature
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    
    # Create IsAlone feature
    dataset['IsAlone'] = 1  # Initialize to 1 (means alone)
    dataset.loc[dataset['FamilySize'] > 1, 'IsAlone'] = 0  # Update to 0 if not alone
    
    # Fill missing Embarked with mode
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])
    
    # Map Embarked to numerical values
    embarked_mapping = {'S':0, 'C':1, 'Q':2}
    dataset['Embarked'] = dataset['Embarked'].map(embarked_mapping)
    
    # Map Sex to numerical values
    sex_mapping = {'male':0, 'female':1}
    dataset['Sex'] = dataset['Sex'].map(sex_mapping)
    
    # Fill missing Fare with median
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())
    
    # Create Fare bands and map to numerical values
    dataset['FareBin'] = pd.qcut(dataset['Fare'], 4)
    fare_mapping = {interval: idx for idx, interval in enumerate(dataset['FareBin'].unique().categories)}
    dataset['Fare'] = dataset['FareBin'].map(fare_mapping)
    dataset['Fare'] = dataset['Fare'].astype(int)
    dataset.drop('FareBin', axis=1, inplace=True)
    
    # Create Age bands and map to numerical values
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())
    dataset['AgeBin'] = pd.qcut(dataset['Age'], 4)
    age_mapping = {interval: idx for idx, interval in enumerate(dataset['AgeBin'].unique().categories)}
    dataset['Age'] = dataset['AgeBin'].map(age_mapping)
    dataset['Age'] = dataset['Age'].astype(int)
    dataset.drop('AgeBin', axis=1, inplace=True)
    
    # Drop unnecessary features
    dataset.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch'], axis=1, inplace=True)


# EDA

## Training set

In [28]:
print("Columns in the dataset:")
print(train_data.columns)

print("\nFirst few rows of the training data:")
print(train_data.head())

print("\nSummary statistics of the training data:")
print(train_data.describe())

print("\nMissing values in the dataset:")
print(train_data.isnull().sum())

Columns in the dataset:
Index(['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Title',
       'FamilySize', 'IsAlone'],
      dtype='object')

First few rows of the training data:
   Survived  Pclass  Sex  Age  Fare  Embarked  Title  FamilySize  IsAlone
0         0       3    0    0     0         0      1           2        0
1         1       1    1    3     3         1      3           2        0
2         1       3    1    1     1         0      2           1        1
3         1       1    1    2     3         0      3           2        0
4         0       3    0    2     1         0      1           1        1

Summary statistics of the training data:
         Survived      Pclass         Sex         Age        Fare    Embarked  \
count  891.000000  891.000000  891.000000  891.000000  891.000000  891.000000   
mean     0.383838    2.308642    0.352413    1.379349    1.497194    0.361392   
std      0.486592    0.836071    0.477990    1.114652    1.118156    0.635673   
m

## Test data

In [29]:
print("Columns in the dataset:")
print(test_data.columns)

print("\nFirst few rows of the test data:")
print(test_data.head())

print("\nSummary statistics of the test data:")
print(test_data.describe())

print("\nMissing values in the dataset:")
print(test_data.isnull().sum())

Columns in the dataset:
Index(['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Title', 'FamilySize',
       'IsAlone'],
      dtype='object')

First few rows of the test data:
   Pclass  Sex  Age  Fare  Embarked  Title  FamilySize  IsAlone
0       3    0    2     0         2      1           1        1
1       3    1    3     0         0      3           2        0
2       2    0    3     1         2      1           1        1
3       3    0    1     1         0      1           1        1
4       3    1    0     1         0      3           3        0

Summary statistics of the test data:
           Pclass         Sex         Age        Fare    Embarked       Title  \
count  418.000000  418.000000  418.000000  418.000000  418.000000  418.000000   
mean     2.265550    0.363636    1.373206    1.473684    0.464115    1.741627   
std      0.841838    0.481622    1.133510    1.140292    0.685516    1.006091   
min      1.000000    0.000000    0.000000    0.000000    0.000000    1.000000   


# Model Building

## Prepare test data and training data

In [30]:
X_train = train_data.drop('Survived', axis=1)
y_train = train_data['Survived']
X_test = test_data.copy()

## Train a random forest classifier and a gradient boosting model with grid search

In [31]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold

models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42)
}

params = {
    'RandomForest': {
        'n_estimators': [100, 200],
        'max_depth': [5, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
    },
    'GradientBoosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5],
    }
}

best_estimators = {}
for name in models:
    grid = GridSearchCV(models[name], params[name], cv=5, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train, y_train)
    best_estimators[name] = grid.best_estimator_
    cv_score = cross_val_score(grid.best_estimator_, X_train, y_train, cv=5, scoring='accuracy')
    print(f"{name} Cross-Validation Accuracy: {cv_score.mean():.4f}")

best_model = max(best_estimators.items(), key=lambda x: cross_val_score(x[1], X_train, y_train, cv=5).mean())[1]
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)



RandomForest Cross-Validation Accuracy: 0.8283
GradientBoosting Cross-Validation Accuracy: 0.8238


# Submission

In [34]:
predictions = best_xgb_model.predict(X_test)

submission = pd.DataFrame({
    'PassengerId': passenger_id,
    'Survived': y_pred
})
submission.to_csv('submission.csv', index=False)