In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv


In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load the labelled training split and the unlabelled test split.
train, test = (
    pd.read_csv("/kaggle/input/titanic/train.csv"),
    pd.read_csv("/kaggle/input/titanic/test.csv"),
)

In [4]:
# Preview the first three rows of the training frame.
train.head(n=3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [5]:
# Split the labelled frame into the target column and the remaining features.
y_train = train['Survived']
X_train = train.drop(columns='Survived')

# Data cleaning

In [6]:
# Handling missing values: count the nulls in each column of the training frame.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
class Preprocessor(BaseEstimator, TransformerMixin):
    """Clean the raw Titanic frame and derive engineered features.

    ``fit`` learns the imputation statistics (median ``Age`` / ``Fare`` and the
    most frequent ``Embarked`` value) so that ``transform`` fills every later
    frame with the values learned during fitting instead of re-deriving them
    from whatever frame it is handed.

    Attributes set by ``fit``:
        imputer_: ``SimpleImputer(strategy='median')`` fitted on Age and Fare.
        embarked_mode_: most frequent ``Embarked`` value of the fitted frame.
    """

    def __init__(self):
        pass

    @staticmethod
    def _learn_fill_values(X):
        """Return ``(fitted median imputer for Age/Fare, Embarked mode)`` for ``X``."""
        imputer = SimpleImputer(strategy='median').fit(X[['Age', 'Fare']])
        return imputer, X['Embarked'].mode()[0]

    def fit(self, X, y=None):
        """Learn the imputation statistics from ``X``.

        Args:
            X: DataFrame containing at least ``Age``, ``Fare`` and ``Embarked``.
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            self
        """
        self.imputer_, self.embarked_mode_ = self._learn_fill_values(X)
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X`` with the engineered columns added.

        The input frame is not modified: the first step, dropping
        ``PassengerId``, produces a new DataFrame and all later assignments
        are made on that copy.

        Args:
            X: DataFrame with the columns ``PassengerId``, ``Age``, ``Fare``,
                ``Embarked``, ``Cabin``, ``SibSp``, ``Parch``, ``Sex``,
                ``Pclass`` and ``Name``.

        Returns:
            DataFrame without ``PassengerId`` and ``Name``, with missing
            ``Age`` / ``Fare`` / ``Embarked`` filled, ``Cabin`` reduced to its
            first letter (``'U'`` when missing), and the new columns
            ``FamilySize``, ``IsAlone``, ``Sex_Pclass`` and ``Title``.
        """
        if hasattr(self, 'imputer_'):
            imputer, embarked_mode = self.imputer_, self.embarked_mode_
        else:
            # Backward compatibility: transform() used to work without a prior
            # fit(), deriving the fill values from the frame being transformed.
            imputer, embarked_mode = self._learn_fill_values(X)

        # Remove PassengerId (also gives us a private copy to modify)
        X = X.drop("PassengerId", axis=1)

        # 1. Handle missing values with the statistics learned in fit()
        X[['Age', 'Fare']] = imputer.transform(X[['Age', 'Fare']])
        # Assign the result back: calling fillna(inplace=True) on a column
        # selection does not update the frame under pandas Copy-on-Write.
        X['Embarked'] = X['Embarked'].fillna(embarked_mode)
        X['Cabin'] = X['Cabin'].str[0].fillna('U')

        # 2. Create new features
        X['FamilySize'] = X['SibSp'] + X['Parch'] + 1
        X['IsAlone'] = (X['FamilySize'] == 1).astype(int)

        # 3. Interaction between Sex and Pclass, as an explicit label such as
        #    'male_3'. It defines the same six groups, in the same sorted
        #    order, as repeating the Sex string Pclass times.
        X['Sex_Pclass'] = X['Sex'] + '_' + X['Pclass'].astype(str)

        # 4. Title extraction: the word immediately before the first period
        #    (raw string so that "\." is a regex escape, not a string escape)
        X['Title'] = X['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
        X['Title'] = X['Title'].replace(['Ms', 'Mlle', 'Mme', 'Lady', 'Countess', 'Dona'], 'Miss')
        X['Title'] = X['Title'].replace(['Capt', 'Don', 'Sir', 'Dr', 'Rev', 'Major', 'Col', 'Jonkheer', 'Master'], 'Mr')

        # 5. Drop the Name column since we already extracted Title
        X = X.drop('Name', axis=1)

        return X

In [8]:
# Column groups routed to each branch of the ColumnTransformer
NUMERIC_FEATURES = ['Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'IsAlone']
CATEGORICAL_FEATURES = ['Sex', 'Embarked', 'Title', 'Pclass', 'Sex_Pclass']

# Numeric branch: standard scaling only
numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical branch: one-hot encoding; categories unseen during fit are ignored
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Apply each branch to its own column group
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, NUMERIC_FEATURES),
    ('cat', categorical_transformer, CATEGORICAL_FEATURES),
])

# Main pipeline: custom cleaning / feature creation first, then scaling and encoding
pipeline = Pipeline(steps=[
    ('custom_preprocessor', Preprocessor()),
    ('feature_engineering', preprocessor),
])

In [9]:
# Transform data
# fit_transform fits the pipeline on the training features and returns them
# transformed; the test frame then goes through the already-fitted pipeline
# with transform only.
train_processed = pipeline.fit_transform(X_train)
test_processed = pipeline.transform(test)

In [10]:
# Split data into training and validation sets (80% / 20%, fixed seed).
# The target is read from `train` rather than from `y_train`: this cell rebinds
# `y_train` to the training subset, so using it as the input would make a
# second run of the cell fail on mismatched lengths.
X_train_prosessed, X_val, y_train, y_val = train_test_split(
    train_processed, train['Survived'], test_size=0.2, random_state=42
)

In [11]:
# Base learners for the voting ensemble, all seeded for reproducibility
rf_clf = RandomForestClassifier(random_state=42, n_estimators=100)
lr_clf = LogisticRegression(random_state=42, max_iter=1000)
# probability=True makes SVC expose predict_proba, which soft voting needs
svc_clf = SVC(random_state=42, probability=True)

In [12]:
# Soft-voting ensemble over the three base learners
voting_clf = VotingClassifier(
    estimators=[('rf', rf_clf), ('lr', lr_clf), ('svc', svc_clf)],
    voting='soft',
)

# Fit the ensemble on the training portion of the split
voting_clf.fit(X_train_prosessed, y_train)

In [13]:
# Score the ensemble on the held-out validation rows
y_pred = voting_clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.2f}")

# Predict survival for the test passengers
y_pred_test = voting_clf.predict(test_processed)

# Build the submission table (PassengerId, Survived) and write it without the index
submission = test[['PassengerId']].assign(Survived=y_pred_test)
submission.to_csv('submission.csv', index=False)

Validation Accuracy: 0.81


In [14]:
# Model Building and Evaluation

# 1. Initialize models for comparison
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
# max_iter=1000 matches lr_clf defined earlier and leaves the solver room to converge
logreg_model = LogisticRegression(max_iter=1000, random_state=42)
svc_model = SVC(random_state=42)
# xgb_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss', enable_categorical=False)

# 2. Hyperparameter tuning using GridSearchCV (Random Forest only; the XGBoost search is disabled)

# Random Forest Grid Search: 5-fold CV over every combination below, scored by accuracy
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
rf_grid_search = GridSearchCV(rf_model, rf_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
rf_grid_search.fit(X_train_prosessed, y_train)

# XGBoost Grid Search (disabled; kept as comments rather than a string literal)
# xgb_param_grid = {
#     'learning_rate': [0.05, 0.1, 0.2],
#     'n_estimators': [100, 200],
#     'max_depth': [3, 5, 10],
#     'subsample': [0.7, 1.0],
#     'colsample_bytree': [0.7, 1.0]
# }
# xgb_grid_search = GridSearchCV(xgb_model, xgb_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
# xgb_grid_search.fit(X_train_prosessed, y_train)

# Best model and hyperparameters
print(f"Best Random Forest Params: {rf_grid_search.best_params_}")
#print(f"Best XGBoost Params: {xgb_grid_search.best_params_}")

# 3. Final Model Evaluation

# Use the best estimator found by the grid search
best_rf_model = rf_grid_search.best_estimator_
#best_xgb_model = xgb_grid_search.best_estimator_

# Ensemble of models using Voting Classifier (hard voting: majority of predicted labels)
ensemble_model = VotingClassifier(estimators=[('rf', best_rf_model),
                                              ('logreg', logreg_model),
                                              ('svc', svc_model),
                                              #('xgb', best_xgb_model)
                                              ], voting='hard')

# Train ensemble model
ensemble_model.fit(X_train_prosessed, y_train)

# Evaluate the ensemble model on validation set
y_val_pred = ensemble_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Ensemble Model Validation Accuracy: {val_accuracy * 100:.2f}%")

# 4. Final Test Set Predictions

# Predict on the test set
y_pred_test2 = ensemble_model.predict(test_processed)

Best Random Forest Params: {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}
Ensemble Model Validation Accuracy: 80.45%
