<a href="https://colab.research.google.com/github/aditya301cs/Daily-Data-Science-ML/blob/main/Machine_Learning_Model_Comparison_on_Titanic_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Model Evaluation and Comparison Using Train‚ÄìTest Split and Cross-Validation


# Import Required Libraries

In [1]:
import numpy as np
import pandas as pd

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder # Added OneHotEncoder here
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Evaluation
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

#Load Dataset

In [2]:
df = pd.read_csv("titanic.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## üìÅ Titanic Dataset Overview

Target Variable:
- **Survived** ‚Üí 0 = Did not survive, 1 = Survived

Feature Types:
- Numerical: Age, Fare, SibSp, Parch
- Categorical: Sex, Embarked, Pclass
- Irrelevant for prediction: PassengerId, Name, Ticket, Cabin


#Drop Irrelevant Columns

In [3]:
df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])


#Feature & Target Separation

In [4]:
X = df.drop("Survived", axis=1)
y = df["Survived"]


#Exploratory Data Analysis (EDA)

In [5]:
# Dataset information
df.info()

# Statistical summary
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
print('Missing values per column:')
display(df.isnull().sum())

print('\nTarget variable distribution:')
display(df['Survived'].value_counts())

Missing values per column:


Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,177
SibSp,0
Parch,0
Fare,0
Embarked,2



Target variable distribution:


Unnamed: 0_level_0,count
Survived,Unnamed: 1_level_1
0,549
1,342


#Identify Numerical & Categorical Columns

In [7]:
num_features = ["Age", "Fare", "SibSp", "Parch"]
cat_features = ["Sex", "Embarked", "Pclass"]


#Preprocessing Pipelines

In [8]:
# Numerical preprocessing
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical preprocessing
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [10]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(712, 7)
(179, 7)
(712,)
(179,)


#Initialize Models

In [11]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}


# Train‚ÄìTest Split Evaluation

In [12]:
from sklearn.compose import ColumnTransformer

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features)
    ],
    remainder='passthrough'
)

print("üîπ Train‚ÄìTest Split Accuracy\n")

train_test_results = {}

for name, model in models.items():
    # Create a full pipeline that first preprocesses and then trains the model
    full_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                      ('classifier', model)])

    full_pipeline.fit(X_train, y_train)
    y_pred = full_pipeline.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    train_test_results[name] = acc

    print(f"{name}: {acc:.4f}")

üîπ Train‚ÄìTest Split Accuracy

Logistic Regression: 0.7989
Support Vector Machine: 0.8156
Random Forest: 0.8212
Gradient Boosting: 0.8212


# Cross-Validation Evaluation

In [13]:
print("\n\ud83d\udd39 10-Fold Cross-Validation Accuracy\n")

cv_results = {}

for name, model in models.items():
    # Create a full pipeline including preprocessing for cross-validation
    full_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                      ('classifier', model)])

    scores = cross_val_score(full_pipeline, X, y, cv=10, scoring="accuracy")
    cv_results[name] = np.mean(scores)

    print(f"{name}: Mean CV Accuracy = {np.mean(scores):.4f}")

ERROR:tornado.application:Exception in callback functools.partial(<bound method OutStream._flush of <ipykernel.iostream.OutStream object at 0x798e7bc7b310>>)
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/jupyter_client/session.py", line 104, in json_packer
    ).encode("utf8", errors="surrogateescape")
      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'utf-8' codec can't encode characters in position 30-31: surrogates not allowed

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/tornado/ioloop.py", line 758, in _run_callback
    ret = callback()
          ^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/ipykernel/iostream.py", line 518, in _flush
    self.session.send(
  File "/usr/local/lib/python3.12/dist-packages/jupyter_client/session.py", line 848, in send
    to_send = self.serialize(msg, ident)
              ^^^^^^

Logistic Regression: Mean CV Accuracy = 0.7991
Support Vector Machine: Mean CV Accuracy = 0.8238
Random Forest: Mean CV Accuracy = 0.8171
Gradient Boosting: Mean CV Accuracy = 0.8249


# Compare Results

In [14]:
results_df = pd.DataFrame({
    "Train-Test Accuracy": train_test_results,
    "Cross-Validation Accuracy": cv_results
})

results_df.sort_values(by="Cross-Validation Accuracy", ascending=False)


Unnamed: 0,Train-Test Accuracy,Cross-Validation Accuracy
Gradient Boosting,0.821229,0.824944
Support Vector Machine,0.815642,0.82382
Random Forest,0.821229,0.817141
Logistic Regression,0.798883,0.799126


## üß† Model Evaluation ‚Äì Key Learnings (Titanic Dataset)

- Train‚Äìtest accuracy gives a quick performance snapshot
- Cross-validation provides a more reliable estimate of generalization
- Differences between train-test and CV scores indicate model stability
- The model with the highest CV accuracy is preferred for tuning

This systematic comparison ensures robust model selection
before applying hyperparameter tuning.


# Interview-Ready One-Liner (Memorize)
- I evaluated multiple classifiers on the Titanic dataset using both train‚Äìtest split and cross-validation, and selected the model with the best cross-validated performance for further tuning.