# Titanic Survival Prediction


**Models Used:**
- Logistic Regression
- Random Forest Classifier

In [1]:
import pandas as pd
import numpy as np
import re

## Load Training and Test Data

In [2]:
# Read the Titanic training and test CSV files into DataFrames.
TRAIN_CSV = "/kaggle/input/titanic/train.csv"
TEST_CSV = "/kaggle/input/titanic/test.csv"

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

## Feature Engineering

In [3]:
def extract_title(name: str) -> str:
    """Return the honorific found in a passenger name, or "" if none is found.

    The title is the first run of ASCII letters that is preceded by a space and
    immediately followed by a period, e.g. "Mr" in "Braund, Mr. Owen Harris".

    Args:
        name: Full passenger name string.

    Returns:
        The matched title without the trailing period, or an empty string when
        the pattern does not occur in ``name``.
    """
    # Raw string literal: in a normal string "\." is an invalid escape sequence,
    # which newer Python versions warn about. The compiled pattern is unchanged.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

# Derive a Title column by running extract_title over every passenger name.
train['Title'] = train['Name'].map(extract_title)

In [4]:
# Consolidated title bucket -> raw titles that fall into it. Variant spellings
# (Mlle, Ms, Mme) share a bucket with Miss/Mrs, and the less common honorifics
# are grouped under "Rare".
_TITLE_BUCKETS = {
    "Mr": ("Mr",),
    "Miss": ("Miss", "Mlle", "Ms"),
    "Mrs": ("Mrs", "Mme"),
    "Master": ("Master",),
    "Rare": (
        "Dr", "Rev", "Col", "Major",
        "Countess", "Don", "Jonkheer", "Sir", "Lady", "Capt",
    ),
}

# Flatten to the raw-title -> bucket lookup used with Series.map.
title_mapping = {
    raw_title: bucket
    for bucket, raw_titles in _TITLE_BUCKETS.items()
    for raw_title in raw_titles
}

In [5]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [6]:
# Feature engineering for the training set.
# NOTE: this cell updates `train` in place. Every step is either derived from
# raw columns or guarded, so re-running the cell reproduces the same result
# instead of re-encoding already encoded columns.

# Consolidate raw titles; anything not in `title_mapping` becomes 'Rare'.
# Title is rebuilt from Name so a re-run does not map integer codes to NaN.
train['Title'] = train['Name'].apply(extract_title).map(title_mapping).fillna('Rare')

# Integer-encode the title buckets; `le` is reused to encode the test set.
le = LabelEncoder()
train['Title'] = le.fit_transform(train['Title'])

# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger).
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1
train['IsAlone'] = (train['FamilySize'] == 1).astype(int)

# Fill missing ages with the median, then cut into 5 equal-width bands.
train['Age'] = train['Age'].fillna(train['Age'].median())
train['AgeBand'] = pd.cut(train['Age'], 5, labels=False)

# Fill missing fares with the median, then cut into 4 quantile bands.
train['Fare'] = train['Fare'].fillna(train['Fare'].median())
train['FareBand'] = pd.qcut(train['Fare'], 4, labels=False)

# Encode Sex as 0/1. `.map` avoids the FutureWarning that `.replace` raises
# when a str -> int replacement downcasts an object column. The guard skips
# the step when the column is already numeric (cell re-run).
# Unlike `.replace`, `.map` turns any value other than 'male'/'female' into NaN.
if not pd.api.types.is_numeric_dtype(train['Sex']):
    train['Sex'] = train['Sex'].map({'male': 0, 'female': 1})

# One-hot encode Embarked (missing values are filled with 'S' first).
# `ohe` is reused to encode the test set.
train['Embarked'] = train['Embarked'].fillna('S')
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
embarked_oh = ohe.fit_transform(train[['Embarked']])
embarked_oh_df = pd.DataFrame(
    embarked_oh,
    columns=ohe.get_feature_names_out(['Embarked']),
    index=train.index,
)

# Append the one-hot columns. Columns left over from a previous run of this
# cell are dropped first so they are not duplicated.
train = pd.concat(
    [train.drop(columns=embarked_oh_df.columns, errors='ignore'), embarked_oh_df],
    axis=1,
)

# Drop identifiers and the raw columns that were replaced by engineered features.
train_processed = train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked', 'SibSp', 'Parch', 'Age', 'Fare'], axis=1)

# Separate features and target.
X = train_processed.drop('Survived', axis=1)
y = train_processed['Survived']

  train['Sex'] = train['Sex'].replace({'male': 0, 'female': 1})


## Split Data & Scale Features

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [8]:
# Hold out 20% of the labelled rows for validation; the split is seeded.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

## Logistic Regression Model

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [10]:
# Fit a logistic regression on the scaled training features.
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_scaled, y_train)

# Score it on the held-out validation rows.
val_predictions_log = log_reg.predict(X_val_scaled)
log_reg_accuracy = accuracy_score(y_val, val_predictions_log)
log_reg_report = classification_report(y_val, val_predictions_log)

print("Logistic Regression Accuracy:", log_reg_accuracy)
print(log_reg_report)

Logistic Regression Accuracy: 0.8044692737430168
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       105
           1       0.78      0.73      0.76        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.80      0.80      0.80       179



## Random Forest Classifier

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [12]:
# Fit a depth-limited random forest (100 trees) on the scaled training features.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
)
rf_model.fit(X_train_scaled, y_train)

# Score it on the held-out validation rows.
val_predictions_rf = rf_model.predict(X_val_scaled)
rf_accuracy = accuracy_score(y_val, val_predictions_rf)
rf_report = classification_report(y_val, val_predictions_rf)

print("Random Forest Accuracy:", rf_accuracy)
print(rf_report)

Random Forest Accuracy: 0.8212290502793296
              precision    recall  f1-score   support

           0       0.82      0.90      0.85       105
           1       0.83      0.72      0.77        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179



## Preprocess Test Data

In [13]:
# Feature engineering for the test set, reusing everything fitted on `train`
# (title encoder, medians, band edges, one-hot encoder) so each feature value
# means the same thing as it did during training.
# NOTE: this cell updates `test` in place. Steps are derived from raw columns
# or guarded so that re-running the cell reproduces the same result.

# Keep the ids for the submission file; PassengerId is dropped from the features below.
passenger_id = test['PassengerId']

# Consolidate raw titles, then encode them with the encoder fitted on train.
# Title is rebuilt from Name so a re-run does not map integer codes to NaN.
test['Title'] = test['Name'].apply(extract_title).map(title_mapping).fillna('Rare')
test['Title'] = le.transform(test['Title'])

# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger).
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1
test['IsAlone'] = (test['FamilySize'] == 1).astype(int)

# Impute with the training medians.
age_median = train['Age'].median()
fare_median = train['Fare'].median()

# Band edges come from the training columns, not from the test data: calling
# pd.cut / pd.qcut on `test` would compute different edges, so the same band
# code would stand for a different range than the one the models were fitted on.
# The outer edges are opened up so test values outside the training range
# still land in the first or last band instead of becoming NaN.
age_edges = pd.cut(train['Age'], 5, retbins=True)[1].copy()
fare_edges = pd.qcut(train['Fare'], 4, retbins=True)[1].copy()
age_edges[0], age_edges[-1] = -np.inf, np.inf
fare_edges[0], fare_edges[-1] = -np.inf, np.inf

test['Age'] = test['Age'].fillna(age_median)
test['AgeBand'] = pd.cut(test['Age'], bins=age_edges, labels=False)

test['Fare'] = test['Fare'].fillna(fare_median)
test['FareBand'] = pd.cut(test['Fare'], bins=fare_edges, labels=False)

# Encode Sex as 0/1. `.map` avoids the FutureWarning that `.replace` raises
# when a str -> int replacement downcasts an object column. The guard skips
# the step when the column is already numeric (cell re-run).
# Unlike `.replace`, `.map` turns any value other than 'male'/'female' into NaN.
if not pd.api.types.is_numeric_dtype(test['Sex']):
    test['Sex'] = test['Sex'].map({'male': 0, 'female': 1})

# One-hot encode Embarked with the encoder fitted on train.
test['Embarked'] = test['Embarked'].fillna('S')
embarked_oh_test = ohe.transform(test[['Embarked']])
embarked_oh_test_df = pd.DataFrame(
    embarked_oh_test,
    columns=ohe.get_feature_names_out(['Embarked']),
    index=test.index,
)

# Append the one-hot columns. Columns left over from a previous run of this
# cell are dropped first so they are not duplicated.
test = pd.concat(
    [test.drop(columns=embarked_oh_test_df.columns, errors='ignore'), embarked_oh_test_df],
    axis=1,
)

# Drop the same identifier and raw columns that were dropped from train.
test_processed = test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked', 'SibSp', 'Parch', 'Age', 'Fare'], axis=1)



  test['Sex'] = test['Sex'].replace({'male': 0, 'female': 1})


In [14]:
# rf_model was fitted on StandardScaler output (X_train_scaled), so the test
# features must go through the same fitted scaler before prediction. Feeding
# the unscaled frame would apply the learned split thresholds to values on a
# different scale. Columns are selected in training order to match the fit.
test_scaled = scaler.transform(test_processed[X_train.columns])
test_predictions = rf_model.predict(test_scaled)

# Build the two-column submission frame and write it without the index.
submission = pd.DataFrame({
    'PassengerId': passenger_id,
    'Survived': test_predictions
})

submission.to_csv("titanic_submission.csv", index=False)
print("created")

created


