<a href="https://colab.research.google.com/github/adams-gc/projects/blob/main/TITANIC_CLASSIFICATION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set plot style: apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook
plt.style.use('ggplot')


In [None]:
# Load Titanic dataset
# The directory and file names live in one place so the notebook can be pointed
# at another location without editing the read calls. The defaults reproduce
# the original Colab upload paths exactly.
DATA_DIR = '/content'
TRAIN_CSV = f'{DATA_DIR}/train (1).csv'
TEST_CSV = f'{DATA_DIR}/test (1).csv'

train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)


In [None]:
# Quick data overview: shapes, column info, summary statistics and null counts
print("############ Train data shape ############")
print(f"Train data shape: {train_data.shape}")
print("############ Test data shape ############")
print(f"Test data shape: {test_data.shape}")
print("############ Train data info ############")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line.
train_data.info()
print("############ Train data summary statistics ############")
print(train_data.describe())
print("############ Train data null counts ############")
print(f"Null values:\n{train_data.isnull().sum()}")


In [None]:
# Visualizing missing values: heatmap of the boolean null mask of train_data
null_mask = train_data.isnull()
missing_ax = sns.heatmap(null_mask, cbar=False)
missing_ax.set_title("Missing Values Heatmap")
plt.show()


In [None]:
# Summary statistics for numerical columns
numeric_summary = train_data.describe()
print("Summary statistics of numerical columns:")
print(numeric_summary)


In [None]:
# Analyze survival distribution: pie chart of shares (left panel) and raw counts (right panel)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
pie_ax, count_ax = axes[0], axes[1]

survival_counts = train_data['Survived'].value_counts()
survival_counts.plot.pie(autopct='%1.1f%%', ax=pie_ax, explode=[0, 0.1], shadow=True)
sns.countplot(x='Survived', data=train_data, ax=count_ax)

pie_ax.set_title('Survival Distribution')
count_ax.set_title('Survival Count')
plt.show()

In [None]:
# Gender-based survival analysis: survivor / non-survivor counts, coloured by sex
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=train_data, x='Survived', hue='Sex', palette='coolwarm', ax=ax)
ax.set_title("Survival Distribution by Gender")
ax.set_xlabel("Survived")
ax.set_ylabel("Count")
ax.legend(title='Gender')
plt.show()

In [None]:
# Number of passengers in every (Sex, Survived) combination
sex_survival_groups = train_data.groupby(['Sex', 'Survived'])
sex_survival_groups['Survived'].count()

As we can see, a high percentage of the survivors were female.

In [None]:
# Class-based survival analysis: survivor / non-survivor counts, coloured by passenger class
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=train_data, x='Survived', hue='Pclass', palette='Set1', ax=ax)
ax.set_title("Survival Distribution by Passenger Class")
ax.set_xlabel("Survived")
ax.set_ylabel("Count")
ax.legend(title='Passenger Class')
plt.show()

In [None]:
# Crosstab of survival and class for a tabular view (rows: Survived, columns: Pclass)
class_survival_table = pd.crosstab(train_data['Survived'], train_data['Pclass'])
print("Survival counts by class:")
print(class_survival_table)


In [None]:
# Age distribution analysis by survival
# A split violin draws its two halves from the two levels of `hue`, so the
# binary 'Survived' column is used as hue and the passenger classes go on the
# x-axis. The original combined split=True with the multi-level 'Pclass' hue,
# which seaborn releases before 0.13 reject with a ValueError.
plt.figure(figsize=(10, 6))
sns.violinplot(data=train_data, x='Pclass', y='Age', hue='Survived', split=True, palette='Set3')
plt.title("Age Distribution by Survival and Class")
plt.xlabel("Passenger Class")
plt.ylabel("Age")
plt.legend(title='Survived')
plt.show()

In [None]:
# Minimum, maximum and mean of the 'Age' column in the training data
age_column = train_data["Age"]
youngest_age = age_column.min()
oldest_age = age_column.max()
mean_age = round(age_column.mean(),2)
print(f'The youngest Passenger: {youngest_age}')
print(f'The Oldest Passenger: {oldest_age}')
print(f'The average age Passenger: {mean_age} ')

Observations: passengers in class 1 were more likely to survive.
- The survival rate for ages 20-40 in Pclass 1 is high.



In [None]:

# Extract titles from names for better understanding of passenger types
# The titles like 'Mr.', 'Mrs.', etc., may help in analyzing age or survival trends
# The pattern captures the run of letters immediately before the first '.' in the name.
# It is a raw string so that `\.` reaches the regex engine as an escaped dot rather
# than being treated as an (invalid) Python string escape.
train_data['Title'] = train_data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
print("Extracted Titles:")
print(train_data['Title'].value_counts())


In [None]:
# Fill missing ages based on the average age for each title
# Create a dictionary mapping titles to their mean ages
title_age_map = train_data.groupby('Title')['Age'].mean().to_dict()
# Vectorised replacement for the row-wise apply: map every row's title to its
# group mean and use that only where Age is missing. A title absent from the
# map yields NaN (the age stays missing) instead of raising KeyError.
title_mean_age = train_data['Title'].map(title_age_map)
train_data['Age'] = train_data['Age'].fillna(title_mean_age)
print("Filled missing Age values based on Title averages.")


In [None]:
# 'Initial' holds the honorific taken from each name (the letters right before the first '.').
# One vectorised extract fills the whole column; the original looped over the
# DataFrame's column names and repeated this identical assignment on every pass.
# expand=False returns a Series (not a one-column DataFrame), and the raw string
# keeps `\.` as a regex escape.
train_data['Initial'] = train_data['Name'].str.extract(r"([A-Za-z]+)\.", expand=False)
# Sex (rows) versus extracted initial (columns)
pd.crosstab(train_data['Initial'], train_data['Sex']).T

In [None]:
# Work on an independent copy: plain assignment would only alias train_data, so
# every later in-place edit of df would silently modify the loaded frame too.
df = train_data.copy()

In [None]:

# Verify if there are any remaining missing values in the 'Age' column
remaining_missing_age = df['Age'].isnull().sum()
print("Missing values in 'Age' after imputation:")
print(remaining_missing_age)

In [None]:
# Visualize the age distribution after filling missing values (20-bin histogram with KDE overlay)
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['Age'], bins=20, kde=True, color='blue', ax=ax)
ax.set_title("Age Distribution After Imputation")
ax.set_xlabel("Age")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# True if at least one 'Age' value is still missing
df['Age'].isna().any()

Embarked

In [None]:
# Cross-tabulate (Embarked, Pclass) against (Sex, Survived) with totals, then transpose for display
port_class_keys = [df['Embarked'], df['Pclass']]
sex_survival_keys = [df['Sex'], df['Survived']]
pd.crosstab(port_class_keys, sex_survival_keys, margins=True).T

In [None]:
# Handle missing values in the 'Embarked' column by filling with the most common value
most_common_embarked = df['Embarked'].mode()[0]  # Find the most frequent value
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated in pandas 2.x and
# does not update df when Copy-on-Write is enabled.
df['Embarked'] = df['Embarked'].fillna(most_common_embarked)
print(f"Missing values in 'Embarked' after imputation: {df['Embarked'].isnull().sum()}")


In [None]:
# Visualize the distribution of the 'Embarked' column (passenger count per embarkation code)
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='Embarked', palette='pastel', ax=ax)
ax.set_title("Distribution of Embarked Locations")
ax.set_xlabel("Embarked")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Fill missing values in 'Fare' column with the median value
median_fare = df['Fare'].median()
# Assign the result back rather than using the chained fillna(inplace=True),
# which is deprecated in pandas 2.x and ineffective under Copy-on-Write.
df['Fare'] = df['Fare'].fillna(median_fare)
print(f"Missing values in 'Fare' after imputation: {df['Fare'].isnull().sum()}")



In [None]:
# Visualize the distribution of the 'Fare' column (30-bin histogram with KDE overlay)
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['Fare'], bins=30, kde=True, color='green', ax=ax)
ax.set_title("Fare Distribution")
ax.set_xlabel("Fare")
ax.set_ylabel("Frequency")
plt.show()

The majority of passengers in every class embarked from S.

In [None]:
# Remove identifier / free-text columns that are not used for analysis or modeling
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']
df.drop(labels=columns_to_drop, axis=1, inplace=True)
print("Dropped unnecessary columns.")

# Report the frame's (rows, columns) after the drop
updated_shape = df.shape
print(f"Updated Dataset Shape: {updated_shape}")


Feature engineering



In [None]:
# Feature Engineering: Creating new features to improve model performance

# 1. Family size = siblings/spouses aboard ('SibSp') + parents/children aboard ('Parch')
#    + 1 for the passenger themselves
relatives_aboard = df['SibSp'].add(df['Parch'])
df['FamilySize'] = relatives_aboard.add(1)
print("Created 'FamilySize' feature.")
print(df['FamilySize'])


In [None]:
# Visualizing Family Size: number of passengers for each family-size value
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='FamilySize', palette='coolwarm', ax=ax)
ax.set_title("Family Size Distribution")
ax.set_xlabel("Family Size")
ax.set_ylabel("Count")
plt.show()

In [None]:
# 2. Binary flag for passengers travelling alone: 1 when FamilySize is exactly 1, else 0
travels_alone = df['FamilySize'].eq(1)
df['IsAlone'] = travels_alone.astype(int)
print("Created 'IsAlone' feature.")
print(df['IsAlone'])

In [None]:
# 3. Categorizing passengers into Age Groups
# One label per interval between consecutive bin edges
labels = ['Child', 'Teen', 'Young Adult', 'Adult', 'Senior']
bins = [0, 12, 18, 35, 60, np.inf]
age_groups = pd.cut(df['Age'], bins=bins, labels=labels)
df['AgeGroup'] = age_groups
print("Created 'AgeGroup' feature.")
print(df['AgeGroup'])


In [None]:
# Visualizing Age Group Distribution: passenger count per age group
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='AgeGroup', palette='viridis', ax=ax)
ax.set_title("Age Group Distribution")
ax.set_xlabel("Age Group")
ax.set_ylabel("Count")
plt.show()

In [None]:
# 4. Fare per person: the passenger's fare divided by their family size
fare_share = df['Fare'].div(df['FamilySize'])
df['FarePerPerson'] = fare_share
print("Created 'FarePerPerson' feature.")
print(df['FarePerPerson'])

In [None]:
# 5. Flag "premium" passengers: first class AND fare above the overall median fare
is_first_class = df['Pclass'] == 1
fare_above_median = df['Fare'] > df['Fare'].median()
df['PremiumClass'] = (is_first_class & fare_above_median).astype(int)
print("Created 'PremiumClass' feature.")
print(df["PremiumClass"])

In [None]:
# Visualizing Premium Class Distribution: count of flagged vs. unflagged passengers
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='PremiumClass', palette='mako', ax=ax)
ax.set_title("Premium Class Distribution")
ax.set_xlabel("Premium Class (1: Premium, 0: Not Premium)")
ax.set_ylabel("Count")
plt.show()


In [None]:

# 6. One-hot encode 'AgeGroup' for modeling; drop_first omits the first category's indicator column
age_group_columns = ['AgeGroup']
df = pd.get_dummies(data=df, columns=age_group_columns, drop_first=True)
print("Encoded 'AgeGroup' feature.")
df

In [None]:
# 7. Survival by Family Size: each row receives the mean of 'Survived' over all
#    rows sharing its FamilySize value.
# NOTE(review): this feature is computed from the target column itself over the
# whole frame, so it carries label information — confirm it is excluded from (or
# recomputed inside) any train/validation split before it is used as a model input.
family_groups = df.groupby('FamilySize')
df['SurvivalByFamily'] = family_groups['Survived'].transform('mean')
print("Created 'SurvivalByFamily' feature (mean survival rate by family size).")
df['SurvivalByFamily']

In [None]:
# 8. Title Normalization (Revisited)
# Collapse the extracted titles into broader categories with a single lookup:
# uncommon honorifics become 'Rare', and the variants Mlle/Ms/Mme are folded
# into 'Miss'/'Mrs'.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
               'Rev', 'Sir', 'Jonkheer', 'Dona']
title_groups = {rare_title: 'Rare' for rare_title in rare_titles}
title_groups.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
df['Title'] = df['Title'].replace(title_groups)
print("Normalized 'Title' feature.")

# Turn 'Sex', 'Embarked' and 'Title' into dummy/indicator columns
# (drop_first omits one indicator per original column)
categorical_columns = ['Sex', 'Embarked', 'Title']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
print("Converted categorical columns into dummy variables.")

# Show the first rows of the engineered dataset
print("Dataset after feature engineering:")
print(df.head())


Predictive modeling

In [None]:
# Step 1: Import Required Libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Display the column labels of the engineered frame
df.columns

In [None]:
# Display the column labels of the raw test set (`data` is an Index of names, not a DataFrame)
data=test_data.columns
data

In [None]:
# Step 2: Prepare Data for Modeling
# Separating features (X) and target variable (y)
# NOTE(review): X below is built from test_data while y comes from df (the
# training frame), so the two describe different sets of passengers. Both names
# are reassigned from df in a later cell — confirm whether this cell is still needed.
# X = df.drop(columns=['Survived'])  # Drop unused columns
# y = df['Survived']
#feature=test_data
# Columns removed from the raw test set before encoding
feature = ['PassengerId','Name', 'Age','Ticket','Fare', 'Cabin']
X = test_data.drop(columns=feature) # Changed data to test_data and removed axis argument
y = df['Survived']
# dropna() returns a new Series; here it only affects what the cell displays, y itself is unchanged
y.dropna()

In [None]:
# One-hot encode the categorical columns of X (drop_first omits one indicator per column), then display it
encoded_features = pd.get_dummies(data=X, drop_first=True)
X = encoded_features
X

In [None]:
# Step 2: Prepare Data for Modeling
# Separate features (X) and target variable (y) from the engineered training frame.
# 'SurvivalByFamily' is left out of the features: it is the mean of 'Survived'
# per family size computed over every row, so it would carry the labels of the
# held-out rows into training (target leakage) and inflate the reported scores.
# errors='ignore' keeps this cell working when that column was never created.
X = df.drop(columns=['Survived', 'SurvivalByFamily'], errors='ignore')
y = df['Survived']

# One-hot encode any categorical features that are still present
X = pd.get_dummies(X, drop_first=True)

# Splitting data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Step 3: Logistic Regression Model — fit on the training split, predict the test split
log_model = LogisticRegression(max_iter=1000, random_state=42)
y_pred_log = log_model.fit(X_train, y_train).predict(X_test)

# Evaluate Logistic Regression: accuracy plus the per-class report
log_accuracy = accuracy_score(y_test, y_pred_log)
log_report = classification_report(y_test, y_pred_log)
print("Logistic Regression Performance:")
print("Accuracy:", log_accuracy)
print(log_report)


In [None]:
# Step 4: Random Forest Model — 100 trees, fit on the training split, predict the test split
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
y_pred_rf = rf_model.fit(X_train, y_train).predict(X_test)

# Evaluate Random Forest: accuracy plus the per-class report
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
print("Random Forest Performance:")
print("Accuracy:", rf_accuracy)
print(rf_report)


In [None]:

# Step 5: XGBoost Model
# `use_label_encoder` is no longer passed: the option is deprecated and has been
# removed from current XGBoost releases, where supplying it only produces an
# "unused parameter" warning. The remaining settings are unchanged.
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate XGBoost: accuracy plus the per-class report
print("XGBoost Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))


In [None]:
# Step 6: ROC Curve and AUC for the three fitted models on the test split
models = {'Logistic Regression': log_model, 'Random Forest': rf_model, 'XGBoost': xgb_model}
fig, ax = plt.subplots(figsize=(10, 7))

for name, model in models.items():
    # Column 1 of predict_proba is the predicted probability of class 1
    y_proba = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    auc_score = roc_auc_score(y_test, y_proba)
    curve_label = f'{name} (AUC = {auc_score:.2f})'
    ax.plot(fpr, tpr, label=curve_label)

# Dashed diagonal: reference line for random guessing
ax.plot([0, 1], [0, 1], 'k--', lw=2)
ax.set_title("ROC Curve")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend()
plt.show()

In [None]:
# Step 7: Confusion Matrix for Best Model (Random Forest Example)
# Annotated heatmap of actual (rows) versus predicted (columns) classes on the test split
class_names = ['Not Survived', 'Survived']
cm = confusion_matrix(y_test, y_pred_rf)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title("Confusion Matrix - Random Forest")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


In [None]:
# !pip install --upgrade xgboost
# !pip install --upgrade scikit-learn
# #!pip install scikit-learn==1.0



One option is to wrap XGBClassifier inside a custom scikit-learn estimator to avoid the `__sklearn_tags__` issue. This wrapper implements a simple interface around XGBClassifier so that scikit-learn's cross-validation can be used.

In [None]:
# from sklearn.base import BaseEstimator, ClassifierMixin
# from xgboost import XGBClassifier

# class XGBClassifierWrapper(BaseEstimator, ClassifierMixin):
#     def __init__(self, **kwargs):
#         self.model = XGBClassifier(**kwargs)

#     def fit(self, X, y):
#         self.model.fit(X, y)
#         return self

#     def predict(self, X):
#         return self.model.predict(X)

#     def score(self, X, y):
#         return self.model.score(X, y)

# # Now use this wrapper in your models dictionary
# models = {
#     'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
#     'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
#     'XGBoost': XGBClassifierWrapper(n_estimators=100, learning_rate=0.1, random_state=42, use_label_encoder=False, eval_metric='logloss', objective='binary:logistic')
# }

# # Cross-validation with StratifiedKFold
# for name, model in models.items():
#     cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#     cv_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
#     print(f"{name} Cross-Validation Accuracy Scores: {cv_scores}")
#     print(f"{name} Mean Cross-Validation Accuracy: {np.mean(cv_scores):.4f}")
#     print(f"{name} Standard Deviation of Cross-Validation Accuracy: {np.std(cv_scores):.4f}")


Use XGBClassifier without `cross_val_score`
If none of the above solutions work, you can perform cross-validation manually by splitting the dataset into folds and fitting each model on them yourself, without using `cross_val_score`.

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Define the models to compare
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# Initialize the cross-validation strategy (5 stratified, shuffled folds)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Store cross-validation scores for each model
cv_scores_dict = {}

# Perform cross-validation for each model
for name, model in models.items():
    cv_scores = []
    for train_index, test_index in cv.split(X, y):
        # Fold-specific names are used on purpose: reusing X_train / X_test /
        # y_train / y_test here would overwrite the hold-out split created
        # earlier and change the result of re-running the evaluation cells.
        X_fold_train, X_fold_val = X.iloc[train_index], X.iloc[test_index]
        y_fold_train, y_fold_val = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_fold_train, y_fold_train)
        # score() is the accuracy on the fold's validation part
        cv_scores.append(model.score(X_fold_val, y_fold_val))

    cv_scores_dict[name] = np.array(cv_scores)

    print(f"{name} Cross-Validation Accuracy Scores: {cv_scores}")
    print(f"{name} Mean Cross-Validation Accuracy: {np.mean(cv_scores):.4f}")
    print(f"{name} Standard Deviation of Cross-Validation Accuracy: {np.std(cv_scores):.4f}")

# Create a box plot for the models' cross-validation accuracy scores
plt.figure(figsize=(8, 6))  # Set figure size for better visibility

# Draw the boxplot (one horizontal box per model, in insertion order)
sns.boxplot(data=list(cv_scores_dict.values()), orient='h', palette='Set2')

# Label the y-axis ticks with the model names; the names are taken from the same
# dict as the plotted scores and passed as a real list rather than a dict view
plt.yticks(np.arange(len(cv_scores_dict)), list(cv_scores_dict.keys()), fontsize=12)

# Title and labels
plt.title('Model Comparison: Cross-Validation Accuracy Distribution', fontsize=16, fontweight='bold')
plt.xlabel('Accuracy', fontsize=14)

# Show gridlines
plt.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)

# Show the plot
plt.show()


Confusion Matrix
Confusion Matrix gives the number of correct and incorrect classifications made by the classifier.