In [3]:
# Question 2: Feature Engineering & Hyperparameter Tuning on the Titanic Dataset

# Step 1: Load the Titanic dataset (Assume you have a file named titanic.csv ).
# Step 2: Create features and handle missing values.
# Step 3: Train a pipeline using a Random Forest with GridSearchCV.
# Step 4: Evaluate the tuned model with cross-validation.

In [5]:
import pandas as pd
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Step 1: Create a simulated Titanic-like dataset
np.random.seed(42)

n_samples = 200

data = {
    'Pclass': np.random.choice([1, 2, 3], size=n_samples),
    'Sex': np.random.choice(['male', 'female'], size=n_samples),
    'Age': np.random.normal(30, 10, size=n_samples),
    'SibSp': np.random.randint(0, 5, size=n_samples),
    'Parch': np.random.randint(0, 5, size=n_samples),
    'Fare': np.random.uniform(10, 100, size=n_samples),
    'Embarked': np.random.choice(['C', 'Q', 'S'], size=n_samples),
    'Survived': np.random.choice([0, 1], size=n_samples)
}

# Introduce some missing values
df = pd.DataFrame(data)
df.loc[df.sample(frac=0.1).index, 'Age'] = np.nan
df.loc[df.sample(frac=0.05).index, 'Embarked'] = np.nan

# Step 2: Define features and target
X = df.drop('Survived', axis=1)
y = df['Survived']

# Define categorical and numerical columns
categorical_cols = ['Sex', 'Embarked', 'Pclass']
numerical_cols = ['Age', 'SibSp', 'Parch', 'Fare']

# Preprocessing
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# Step 3: Build pipeline and tune hyperparameters
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [5, 10, None],
    'classifier__min_samples_split': [2, 5]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X, y)

# Step 4: Evaluate the tuned model
print("Best Parameters:", grid_search.best_params_)

cv_scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())


# Display basic info
df.head()


Best Parameters: {'classifier__max_depth': 10, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
Cross-Validation Accuracy Scores: [0.45  0.55  0.425 0.475 0.425]
Mean CV Accuracy: 0.46499999999999997


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,3,female,,4,3,10.416882,C,0
1,1,male,31.905987,0,2,40.014925,S,0
2,3,female,36.069757,3,1,45.835182,Q,0
3,3,male,26.510988,2,0,58.365604,S,0
4,1,female,36.061294,0,4,92.787005,S,1
