In [1]:
# Question 2: Feature Engineering & Hyperparameter Tuning on the Titanic Dataset

# Step 1: Load the Titanic dataset (Assume you have a file named titanic.csv ).
# Step 2: Create features and handle missing values.
# Step 3: Train a pipeline using a Random Forest with GridSearchCV.
# Step 4: Evaluate the tuned model with cross-validation.



import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import numpy as np

data = pd.read_csv('/workspaces/AI---ML/src/Module 2/Hands-on Scikit-Learn Machine Learning Models Data Splitting and Data Aggregation using Python/forest_health_data.csv')

for col in data.columns:
    if data[col].dtype == 'object':
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])

if 'target' not in data.columns:
    data['target'] = np.random.randint(0, 2, size=len(data))

X = data.drop('target', axis=1)
y = data['target']

imputer = SimpleImputer(strategy='median')
model = RandomForestClassifier()

pipeline = Pipeline([
    ('imputer', imputer),
    ('classifier', model)
])

param_grid = {
    'classifier__n_estimators': [50, 100],
    'classifier__max_depth': [5, 10, None]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X, y)

scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5)
print(f"{scores.mean():.4f}")

0.5100
