In [None]:
import pandas as pd

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
import numpy as np

In [5]:
# Load the dataset
titanic_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
data = pd.read_csv(titanic_url)


In [6]:
# Preprocess the Dataset

# Handle Missing Values
numeric_features = ['Age', 'Fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['Embarked', 'Sex', 'Pclass']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [7]:
# Remove Duplicate Data
data = data.drop_duplicates()
# Split the dataset
X = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
y = data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Create a preprocessing and modeling pipeline
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', RandomForestClassifier())])

In [9]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_features': [ 'sqrt'],
    'classifier__max_depth': [4, 6, 8],
    'classifier__criterion': ['gini', 'entropy']
}

In [10]:
# Perform Grid Search
grid_search = GridSearchCV(model_pipeline, param_grid, cv=5, scoring='precision')
grid_search.fit(X_train, y_train)

In [11]:
# Best model from grid search
best_model = grid_search.best_estimator_

In [12]:
# Evaluate the best model
y_pred = best_model.predict(X_test)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("RandomForestClassifier - Best Model from GridSearchCV")
print(f"Precision: {precision}")
print(f"Recall: {recall}")


RandomForestClassifier - Best Model from GridSearchCV
Precision: 0.8275862068965517
Recall: 0.6486486486486487


In [13]:
model_pipeline_lr = Pipeline(steps=[('preprocessor', preprocessor),
                                    ('classifier', LogisticRegression(max_iter=200))])

In [14]:
# Define the parameter grid for Logistic Regression
param_grid_lr = {
    'classifier__penalty': ['l1', 'l2', 'elasticnet', None],
    'classifier__C': np.logspace(-4, 4, 20),
    'classifier__solver': ['lbfgs', 'liblinear', 'saga'],
    'classifier__l1_ratio': [0.5]  # l1_ratio is used only if penalty='elasticnet'
}

In [15]:
# Filter incompatible combinations
param_grid_lr_filtered = [
    {'classifier__penalty': ['l1'], 'classifier__C': np.logspace(-4, 4, 20), 'classifier__solver': ['liblinear', 'saga']},
    {'classifier__penalty': ['l2'], 'classifier__C': np.logspace(-4, 4, 20), 'classifier__solver': ['lbfgs', 'liblinear', 'saga']},
    {'classifier__penalty': ['elasticnet'], 'classifier__C': np.logspace(-4, 4, 20), 'classifier__solver': ['saga'], 'classifier__l1_ratio': [0.5]},
    {'classifier__penalty': [None], 'classifier__C': np.logspace(-4, 4, 20), 'classifier__solver': ['lbfgs', 'saga']}
]

In [16]:
# Perform Randomized Search
random_search = RandomizedSearchCV(model_pipeline_lr, param_distributions=param_grid_lr_filtered, n_iter=100, cv=5, scoring='recall', random_state=42)
random_search.fit(X_train, y_train)



In [17]:
# Best model from random search
best_model_lr = random_search.best_estimator_

In [18]:
# Evaluate the best model
y_pred_lr = best_model_lr.predict(X_test)
precision_lr = precision_score(y_test, y_pred_lr)
recall_lr = recall_score(y_test, y_pred_lr)

In [19]:
print("LogisticRegression - Best Model from RandomizedSearchCV")
print(f"Precision: {precision_lr}")
print(f"Recall: {recall_lr}")

LogisticRegression - Best Model from RandomizedSearchCV
Precision: 0.7571428571428571
Recall: 0.7162162162162162
