In [15]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [4]:
# Titanic passenger data, read straight from the Data Science Dojo GitHub repo.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"
df = pd.read_csv(url)

# Keep only the modelling features plus the `Survived` target.
# The explicit .copy() makes `df` an independent frame, so any later
# modification of it never acts on a selection derived from the raw frame
# (which may otherwise raise pandas' SettingWithCopyWarning).
df = df[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Survived']].copy()

In [5]:
# Preview the selected columns; pandas abbreviates long frames when rendering.
df

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Survived
0,3,male,22.0,7.2500,S,0
1,1,female,38.0,71.2833,C,1
2,3,female,26.0,7.9250,S,1
3,1,female,35.0,53.1000,S,1
4,3,male,35.0,8.0500,S,0
...,...,...,...,...,...,...
886,2,male,27.0,13.0000,S,0
887,1,female,19.0,30.0000,S,1
888,3,female,,23.4500,S,0
889,1,male,26.0,30.0000,C,1


In [8]:
# Impute missing values: the median for the numeric Age column and the most
# frequent value for the categorical Embarked column.
# Both fill values are computed up front and applied in one non-mutating
# fillna call; rebinding `df` replaces the inplace=True mutation, which gives
# no benefit and can warn when the frame was derived from another one.
# NOTE: both statistics are computed over every row of `df`.
fill_values = {
    "Age": df["Age"].median(),
    "Embarked": df["Embarked"].mode()[0],
}
df = df.fillna(fill_values)

In [9]:
# Separate the target column from the predictor columns.
y = df["Survived"]
X = df.drop("Survived", axis="columns")

In [10]:
# Feature scaling and encoding:
#   - "num": standardise the continuous columns Age and Fare.
#   - "cat": one-hot encode Pclass, Sex and Embarked.
# handle_unknown="ignore" makes the encoder emit an all-zero group for a
# category it did not see during fit instead of raising, so transforming
# held-out or new rows cannot crash on an unseen value.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ["Age", "Fare"]),
        ('cat', OneHotEncoder(handle_unknown="ignore"), ['Pclass', 'Sex', "Embarked"]),
    ]
)



In [13]:
# Fit the preprocessor on the full feature frame once; the resulting matrix
# stays available under the name `X_preprocessed` for other cells.
X_preprocessed = preprocessor.fit_transform(X)

# Logistic-regression baseline, scored with 5-fold cross-validated accuracy.
# Preprocessing and model are wrapped in one Pipeline and cross-validated on
# the raw features: cross_val_score clones and refits the whole pipeline for
# every fold, so the scaler statistics and encoder categories come only from
# that fold's training rows and nothing leaks from the validation rows.
log_model = LogisticRegression()
log_pipeline = Pipeline(steps=[("preprocess", preprocessor), ("model", log_model)])
log_scores = cross_val_score(log_pipeline, X, y, cv=5, scoring="accuracy")
print(log_scores.mean())

0.7890088506685079


In [16]:
# Random-forest baseline with a fixed seed, scored with 5-fold
# cross-validated accuracy on the preprocessed feature matrix.
rf_model = RandomForestClassifier(random_state=42)
rf_scores = cross_val_score(
    estimator=rf_model,
    X=X_preprocessed,
    y=y,
    scoring="accuracy",
    cv=5,
)
print(rf_scores.mean())

0.808097420124286


In [17]:
# Hyperparameter search space for the random forest:
# 3 x 3 x 3 = 27 candidate combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

In [18]:
# Exhaustive grid search over `param_grid` for a seeded random forest,
# selecting by 5-fold cross-validated accuracy and running in a single job.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=1,
)

In [19]:
# Run the search on the preprocessed features, then report the winning
# hyperparameter combination followed by its mean cross-validated accuracy.
grid_search.fit(X_preprocessed, y)
print(grid_search.best_params_, grid_search.best_score_, sep="\n")

{'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
0.8339087314041805
