# In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
import os
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Directory (relative to the current working directory) that holds the CSVs.
TITANIC_PATH = os.path.join("datasets")


def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    """Load one CSV file from the dataset directory as a DataFrame.

    Args:
        filename: Name of the CSV file, joined onto ``titanic_path``.
        titanic_path: Directory containing the file; defaults to
            ``TITANIC_PATH``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    return pd.read_csv(os.path.join(titanic_path, filename))

# Read both CSV files from the dataset directory.
train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")

# Separate the features from the 'Survived' target column.
X_train = train_data.drop(columns='Survived')
y_train = train_data['Survived']

# No copy is made here: X_test and test_data refer to the same DataFrame.
X_test = test_data

# Fill missing Age/Fare values with the column median.
# The medians are learned from the training data only and then reused for the
# test data, so both frames are filled with the same statistics and nothing is
# estimated from the test set.
imputer = SimpleImputer(strategy='median')
imputed_columns = ['Age', 'Fare']
X_train[imputed_columns] = imputer.fit_transform(X_train[imputed_columns])
X_test[imputed_columns] = imputer.transform(X_test[imputed_columns])

# Ticket and Name are not used as features; drop them before encoding so no
# work is spent on columns that are thrown away.
X_train = X_train.drop(['Ticket', 'Name'], axis=1)
X_test = X_test.drop(['Ticket', 'Name'], axis=1)

# Integer-encode every remaining non-numeric column.
# The category list is learned from the training data only and reused for the
# test data, so a given value maps to the same code in both frames (fitting a
# separate encoder per frame would assign unrelated codes). Missing values and
# test values never seen in training get the code -1.
for column in X_train.select_dtypes(exclude='number').columns:
    categories = pd.Categorical(X_train[column]).categories
    X_train[column] = pd.Categorical(X_train[column], categories=categories).codes
    X_test[column] = pd.Categorical(X_test[column], categories=categories).codes

# Stratified 5-fold CV with a fixed seed so the fold assignment is repeatable.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

# PassengerId is a row identifier, not a passenger attribute, so it is kept
# out of the model inputs; it is still used below to label the predictions.
feature_columns = [column for column in X_train.columns if column != 'PassengerId']

# The classifier is seeded so repeated runs select the same parameters.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=123)),
])

# NOTE(review): tree ensembles are generally insensitive to feature scaling,
# so the two scaler options probably only multiply the number of fits by 4 --
# consider removing them after confirming.
param_grid = {
    'scaler__with_mean': [True, False],
    'scaler__with_std': [True, False],
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [2, 5, 20],
    'classifier__min_samples_split': [4, 5, 10],
    # None means "consider all features"; an integer larger than the number
    # of feature columns (the previous value 10) is not a meaningful setting.
    'classifier__max_features': [4, 5, 7, None],
    'classifier__bootstrap': [True],
}

# Exhaustive search over param_grid; the best candidate is refit on all of the
# training data and used for the predictions below.
grid = GridSearchCV(pipeline, param_grid, cv=kfold, return_train_score=True)
grid.fit(X_train[feature_columns], y_train)
print(grid.best_params_)

# Write one prediction per test passenger as PassengerId,Survived.
result = grid.predict(X_test[feature_columns])
df = pd.DataFrame({'PassengerId': X_test['PassengerId'], 'Survived': result})
df.to_csv('predictions.csv', index=False)


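# Recorded cell output (grid.best_params_ from an earlier run; max_features=3 is not in the param_grid above):
# {'classifier__bootstrap': True, 'classifier__max_depth': 5, 'classifier__max_features': 3, 'classifier__min_samples_split': 4, 'classifier__n_estimators': 200, 'scaler__with_mean': True, 'scaler__with_std': False}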
