<a href="https://www.kaggle.com/code/bisheshchakraborty/titanic-rf-with-gridsearchcv?scriptVersionId=145143848" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
%config Completer.use_jedi = False

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Read the Kaggle Titanic training and test CSV files into DataFrames
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
    )
)

In [None]:
# Summarise the training frame: column dtypes and non-null counts per column
train_data.info()

In [None]:
# View the first 50 rows of the training data
train_data.head(50)

In [None]:
# Basic statistical summary (count, mean, std, min, quartiles, max) of the numeric columns
train_data.describe()

<h2>Drop irrelevant columns</h2>

In [None]:
# Drop the 'Name', 'Cabin' and 'Ticket' columns from both splits.
# The result is assigned back (instead of inplace=True) and errors='ignore' is
# passed so that re-running this cell after the columns are already gone does
# not raise a KeyError.
train_data = train_data.drop(columns=['Name', 'Cabin', 'Ticket'], errors='ignore')

test_data = test_data.drop(columns=['Name', 'Cabin', 'Ticket'], errors='ignore')

<h2>Check for duplicate rows</h2>

In [None]:
# Show any fully duplicated rows in each split.
# A notebook cell only renders its last expression automatically, so display()
# (provided by the IPython kernel) is called explicitly to make both results
# visible instead of silently discarding the training-set check.
display(train_data[train_data.duplicated()])

display(test_data[test_data.duplicated()])

<h2>Impute missing values</h2>

In [None]:
# Count the missing (NaN) values in each column of the training data
train_data.isna().sum()

In [None]:
# Frequency of each 'Embarked' category; dropna=False also counts the NaN entries
train_data['Embarked'].value_counts(dropna=False)

In [None]:
# Show the training rows whose 'Embarked' value is missing
train_data[train_data['Embarked'].isna()]

In [None]:
# Impute missing 'Embarked' values with 'S' (used here as the most frequent
# category) rather than dropping the rows.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# a column selection: that chained in-place form emits a FutureWarning on
# pandas 2.x and does not update the DataFrame when Copy-on-Write is enabled.
train_data['Embarked'] = train_data['Embarked'].fillna('S')

test_data['Embarked'] = test_data['Embarked'].fillna('S')

In [None]:
# Compare the mean and median of 'Age' (shown as a (mean, median) tuple)
train_data['Age'].mean(), train_data['Age'].median()

In [None]:
# Impute missing 'Age' values with the median age of the training set.
# The same training median is applied to the test set so that both splits share
# one imputation value and no statistic is derived from the test data.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# a column selection: that chained in-place form emits a FutureWarning on
# pandas 2.x and does not update the DataFrame when Copy-on-Write is enabled.
age_median = train_data['Age'].median()
train_data['Age'] = train_data['Age'].fillna(age_median)

test_data['Age'] = test_data['Age'].fillna(age_median)

In [None]:
# Show the test rows whose 'Fare' value is missing
test_data[test_data['Fare'].isna()]

In [None]:
# Fill the missing test 'Fare' with the training-set median so the imputation
# value is not derived from the test data.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# a column selection, which does not update the DataFrame under Copy-on-Write.
test_data['Fare'] = test_data['Fare'].fillna(train_data['Fare'].median())

<h2>Create dummies for categorical variables</h2>

In [None]:
# Inspect the column dtypes before encoding the categorical variables
train_data.dtypes

In [None]:
# One-hot encode the categorical features of the training set;
# drop_first=True omits the first level of each encoded feature
train_data = pd.get_dummies(
    train_data,
    columns=['Pclass', 'Sex', 'Embarked'],
    drop_first=True,
)

In [None]:
# One-hot encode the same categorical features of the test set;
# drop_first=True omits the first level of each encoded feature
test_data = pd.get_dummies(
    test_data,
    columns=['Pclass', 'Sex', 'Embarked'],
    drop_first=True,
)

<h2>Build Model</h2>

In [None]:
# Separate the features from the 'Survived' target.
X_train = train_data.drop(columns=['Survived'])
y_train = train_data['Survived']

# The train and test frames are one-hot encoded independently, so their dummy
# columns are not guaranteed to match. Align the test features to the training
# columns (same set, same order); a dummy column absent from the test split is
# filled with 0.
X_test = test_data.reindex(columns=X_train.columns, fill_value=0)

# NOTE(review): 'PassengerId' is still part of the feature matrix. It is kept
# here because the submission cell reads it from X_test; consider excluding it
# from the model inputs.

## Cross Validation with Grid Search

In [None]:
# Candidate hyperparameter values explored by the grid search
grid = dict(
    n_estimators=[100, 200, 400, 500],
    max_features=['sqrt', 'log2'],
    max_depth=[3, 4, 5, 7, 8],
)

In [None]:
# Grid search over `grid` with 5-fold cross-validation, scored by accuracy.
# random_state is fixed so the forests (and therefore the selected parameters
# and the final predictions) are reproducible across Restart & Run All.
rf_cv = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=grid,
    scoring='accuracy',
    n_jobs=4,
    cv=5,
)

In [None]:
# Run the grid search on the training features and target
rf_cv.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the grid search
rf_cv.best_params_

In [None]:
# Keep the estimator fitted with the selected parameters as the final model
rf_cv_final = rf_cv.best_estimator_

In [None]:
# Predict the 'Survived' label for every row of the test features
rf_cv_pred = rf_cv_final.predict(X_test)

In [None]:
# Inspect the predicted labels
rf_cv_pred

In [None]:
# rf_model = RandomForestClassifier(n_estimators = 200)

In [None]:
# rf_model.fit(X_train, y_train)

In [None]:
# rf_model_pred = rf_model.predict(X_test)

In [None]:
# Assemble the submission table: each test passenger id with its predicted label
output = pd.DataFrame(
    {
        'PassengerId': X_test['PassengerId'],
        'Survived': rf_cv_pred,
    }
)

In [None]:
# Write the submission CSV; index=False leaves the DataFrame index out of the file
output.to_csv('/kaggle/working/submission.csv', index=False)

In [None]:
# logreg = LogisticRegression(max_iter=3000)

In [None]:
# logreg.fit(X_train, y_train)

In [None]:
# Y_pred_logreg = logreg.predict(X_train)

In [None]:
# acc_logreg_train = accuracy_score(y_train, Y_pred_logreg)
# acc_logreg_train

In [None]:
# predictions = logreg.predict(X_test)

In [None]:
# output = pd.DataFrame({'PassengerId': X_test.PassengerId, 'Survived': predictions})

In [None]:
# output.to_csv('/kaggle/working/submission.csv', index=False)
# print('Submission successful')