In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
import numpy as np
import random as rd


#### Loading Titanic Dataset (Train and Test) 

In [None]:
# Read the Titanic CSV files from the working directory: the training split
# first, then the test split.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Preview the training DataFrame (rendered as this cell's output).
train_df

In [None]:
# Preview the test DataFrame (rendered as this cell's output).
test_df

In [None]:
# Per-column count of missing values in the training DataFrame.
train_df.isna().sum()

In [None]:
# Per-column count of missing values in the test DataFrame.
test_df.isna().sum()

#### Handle Missing Values

In [None]:
# Seed Python's RNG so the random Cabin fill below gives the same result on
# every Restart & Run All.
rd.seed(42)

# Drop the free-text columns that are not used as features.
# errors='ignore' keeps this cell re-runnable after the columns are gone.
train_df = train_df.drop(columns=['Name', 'Ticket'], errors='ignore')
test_df = test_df.drop(columns=['Name', 'Ticket'], errors='ignore')

# Age: fill gaps with the most frequent training age. The imputer is fitted on
# the training split only and re-used for the test split; ravel() flattens the
# (n, 1) imputer output to the 1-D shape a single column expects.
imputer = SimpleImputer(strategy='most_frequent')
train_df['Age'] = imputer.fit_transform(train_df[['Age']]).ravel()
test_df['Age'] = imputer.transform(test_df[['Age']]).ravel()

# Cabin: replace each missing entry with a random pick from a fixed list.
# rd.choice already samples uniformly, so no prior shuffle is needed.
# NOTE(review): the filled-in cabins are synthetic, so for rows that had no
# cabin this column carries no real signal -- consider one "Unknown" category.
cabin_train = ['B96', 'B98', 'G6', 'C23', 'C25', 'C27']
train_df['Cabin'] = train_df['Cabin'].apply(
    lambda cabin: rd.choice(cabin_train) if pd.isna(cabin) else cabin
)

cabin_test = ['B57', 'B59', 'B63', 'C66', 'B45', 'C89', 'C55', 'C57', 'A32']
test_df['Cabin'] = test_df['Cabin'].apply(
    lambda cabin: rd.choice(cabin_test) if pd.isna(cabin) else cabin
)

# Embarked: fill any gaps with the most frequent training value so the label
# encoder never sees a mix of strings and NaN.
embarked_mode = train_df['Embarked'].mode(dropna=True)
if not embarked_mode.empty:
    train_df['Embarked'] = train_df['Embarked'].fillna(embarked_mode.iloc[0])
    test_df['Embarked'] = test_df['Embarked'].fillna(embarked_mode.iloc[0])

# Any numeric gaps still left in either split are filled with the training
# median of that column, so no NaN reaches the estimator. Columns without a
# training median (non-numeric ones) are left untouched by fillna.
train_medians = train_df.median(numeric_only=True)
train_df = train_df.fillna(train_medians)
test_df = test_df.fillna(train_medians)

train_df.head()

#### Pre-processing

In [None]:
# Encode each categorical column as integers.
# The encoder is fitted on the TRAINING split only and the resulting
# label -> code mapping is re-used for the test split, so a given label always
# gets the same integer in both frames. (Re-fitting on the test split would
# assign codes from the test vocabulary and silently change their meaning.)
for column in ('Sex', 'Cabin', 'Embarked'):
    label_encoder = LabelEncoder()
    # astype(str) turns any remaining NaN into the literal 'nan', so the
    # encoder always receives uniformly typed labels.
    train_df[column] = label_encoder.fit_transform(train_df[column].astype(str))
    code_by_label = {label: code for code, label in enumerate(label_encoder.classes_)}
    # Test labels never seen during fitting map to the sentinel -1 instead of
    # raising, which LabelEncoder.transform would do for unknown labels.
    test_df[column] = (
        test_df[column].astype(str).map(code_by_label).fillna(-1).astype(int)
    )

In [None]:
# Separate the target column from the predictor columns, then show both.
y = train_df['Survived']
X = train_df.drop(columns=['Survived'])
display(X, y)

In [None]:
def Features_select(train_df, threshold):
    """Return the columns of ``train_df`` that are redundant due to high correlation.

    A column is flagged when the absolute pairwise correlation (as computed by
    ``DataFrame.corr``) between it and any column positioned *before* it in
    ``train_df`` is strictly greater than ``threshold``. The earlier column of
    a correlated pair is never flagged because of that pair, so dropping the
    returned columns keeps one representative per correlated group.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame whose columns are compared; the columns should be numeric so
        that ``DataFrame.corr`` can process them.
    threshold : float
        Absolute-correlation cut-off; only values strictly above it count.

    Returns
    -------
    set
        Labels of the flagged columns (empty if none exceed the threshold).
    """
    # Use the frame passed in by the caller, not a notebook-level global.
    corr_matrix = train_df.corr().abs()
    column_names = corr_matrix.columns
    correlated = set()
    for i in range(len(column_names)):
        # Row i restricted to columns [0, i) is the lower triangle: each pair
        # is examined once. NaN correlations compare as False and are skipped.
        if (corr_matrix.iloc[i, :i] > threshold).any():
            correlated.add(column_names[i])
    return correlated


# Drop the columns that Features_select flags as highly correlated (|r| > 0.5)
# with an earlier column.
Feature = Features_select(X, 0.5)
X = X.drop(columns=list(Feature))

# PassengerId is only used as the row key of the submission file below; as a
# model input it is an arbitrary identifier, so it is excluded from the
# features. errors='ignore' keeps the cell re-runnable.
X = X.drop(columns=['PassengerId'], errors='ignore')

# Build the test matrix from exactly the training columns, in the same order,
# so the fitted model always receives the features it was trained on.
x_test = test_df[X.columns]

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_clf = RandomForestClassifier(random_state=42)

# n_jobs=-1 spreads the 81 candidates x 5 folds over all available cores;
# the fixed random_state keeps the outcome deterministic.
grid_search = GridSearchCV(
    estimator=rf_clf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Hyperparameters for RandomForestClassifier:", best_params)

# GridSearchCV (refit=True by default) has already refitted the best
# configuration on all of X_train, so reuse it instead of training again.
best_rf_clf = grid_search.best_estimator_

# Make predictions on the validation set
y_pred = best_rf_clf.predict(X_val)

# Evaluate the model: fraction of validation rows predicted correctly
accuracy = np.mean(y_pred == y_val)
print(f'Accuracy performance for RandomForestClassifier {accuracy}')

# Make predictions on the test set
test_predictions = best_rf_clf.predict(x_test)

# Create a CSV file of survival predictions, keyed by PassengerId
predicted_survival = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': test_predictions})
predicted_survival.to_csv('Survival_aizabayo.csv', index=False)

In [None]:
predicted_survival       # show the submission DataFrame (PassengerId, Survived) as the cell output