In [158]:
import pandas as pd
import warnings
import numpy as np
warnings.filterwarnings('ignore')
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [159]:
train['Name'] = train['Name'].astype(str)
train['Initials'] = train['Name'].str.extract(r'([A-Z][a-z]+\.)')

In [95]:
test['Name'] = test['Name'].astype(str)
test['Initials'] = test['Name'].str.extract(r'([A-Z][a-z]+\.)')

In [96]:
survive_initials = train.groupby(["Initials", "Survived"]).size().unstack(fill_value=0).reset_index()
survive_initials = survive_initials.rename(columns={0:"no",1:"yes"})
survive_initials["Ratio"] = survive_initials["yes"]/(survive_initials["no"]+survive_initials["yes"])

In [97]:
train = train.merge(survive_initials[['Initials', 'Ratio']], on='Initials', how='left')

In [98]:
test = test.merge(survive_initials[['Initials', 'Ratio']], on='Initials', how='left')

In [99]:
train.drop(["Initials"], axis=1, inplace=True)

In [100]:
test.drop(["Initials"], axis=1, inplace=True)

In [101]:
bin = [0,15,30,45,60,80,100]
label = [6,5,4,3,2,1]
train["AgeGroup"] = pd.cut(train["Age"], bins = bin, labels = label)
test["AgeGroup"] = pd.cut(test["Age"], bins = bin, labels = label)

In [102]:
train = train.drop(["Cabin", "Name", "Ticket"], axis=1)
test = test.drop(["Cabin", "Name", "Ticket"], axis=1)

In [103]:
train = train.drop("PassengerId", axis=1)
passenger_id = test["PassengerId"]
test = test.drop("PassengerId", axis=1)

In [104]:
from sklearn.preprocessing import LabelEncoder
label_encoder=LabelEncoder()
label_encoder.fit(train["Sex"])
train["Sex"] = label_encoder.transform(train["Sex"])
test["Sex"] = label_encoder.transform(test["Sex"])

In [105]:
target = train["Survived"]
train = train.drop("Survived", axis=1)

In [106]:
def label_encoder(val):
    if pd.isna(val):
        return np.nan
    elif val in mapping:
        return mapping[val]
    else:
        return max
def mapper(df1, variable, mapping, count):
  for i in df1.loc[:,variable].unique():
    mapping[i] = count
    count += 1
  return (mapping, count)

In [107]:
mapping = {}
count = 0
mapping, count = mapper(train, "Embarked", mapping, count)
train.loc[:,"Embarked"] = train.loc[:,"Embarked"].apply(label_encoder)
test.loc[:,"Embarked"] = test.loc[:,"Embarked"].apply(label_encoder)

In [111]:
from sklearn.impute import KNNImputer

for i in test.loc[:,"Pclass"].unique():
  train_temp = train.loc[train.loc[:,"Pclass"] == i]
  test_temp = test.loc[test.loc[:,"Pclass"] == i]
  imputer = KNNImputer(n_neighbors=5)
  imputer.fit(train_temp)
  globals()[f"{i}_train"] = pd.DataFrame(imputer.transform(train_temp), columns=train_temp.columns, index=train_temp.index)
  globals()[f"{i}_test"] = pd.DataFrame(imputer.transform(test_temp), columns=test_temp.columns, index=test_temp.index)

In [148]:
df_train = pd.DataFrame(columns=train_temp.columns)
df_test = pd.DataFrame(columns=train_temp.columns)
for i in range(1,4):
  df_train = pd.concat([df_train,globals()[f"{i}_train"]])
  df_test = pd.concat([df_test,globals()[f"{i}_test"]])

In [122]:
df_train = df_train.sort_index()
df_test = df_test.sort_index()

In [149]:
df_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Ratio,AgeGroup
11,1.0,1.0,46.00,0.0,0.0,26.0000,0.0,0.156673,3.0
12,1.0,0.0,23.00,1.0,0.0,82.2667,0.0,0.792000,5.0
14,1.0,0.0,47.00,1.0,0.0,61.1750,0.0,0.792000,3.0
20,1.0,1.0,55.00,1.0,0.0,59.4000,1.0,0.156673,3.0
22,1.0,0.0,45.00,0.0,0.0,31.6833,0.0,0.792000,3.6
...,...,...,...,...,...,...,...,...,...
412,3.0,0.0,28.00,0.0,0.0,7.7750,0.0,0.697802,5.0
413,3.0,1.0,34.00,0.0,0.0,8.0500,0.0,0.156673,4.2
415,3.0,1.0,38.50,0.0,0.0,7.2500,0.0,0.156673,4.0
416,3.0,1.0,34.00,0.0,0.0,8.0500,0.0,0.156673,4.2


In [126]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
classifier = RandomForestClassifier()
X_train, X_test, y_train, y_test = train_test_split(df_train, target, test_size= 0.2)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print(accuracy_score(y_pred, y_test))
print(roc_auc_score(y_pred, y_test))


0.8491620111731844
0.8521721641190668


In [136]:
classifier = RandomForestClassifier()

param_grid = {'n_estimators': np.arange(200, 350, 50),
    'max_features': ['sqrt', 'log2', None],
    'max_depth': np.arange(35, 45, 10),
    'min_samples_split': np.arange(7, 10),
    'min_samples_leaf': np.arange(7, 10),
    'bootstrap': [True, False]
}

# Set up Grid Search with cross-validation
grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid,
                           cv=6,  # Number of cross-validation folds
                           n_jobs=-1,  # Number of parallel jobs to run (-1 means using all processors)
                           verbose=2,
                           scoring='roc_auc')  # Metric to optimize

# Fit Grid Search to the data
grid_search.fit(df_train, target)

Fitting 6 folds for each of 108 candidates, totalling 648 fits


In [137]:
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'bootstrap': True, 'max_depth': 35, 'max_features': 'sqrt', 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 300}


In [138]:
best_model = grid_search.best_estimator_

y_pred = grid_search.predict(X_test)
y_prob = grid_search.predict_proba(X_test)[:, 1]

print(accuracy_score(y_test,y_pred))
print(roc_auc_score(y_test,y_prob))

0.888268156424581
0.9489102564102565


In [152]:
best_rf = grid_search.best_estimator_
y_pred_test = best_rf.predict(df_test)

In [153]:
y_pred_test = pd.Series(y_pred_test, name="Survived")

In [154]:
to_some_file = pd.concat([passenger_id,y_pred_test], axis=1)

In [155]:
to_some_file.to_csv("submission.csv", index=False)