In [41]:
import pandas as pd

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [42]:
train['Name'] = train['Name'].astype(str)
train['Initials'] = train['Name'].str.extract(r'([A-Z][a-z]+\.)')
train['Initials'].unique()

array(['Mr.', 'Mrs.', 'Miss.', 'Master.', 'Don.', 'Rev.', 'Dr.', 'Mme.',
       'Ms.', 'Major.', 'Lady.', 'Sir.', 'Mlle.', 'Col.', 'Capt.',
       'Countess.', 'Jonkheer.'], dtype=object)

In [44]:
test['Name'] = test['Name'].astype(str)
test['Initials'] = test['Name'].str.extract(r'([A-Z][a-z]+\.)')

In [45]:
survive_initials = train.groupby(["Initials", "Survived"]).size().unstack(fill_value=0).reset_index()
survive_initials = survive_initials.rename(columns={0:"no",1:"yes"})
survive_initials["Ratio"] = survive_initials["yes"]/(survive_initials["no"]+survive_initials["yes"])

In [46]:
train = train.merge(survive_initials[['Initials', 'Ratio']], on='Initials', how='left')

In [48]:
test = test.merge(survive_initials[['Initials', 'Ratio']], on='Initials', how='left')

In [47]:
train.drop(["Initials"], axis=1, inplace=True)

In [49]:
test.drop(["Initials"], axis=1, inplace=True)

In [50]:
bin = [0,15,30,45,60,80,100]
label = [6,5,4,3,2,1]
train["AgeGroup"] = pd.cut(train["Age"], bins = bin, labels = label)
test["AgeGroup"] = pd.cut(test["Age"], bins = bin, labels = label)

In [51]:
train = train.drop("Cabin", axis=1)
test = test.drop("Cabin", axis=1)
train = train.drop("Name", axis=1)
test = test.drop("Name", axis=1)


In [52]:
train = train.drop("PassengerId", axis=1)
passenger_id = test["PassengerId"]
test = test.drop("PassengerId", axis=1)

In [53]:
train = train.drop("Ticket", axis=1)
test = test.drop("Ticket", axis=1)

In [54]:
from sklearn.preprocessing import LabelEncoder
label_encoder=LabelEncoder()
train["Sex"] = label_encoder.fit_transform(train["Sex"])
test["Sex"] = label_encoder.fit_transform(test["Sex"])
train["Embarked"] = label_encoder.fit_transform(train["Embarked"])
test["Embarked"] = label_encoder.fit_transform(test["Embarked"])

In [55]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)

def impute_group(group):
    return pd.DataFrame(imputer.fit_transform(group), columns=group.columns, index=group.index)

train_imputed = train.groupby('Pclass').apply(impute_group)
train_imputed = test.groupby('Pclass').apply(impute_group)

  train_imputed = train.groupby('Pclass').apply(impute_group)
  train_imputed = test.groupby('Pclass').apply(impute_group)


In [56]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
X = train.drop("Survived", axis=1)
y = train["Survived"]
classifier = RandomForestClassifier()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print(accuracy_score(y_pred, y_test))
print(roc_auc_score(y_pred, y_test))


0.8659217877094972
0.865210843373494


In [57]:
classifier = RandomForestClassifier()
param_grid = {
    'n_estimators': [50, 100, 150, 200],       # Number of trees in the forest
    'max_depth': [None, 10, 20, 30, 40],       # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],            # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],              # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]                  # Whether bootstrap samples are used when building trees
}

# Set up Grid Search with cross-validation
grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid, 
                           cv=5,  # Number of cross-validation folds
                           n_jobs=-1,  # Number of parallel jobs to run (-1 means using all processors)
                           verbose=2,  # Controls the verbosity of the output
                           scoring='roc_auc')  # Metric to optimize

# Fit Grid Search to the data
grid_search.fit(X, y)

Fitting 5 folds for each of 360 candidates, totalling 1800 fits


In [58]:
best_params = grid_search.best_params_
classifier = RandomForestClassifier(**best_params)

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
y_prob = classifier.predict_proba(X_test)[:, 1]

print(accuracy_score(y_test,y_pred))
print(roc_auc_score(y_test,y_prob))

0.8268156424581006
0.8600953895071541


In [59]:
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(test)

In [60]:
y_pred = pd.Series(y_pred, name="Survived")

In [61]:
to_some_file = pd.concat([passenger_id,pd.Series(y_pred)], axis=1)

In [62]:
to_some_file.to_csv("gender_predictions.csv", index=False)