In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier

In [2]:
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
data = data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})
data['Embarked'] = data['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

data.fillna(data.mean(), inplace=True)

data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,2.0
1,1,1,1,38.0,1,0,71.2833,0.0
2,1,3,1,26.0,0,0,7.925,2.0
3,1,1,1,35.0,1,0,53.1,2.0
4,0,3,0,35.0,0,0,8.05,2.0


In [4]:
train_val, test = train_test_split(data, test_size=0.2, random_state=42)

train, val = train_test_split(train_val, test_size=0.25, random_state=42)

In [5]:
X_train = train.drop(columns=['Survived'])
y_train = train['Survived']

X_val = val.drop(columns=['Survived'])
y_val = val['Survived']

X_test = test.drop(columns=['Survived'])
y_test = test['Survived']

In [6]:
models = {
    'Random Forest': (
        RandomForestClassifier(),
        {
            'n_estimators': [i for i in range(40, 101, 10)],
            'max_depth': [i for i in range(1, 11)]
        }
    ),
    
    'XGBoost': (
        XGBClassifier(),
        {
            'n_estimators': [i for i in range(40, 101, 10)],
            'max_depth': [i for i in range(1, 11)]
        }
    ),
    
    'Logistic Regression': (
        LogisticRegression(max_iter=200),
        {
            'C': [0.01, 0.1, 1]
        }
    ),
    
    'KNN': (
        KNeighborsClassifier(),
        {
            'n_neighbors': [i for i in range(1, 41)],
        }
    )
}

In [7]:
model = RandomForestClassifier(max_depth=4, n_estimators=100)
model.fit(X_train, y_train)

importances = model.feature_importances_
features = X_test.columns
indices = np.argsort(importances)

imp_f = sorted([i for i in zip(features[indices], importances[indices])], key=lambda x: x[1])
print(imp_f)

for n_important_features in [2, 4, 7]:
    important_features = [list(reversed(features[indices]))[i] for i in range(n_important_features)]

    print(
        f"\nWith {n_important_features} important features: "
        f"{', '.join(important_features)}."
    )
    
    X_val_nif = X_val[important_features]
    X_train_nif = X_train[important_features]
    X_test_nif = X_test[important_features]
    
    for name, (model, params) in models.items():
        grid_search = GridSearchCV(model, params, cv=3)
        grid_search.fit(X_val_nif, y_val)
        
        best_params = grid_search.best_params_
        
        best_model = model.set_params(**best_params)
        best_model.fit(X_train_nif, y_train)
        
        y_test_pred = best_model.predict(X_test_nif)
        accuracy = accuracy_score(y_test, y_test_pred)
        
        print(f"    {name}:\n {best_params} with accuracy: {accuracy}")
        


[('Embarked', 0.028762229262847768), ('Parch', 0.0329227250981512), ('SibSp', 0.050349009903333075), ('Age', 0.09629616540435772), ('Pclass', 0.141883873603329), ('Fare', 0.1720301156249623), ('Sex', 0.4777558811030192)]

With 2 important features: Sex, Fare.
    Random Forest:
 {'max_depth': 1, 'n_estimators': 60} with accuracy: 0.7486033519553073
    XGBoost:
 {'max_depth': 3, 'n_estimators': 70} with accuracy: 0.7988826815642458
    Logistic Regression:
 {'C': 1} with accuracy: 0.776536312849162
    KNN:
 {'n_neighbors': 5} with accuracy: 0.7206703910614525

With 4 important features: Sex, Fare, Pclass, Age.
    Random Forest:
 {'max_depth': 7, 'n_estimators': 70} with accuracy: 0.8212290502793296
    XGBoost:
 {'max_depth': 2, 'n_estimators': 50} with accuracy: 0.8268156424581006
    Logistic Regression:
 {'C': 1} with accuracy: 0.7988826815642458
    KNN:
 {'n_neighbors': 28} with accuracy: 0.7150837988826816

With 7 important features: Sex, Fare, Pclass, Age, SibSp, Parch, Embark