In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('train.csv')

df.drop(columns=['Name','Ticket','Cabin'], inplace=True)

ordinal_label = {k: i for i, k in enumerate(df['Sex'].unique(), 0)}
df['Sex'] = df['Sex'].map(ordinal_label)

ordinal_label = {k: i for i, k in enumerate(df['Embarked'].unique(), 0)}
df['Embarked'] = df['Embarked'].map(ordinal_label)

df['Embarked'].fillna('S')
df.dropna(inplace=True)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,0,22.0,1,0,7.25,0
1,2,1,1,1,38.0,1,0,71.2833,1
2,3,1,3,1,26.0,0,0,7.925,0
3,4,1,1,1,35.0,1,0,53.1,0
4,5,0,3,0,35.0,0,0,8.05,0


In [3]:
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [4]:
X = df.drop(columns='Survived')
y = df['Survived']

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)
y_train = y_train.ravel()
y_test = y_test.ravel()

In [6]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier

In [8]:
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
n_features = 4

In [9]:
# Build step forward feature selection
sfs1 = SequentialFeatureSelector(clf,
           n_features_to_select=n_features,
           direction='backward',
           scoring='accuracy',
           cv=5)

In [10]:
# Perform SFFS
sfs1 = sfs1.fit(X_train, y_train)

In [11]:
# Which features?

feat_cols = list(X_train.columns[sfs1.get_support()])
print(feat_cols)

['PassengerId', 'Pclass', 'Sex', 'Age']


In [12]:
#Build full model with selected features
clf = RandomForestClassifier(n_estimators=1000, random_state=42, max_depth=4)
clf.fit(X_train[feat_cols], y_train)

y_train_pred = clf.predict(X_train[feat_cols])
print("Training accuracy is {}".format(accuracy_score(y_train, y_train_pred)))

y_test_pred = clf.predict(X_test[feat_cols])
print("Training accuracy is {}".format(accuracy_score(y_test, y_test_pred)))

Training accuracy is 0.843687374749499
Training accuracy is 0.7906976744186046


In [13]:
#Build full model on ALL features, for comparison
clf = RandomForestClassifier(n_estimators=1000, random_state=42, max_depth=4)
clf.fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
print("Training accuracy is {}".format(accuracy_score(y_train, y_train_pred)))

y_test_pred = clf.predict(X_test)
print("Testing accuracy is {}".format(accuracy_score(y_test, y_test_pred)))

Training accuracy is 0.8677354709418837
Testing accuracy is 0.7813953488372093
