# Exercise 2b: Feature engineering

In [1]:
import warnings
# Ignore every FutureWarning raised from here on.
# NOTE(review): this also hides deprecation notices emitted by the libraries
# used below — confirm that silencing them is intended.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Read the four CSV files (paths are relative to the working directory).
# X_* are the feature frames; y_* are used as the classification targets below.
X_train = pd.read_csv("ex2_train.csv")
y_train = pd.read_csv("ex2_class_train.csv")
X_test = pd.read_csv("ex2_test.csv")
y_test = pd.read_csv("ex2_class_test.csv")

In [4]:
# define a utility function to print out the prediction performance
def evaluate_result(y_te, y_pr, x_text, clf_arg):
    """Print accuracy, precision, recall, F1-score and AUC-ROC, one per line.

    Parameters
    ----------
    y_te : true labels.
    y_pr : predicted labels, compared against ``y_te`` for the first four metrics.
    x_text : feature matrix passed to ``clf_arg.predict_proba`` for the AUC-ROC.
    clf_arg : fitted classifier exposing ``predict_proba``.

    Returns
    -------
    None. The metrics are only printed, each formatted to four decimals.
    """
    label_based_metrics = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1-score', f1_score),
    )
    # Compute and print one metric at a time, in the listed order.
    for label, metric in label_based_metrics:
        print(f'{label}: {metric(y_te, y_pr):.4f}')

    # AUC-ROC is computed from the second column of predict_proba rather than
    # from the hard predictions.
    second_column_proba = clf_arg.predict_proba(x_text)[:, 1]
    print(f'AUC-ROC: {roc_auc_score(y_te, second_column_proba):.4f}')

## Prototyping (without feature engineering)

In [5]:
def preprocess(data_in):
    """Baseline preprocessing: drop ``Name``, impute missing values, one-hot encode.

    Parameters
    ----------
    data_in : pandas.DataFrame
        Frame containing at least the ``Name``, ``Age``, ``Embarked``, ``Fare``
        and ``Sex`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame without ``Name``. Missing ``Age`` and ``Fare`` values are
        replaced by the column median and missing ``Embarked`` values by the
        most frequent value, all computed from ``data_in`` itself. ``Sex`` and
        ``Embarked`` are expanded into indicator columns with the first level
        of each dropped.
    """
    # drop() returns a new frame, so the caller's data is left untouched.
    data = data_in.drop(columns=['Name'])

    # Assign the filled column back instead of calling fillna(inplace=True) on a
    # column selection: that chained in-place form is deprecated and leaves the
    # frame unchanged when pandas Copy-on-Write is active.
    data['Age'] = data['Age'].fillna(data['Age'].median())
    data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
    data['Fare'] = data['Fare'].fillna(data['Fare'].median())

    # Convert categorical variables to dummy/indicator variables
    data = pd.get_dummies(data, columns=['Sex', 'Embarked'], drop_first=True)
    return data

In [6]:
# Baseline: run the simple preprocessing on each split independently.
# NOTE(review): the indicator columns are derived per split, so this assumes
# both splits contain the same category levels — confirm the column sets match.
X_train_processed = preprocess(X_train)
X_test_processed = preprocess(X_test)

# random_state is fixed so repeated runs build the same forest.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
# ravel() flattens the label frame to 1-D (presumably it has a single column — verify).
clf.fit(X_train_processed, y_train.values.ravel())
y_pred = clf.predict(X_test_processed)

print('Random Forest Model without Feature Engineering')
evaluate_result(y_test, y_pred, X_test_processed, clf)

Random Forest Model without Feature Engineering
Accuracy: 0.8101
Precision: 0.7778
Recall: 0.7568
F1-score: 0.7671
AUC-ROC: 0.8732


## Feature engineering

Classification on the simply preprocessed data gives only mediocre performance.

**TODO: Use the insights from your EDA (ex2a) to complete the feature engineering function below.** Later, this function will replace the simple preprocessing.

You will pass the exercise if your feature engineering improves the performance (i.e., it beats the baseline model in three or more of the five metrics).

In [7]:
# Module-level scaler shared by all normalize() calls that do not pass their own.
scaler = StandardScaler()


def normalize(data_in, scaler=scaler, fit=True):
    """Return a copy of ``data_in`` with ``Age`` and ``Fare`` rescaled by ``scaler``.

    Parameters
    ----------
    data_in : pandas.DataFrame
        Frame containing ``Age`` and ``Fare`` columns. It is not modified.
    scaler : scaler object, optional
        Must expose ``fit_transform`` and ``transform``. Defaults to the
        module-level ``StandardScaler`` instance bound at definition time.
    fit : bool, optional
        If True, fit ``scaler`` on ``data_in`` and transform it. If False,
        only transform, reusing whatever the scaler was fitted on earlier.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data_in`` whose ``Age`` and ``Fare`` columns hold the
        scaler's output.
    """
    scaled_columns = ['Age', 'Fare']
    result = data_in.copy()
    # Choose the bound method once: fitting also updates the scaler's state.
    rescale = scaler.fit_transform if fit else scaler.transform
    result[scaled_columns] = rescale(result[scaled_columns])
    return result


def feature_engineering(data_in, fit_scaler=True):
    """Build the model features: title grouping, imputation, one-hot encoding, scaling.

    Parameters
    ----------
    data_in : pandas.DataFrame
        Frame containing at least the ``Name``, ``Age``, ``Embarked``, ``Fare``,
        ``Sex`` and ``Pclass`` columns. It is not modified.
    fit_scaler : bool, optional
        Forwarded to ``normalize`` as ``fit``. Pass True to fit the shared
        scaler on this frame and False to reuse the previous fit, so a call
        with True must come first.

    Returns
    -------
    pandas.DataFrame
        New frame without ``Name``, with a ``Title`` feature derived from it.
        Missing ``Age``/``Fare``/``Embarked`` values are filled, ``Sex``,
        ``Embarked``, ``Title`` and ``Pclass`` become indicator columns (first
        level of each dropped), and ``Age``/``Fare`` are rescaled.

    Notes
    -----
    The imputation values (median / mode) are computed from ``data_in`` itself
    on every call; only the scaler carries state between calls.
    """
    df = data_in.copy()

    # Take the text between the comma and the next period in Name as the title,
    # then collapse it into two groups. Names with no match give NaN, which is
    # not in the list and therefore ends up as 'special'.
    common_titles = ["Mr", "Mrs", "Miss"]
    titles = df['Name'].str.extract(r',\s*([^\.]+)\.', expand=False)
    df['Title'] = titles.isin(common_titles).map({True: 'common', False: 'special'})

    df = df.drop(columns=['Name'])

    # Assign the filled column back instead of calling fillna(inplace=True) on a
    # column selection: that chained in-place form is deprecated and leaves the
    # frame unchanged when pandas Copy-on-Write is active.
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # Pclass is encoded as a categorical rather than kept as a number.
    df = pd.get_dummies(df, columns=['Sex', 'Embarked', 'Title', 'Pclass'], drop_first=True)

    df = normalize(df, fit=fit_scaler)
    return df

In [8]:
# Fit the shared scaler on the training split, then reuse that fit for the test split.
# NOTE(review): the indicator columns are still derived per split, so this assumes
# both splits contain the same category levels — confirm the column sets match.
X_train_processed2 = feature_engineering(X_train, fit_scaler=True)
X_test_processed2 = feature_engineering(X_test, fit_scaler=False)

# Same model settings as the baseline, so only the features differ.
clf2 = RandomForestClassifier(n_estimators=100, random_state=42)
clf2.fit(X_train_processed2, y_train.values.ravel())
y_pred2 = clf2.predict(X_test_processed2)

print('Random Forest Model with Feature Engineering')
evaluate_result(y_test, y_pred2, X_test_processed2, clf2)

Random Forest Model with Feature Engineering
Accuracy: 0.8268
Precision: 0.7867
Recall: 0.7973
F1-score: 0.7919
AUC-ROC: 0.8729


In [9]:
# Re-print the baseline model's metrics for comparison with the feature-engineered model.
evaluate_result(y_test, y_pred, X_test_processed, clf)

Accuracy: 0.8101
Precision: 0.7778
Recall: 0.7568
F1-score: 0.7671
AUC-ROC: 0.8732


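The feature-engineered model wins on 4 out of 5 metrics (accuracy, precision, recall, and F1-score); only AUC-ROC is marginally lower (0.8729 vs. 0.8732).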