In [143]:
!pip install category_encoders==2.*

Collecting category_encoders==2.*
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [156]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn imports
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from category_encoders import OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from category_encoders import OneHotEncoder

In [9]:
# Data paths via Github
train_path = 'https://raw.githubusercontent.com/attyfinch/spaceship_titanic_predictive_model_kaggle/main/spaceship-titanic/train.csv'
test_path = 'https://raw.githubusercontent.com/attyfinch/spaceship_titanic_predictive_model_kaggle/main/spaceship-titanic/test.csv'

In [140]:
def wrangle(path):
    df = pd.read_csv(path)

    # Fill blanks with NaN values
    df.replace(r'^\s*$', np.nan, regex=True)

    # Rename columns
    df.rename(columns={'PassengerId': 'id',
                       'HomePlanet': 'home_planet',
                       'CryoSleep': 'cryosleep',
                       'Cabin': 'cabin',
                       'Destination': 'destination',
                       'Age': 'age',
                       'VIP': 'vip',
                       'RoomService': 'room_service',
                       'FoodCourt': 'food_court',
                       'ShoppingMall': 'shopping_mall',
                       'Spa': 'spa',
                       'VRDeck': 'vr_deck',
                       'Transported': 'transported',
                      }, inplace=True)

    # Replacing NaN values
    df['home_planet'].fillna(df['home_planet'].mode()[0], inplace=True)
    df['cryosleep'].fillna(df['cryosleep'].mode()[0], inplace=True)
    df['cabin'].fillna(df['cabin'].mode()[0], inplace=True)
    df['destination'].fillna(df['destination'].mode()[0], inplace=True)
    df['age'].fillna(df['age'].median(), inplace=True)
    df['vip'].fillna(df['vip'].mode()[0], inplace=True)
    df['room_service'].fillna(df['room_service'].mode()[0], inplace=True)
    df['food_court'].fillna(df['food_court'].mode()[0], inplace=True)
    df['shopping_mall'].fillna(df['shopping_mall'].mode()[0], inplace=True)
    df['spa'].fillna(df['spa'].mode()[0], inplace=True)
    df['vr_deck'].fillna(df['vr_deck'].mode()[0], inplace=True)

    # Turn Bool columns into binary values
    df['cryosleep'] = df['cryosleep'].astype(int)
    df['vip'] = df['vip'].astype(int)
    df['transported'] = df['transported'].astype(int)

    # Drop columns due to various reasons, like HCCC
    df.drop(columns=['Name'], inplace=True)

    # home_to_destination feature
    df['home_to_destination'] = df['home_planet'] + ' ' + df['destination']
    df.drop(columns=['home_planet', 'destination'], inplace=True)

    # deck, num, side features
    df['deck'] = df['cabin'].str.split('/', expand=True)[0]
    df['num'] = df['cabin'].str.split('/', expand=True)[1]
    df['side'] = df['cabin'].str.split('/', expand=True)[2]
    df.drop(columns=['cabin'], inplace=True)

    # used ammenities
    # Note: I may want to apply a spend threshold to used ammenities
    # I may also want to break ammenities into sub groups in a future iteration
    df['ammenities_spend'] = df['room_service'] + df['food_court'] + df['shopping_mall'] + df['spa'] + df['vr_deck']
    df['used_ammenities'] = df['ammenities_spend'] > 0
    df['used_ammenities'] = df['used_ammenities'].astype(int)
    df.drop(columns=['room_service', 'food_court', 'shopping_mall', 'spa', 'vr_deck'], inplace=True)

    # group number feature
    df['group_number'] = df['id'].str.split('_', expand=True)[0]
    df['group_number'] = df['group_number'].astype(int)

    group_df = (
        df.groupby(by='group_number')
        .agg(group_size = ('group_number', 'count'))
        .reset_index()
        )

    # Witholding this feature until I get to tuning stage
    # group_df.loc[group_df['group_size'] == 1, 'group_type'] = 'solo'
    # group_df.loc[group_df['group_size'] == 2, 'group_type'] = 'couple'
    # group_df.loc[group_df['group_size'] >= 3, 'group_type'] = 'family'

    # inserting group size feature into df
    df = df.merge(group_df, on='group_number', how='left')
    df.drop(columns=['group_number'], inplace=True)

    df.set_index('id', inplace=True)
    return df

df = wrangle(train_path)
df.head()

Unnamed: 0_level_0,cryosleep,age,vip,transported,home_to_destination,deck,num,side,ammenities_spend,used_ammenities,group_size
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0001_01,0,39.0,0,0,Europa TRAPPIST-1e,B,0,P,0.0,0,1
0002_01,0,24.0,0,1,Earth TRAPPIST-1e,F,0,S,736.0,1,1
0003_01,0,58.0,1,0,Europa TRAPPIST-1e,A,0,S,10383.0,1,2
0003_02,0,33.0,0,0,Europa TRAPPIST-1e,A,0,S,5176.0,1,2
0004_01,0,16.0,0,1,Earth TRAPPIST-1e,F,1,S,1091.0,1,1


### Feature Engineering Notes
Some possible updates to my current wrangle function:
- Break ammenities into 2-3 categories
- Add spend threshold to ammenities_used
- Elinate num column
- Add age brackets, and tune grouping numbers
- Split home_to_destiation back into separate cats
- Narrow group size to fewer categories

# Split Data

In [146]:
target = 'transported'
X = df.drop(columns=[target])
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline

In [147]:
baseline_acc = y_train.value_counts(normalize=True).max()
print(f'Baseline Accuracy: {baseline_acc}')

Baseline Accuracy: 0.5033074489502445


# Build Models

In [158]:
log_model = make_pipeline(
    OneHotEncoder(),
    StandardScaler(),
    LogisticRegression(max_iter=1000)
)

log_model.fit(X_train, y_train)

dt_model = make_pipeline(
    OneHotEncoder(),
    StandardScaler(),
    DecisionTreeClassifier()
)

dt_model.fit(X_train, y_train)

rf_model = make_pipeline(
    OneHotEncoder(),
    StandardScaler(),
    RandomForestClassifier()
)

rf_model.fit(X_train, y_train)

# Check Metrics

In [161]:
train_acc = log_model.score(X_train, y_train)
test_acc = log_model.score(X_test, y_test)
print(f'Log Train Accuracy: {train_acc}')
print(f'Log Test Accuracy: {test_acc}')
print()
train_acc = dt_model.score(X_train, y_train)
test_acc = dt_model.score(X_test, y_test)
print(f'Decision Tree Train Accuracy: {train_acc}')
print(f'Decision Tree Test Accuracy: {test_acc}')
print()
train_acc = rf_model.score(X_train, y_train)
test_acc = rf_model.score(X_test, y_test)
print(f'RandomForest Train Accuracy: {train_acc}')
print(f'RandomForest Test Accuracy: {test_acc}')

Log Train Accuracy: 0.818521714121369
Log Test Accuracy: 0.6871765382403681

Decision Tree Train Accuracy: 0.9995685936151855
Decision Tree Test Accuracy: 0.7084531339850488

RandomForest Train Accuracy: 0.9995685936151855
RandomForest Test Accuracy: 0.7377803335250144
