In [454]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder


In [455]:
# Read the data
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [456]:
#Determine if there is any columns with NaN values

missing_cols = df_train.columns[df_train.isna().any()].tolist()
print(missing_cols)

# Almost all the columns have NaN values, so we need to determine the percentage of NaN values in each column

def count_nans(df):
    for col in df.columns:
        print(col, df[col].isna().sum()/len(df[col])*100)

count_nans(df_train)

df_train = df_train[df_train['Cabin'].notna()]

['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name']
PassengerId 0.0
HomePlanet 2.312205222592891
CryoSleep 2.4962613597147127
Cabin 2.289198205452663
Destination 2.093638559760727
Age 2.0591280340503855
VIP 2.3352122397331185
RoomService 2.082135051190613
FoodCourt 2.105142068330841
ShoppingMall 2.392729782583688
Spa 2.105142068330841
VRDeck 2.1626596111814105
Name 2.300701714022777
Transported 0.0


In [457]:
# Define the MAE function to compare the performance

def mae(X_train, X_valid, y_train, y_valid):
    model = RandomForestClassifier(n_estimators=50)
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)
    mae_res = mean_absolute_error(y_valid, predictions)
    
    return mae_res

In [458]:
# Select the target
y = df_train.Transported
# Select the features
X = df_train.drop(['Transported'], axis=1)

#Divide the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [459]:
# print(X_train.sample(10))
# print(X_valid.sample(3))

def count_nans(df):
    for col in df.columns:
        print(col, df[col].isna().sum()/len(df[col])*100)

count_nans(X_train)

PassengerId 0.0
HomePlanet 2.2222222222222223
CryoSleep 2.6342899190581313
Cabin 0.0
Destination 2.0897718910963947
Age 2.0456217807211186
VIP 2.251655629139073
RoomService 2.0309050772626933
FoodCourt 2.0161883738042676
ShoppingMall 2.369389256806475
Spa 2.0161883738042676
VRDeck 2.1486387049300957
Name 2.384105960264901


In [460]:
# Configure the column Cabin. \d - any digit
def cabin_conf(input):
    input = str(input)
    cabin_conf_pattern = r"/\d+/"
    output = re.sub(cabin_conf_pattern, "/", input)
    return output

X_train["Cabin"] = X_train["Cabin"].apply(cabin_conf)
X_valid["Cabin"] = X_valid["Cabin"].apply(cabin_conf)

print(X_train.sample(10))


     PassengerId HomePlanet CryoSleep Cabin    Destination   Age    VIP  \
2656     2840_01      Earth     False   F/S    55 Cancri e  44.0  False   
7978     8536_01     Europa     False   E/S    55 Cancri e  28.0  False   
2417     2594_01     Europa      True   B/S    TRAPPIST-1e  34.0  False   
7351     7865_02     Europa      True   C/S    TRAPPIST-1e  32.0  False   
8108     8661_01      Earth     False   F/S    TRAPPIST-1e  19.0  False   
5841     6180_01      Earth      True   G/S  PSO J318.5-22  16.0  False   
6360     6727_01      Earth     False   G/P    TRAPPIST-1e  29.0  False   
8281     8843_01     Europa     False   C/S    TRAPPIST-1e  31.0  False   
1758     1866_01      Earth     False   F/P    TRAPPIST-1e  29.0  False   
5860     6206_01      Earth     False   F/S    TRAPPIST-1e  39.0  False   

      RoomService  FoodCourt  ShoppingMall     Spa   VRDeck  \
2656        672.0        0.0           0.0     0.0     20.0   
7978          0.0      101.0           0.0     0

In [461]:

nunique_cnt = X_train.nunique()
print(nunique_cnt)
low_card_cols = nunique_cnt[nunique_cnt < 20].index.tolist()
print("The following columns will be translated into categorical: " + str(low_card_cols))

ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train[low_card_cols] = ordinal_encoder.fit_transform(X_train[low_card_cols])
X_valid[low_card_cols] = ordinal_encoder.transform(X_valid[low_card_cols])

PassengerId     6795
HomePlanet         3
CryoSleep          2
Cabin             15
Destination        3
Age               80
VIP                2
RoomService     1096
FoodCourt       1283
ShoppingMall     968
Spa             1162
VRDeck          1127
Name            6619
dtype: int64
The following columns will be translated into categorical: ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']


In [462]:
print(mae(X_train, X_valid, y_train, y_valid))

ValueError: could not convert string to float: 'Lis Delazarson'