In [200]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder


In [201]:
# Read the data
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [202]:
#Determine if there is any columns with NaN values

missing_cols = df_train.columns[df_train.isna().any()].tolist()
print(missing_cols)

# Almost all the columns have NaN values, so we need to determine the percentage of NaN values in each column

def count_nans(df):
    for col in df.columns:
        print(col, df[col].isna().sum()/len(df[col])*100)

count_nans(df_train)

['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name']
PassengerId 0.0
HomePlanet 2.312205222592891
CryoSleep 2.4962613597147127
Cabin 2.289198205452663
Destination 2.093638559760727
Age 2.0591280340503855
VIP 2.3352122397331185
RoomService 2.082135051190613
FoodCourt 2.105142068330841
ShoppingMall 2.392729782583688
Spa 2.105142068330841
VRDeck 2.1626596111814105
Name 2.300701714022777
Transported 0.0


In [203]:
# Define the MAE function to compare the performance

def mae(X_train, X_valid, y_train, y_valid):
    model = RandomForestClassifier(n_estimators=50)
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)
    mae_res = mean_absolute_error(y_valid, predictions)
    
    return mae_res

In [204]:
# Select the target
y = df_train.Transported
# Select the features
X = df_train.drop(['Transported'], axis=1)

#Divide the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [205]:
print(X_train.sample(3))
print(X_valid.sample(3))

     PassengerId HomePlanet CryoSleep     Cabin  Destination   Age    VIP  \
5126     5475_01      Earth     False  F/1134/P  TRAPPIST-1e  43.0  False   
1974     2110_01      Earth      True   G/339/P  TRAPPIST-1e  18.0  False   
4966     5300_01      Earth      True   G/864/S  55 Cancri e  51.0  False   

      RoomService  FoodCourt  ShoppingMall  Spa  VRDeck                Name  
5126          0.0        0.0           1.0  NaN     0.0   Stelle Lowelliott  
1974          0.0        0.0           0.0  0.0     0.0  Ernice Joynewtonks  
4966          0.0        0.0           0.0  0.0     0.0   Healle Klindsayer  
     PassengerId HomePlanet CryoSleep    Cabin  Destination   Age    VIP  \
2615     2798_02     Europa      True   A/23/P  55 Cancri e  38.0  False   
3651     3924_01      Earth     False  E/263/S  55 Cancri e  19.0  False   
700      0733_01      Earth     False  F/153/P  55 Cancri e  23.0  False   

      RoomService  FoodCourt  ShoppingMall    Spa  VRDeck               Na

In [206]:

nunique_cnt = X_train.nunique()
low_card_cols = nunique_cnt[nunique_cnt < 10].index.tolist()
print("The following columns will be translated into categorical: " + str(low_card_cols))

ordinal_encoder = OrdinalEncoder()
X_train[low_card_cols] = ordinal_encoder.fit_transform(X_train[low_card_cols])
X_valid[low_card_cols] = ordinal_encoder.transform(X_valid[low_card_cols])

The following columns will be translated into categorical: ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']


In [207]:
print(mae(X_train, X_valid, y_train, y_valid))

ValueError: could not convert string to float: 'C/167/S'