In [498]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder


In [499]:
# Read the data
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [500]:
#Determine if there is any columns with NaN values

missing_cols = df_train.columns[df_train.isna().any()].tolist()
print(missing_cols)

# Almost all the columns have NaN values, so we need to determine the percentage of NaN values in each column

def count_nans(df):
    for col in df.columns:
        print(col, df[col].isna().sum()/len(df[col])*100)

count_nans(df_train)

df_train = df_train[df_train['Cabin'].notna()]

df_train = df_train.drop(["Name"], axis=1)

['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name']
PassengerId 0.0
HomePlanet 2.312205222592891
CryoSleep 2.4962613597147127
Cabin 2.289198205452663
Destination 2.093638559760727
Age 2.0591280340503855
VIP 2.3352122397331185
RoomService 2.082135051190613
FoodCourt 2.105142068330841
ShoppingMall 2.392729782583688
Spa 2.105142068330841
VRDeck 2.1626596111814105
Name 2.300701714022777
Transported 0.0


In [502]:
# Select the target
y = df_train.Transported
# Select the features
X = df_train.drop(['Transported', "PassengerId"], axis=1)

#Divide the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [501]:
# Define the MAE function to compare the performance

def mae(X_train, X_valid, y_train, y_valid):
    model = RandomForestClassifier(n_estimators=50)
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)
    mae_res = mean_absolute_error(y_valid, predictions)
    
    return mae_res

In [503]:
# print(X_train.sample(10))
# print(X_valid.sample(3))

def count_nans(df):
    for col in df.columns:
        print(col, df[col].isna().sum()/len(df[col])*100)

count_nans(X_train)

HomePlanet 2.2222222222222223
CryoSleep 2.6342899190581313
Cabin 0.0
Destination 2.0897718910963947
Age 2.0456217807211186
VIP 2.251655629139073
RoomService 2.0309050772626933
FoodCourt 2.0161883738042676
ShoppingMall 2.369389256806475
Spa 2.0161883738042676
VRDeck 2.1486387049300957


In [504]:
# Configure the column Cabin. \d - any digit
def cabin_conf(input):
    input = str(input)
    cabin_conf_pattern = r"/\d+/"
    output = re.sub(cabin_conf_pattern, " ", input)
    return output

# Now I want to divide it into two different columns 
X_train["Cabin"] = X_train["Cabin"].apply(cabin_conf)
X_valid["Cabin"] = X_valid["Cabin"].apply(cabin_conf)

print(X_train.sample(10))


     HomePlanet CryoSleep Cabin    Destination   Age    VIP  RoomService  \
3906     Europa      True   B/S    TRAPPIST-1e  32.0  False          0.0   
4332      Earth     False   F/S    TRAPPIST-1e  53.0  False         19.0   
7523     Europa      True   A/S    55 Cancri e  37.0  False          0.0   
7645      Earth     False   G/S    TRAPPIST-1e   9.0  False          0.0   
2915     Europa     False   E/S    55 Cancri e  56.0  False          0.0   
1238       Mars     False   F/P            NaN   1.0  False          0.0   
7305      Earth     False   F/S  PSO J318.5-22  27.0  False          0.0   
4659      Earth     False   F/S  PSO J318.5-22  26.0  False          0.0   
4656      Earth      True   G/S    TRAPPIST-1e  61.0  False          0.0   
2328     Europa      True   D/S    TRAPPIST-1e  18.0  False          0.0   

      FoodCourt  ShoppingMall    Spa  VRDeck  
3906        0.0           0.0    0.0     0.0  
4332       81.0          85.0  768.0   489.0  
7523        0.0       

In [505]:

nunique_cnt = X_train.nunique()
print(nunique_cnt)
low_card_cols = nunique_cnt[nunique_cnt < 20].index.tolist()
print("The following columns will be translated into categorical: " + str(low_card_cols))

ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train[low_card_cols] = ordinal_encoder.fit_transform(X_train[low_card_cols])
X_valid[low_card_cols] = ordinal_encoder.transform(X_valid[low_card_cols])

HomePlanet         3
CryoSleep          2
Cabin             15
Destination        3
Age               80
VIP                2
RoomService     1096
FoodCourt       1283
ShoppingMall     968
Spa             1162
VRDeck          1127
dtype: int64
The following columns will be translated into categorical: ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']


In [506]:
# Use imputer for ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP'] columns



In [507]:
print(mae(X_train, X_valid, y_train, y_valid))

ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values