In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer

from xgboost import XGBRegressor


In [2]:
# Read the data
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')


In [3]:
#Determine if there is any columns with NaN values

missing_cols = df_train.columns[df_train.isna().any()].tolist()
print(missing_cols)

# Almost all the columns have NaN values, so we need to determine the percentage of NaN values in each column

def count_nans(df):
    for col in df.columns:
        print(col, df[col].isna().sum()/len(df[col])*100)

count_nans(df_train)

df_train = df_train[df_train['Cabin'].notna()]
df_train["Transported"] = df_train["Transported"].astype(int)

df_train = df_train.drop(["Name"], axis=1)

X_test = df_test.copy()
X_test = X_test.drop(["PassengerId"], axis=1)
X_test = X_test.drop(["Name"], axis=1)

['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name']
PassengerId 0.0
HomePlanet 2.312205222592891
CryoSleep 2.4962613597147127
Cabin 2.289198205452663
Destination 2.093638559760727
Age 2.0591280340503855
VIP 2.3352122397331185
RoomService 2.082135051190613
FoodCourt 2.105142068330841
ShoppingMall 2.392729782583688
Spa 2.105142068330841
VRDeck 2.1626596111814105
Name 2.300701714022777
Transported 0.0


In [4]:
# Select the target
y = df_train.Transported
# Select the features
X = df_train.drop(['Transported', "PassengerId"], axis=1)

#Divide the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [5]:
# Define the MAE function to compare the performance

def mae(X_train, X_valid, y_train, y_valid, model):
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, predictions)
    
    return accuracy

In [6]:
# print(X_train.sample(10))
# print(X_valid.sample(3))

def count_nans(df):
    for col in df.columns:
        print(col, df[col].isna().sum()/len(df[col])*100)

count_nans(X_train)

HomePlanet 2.2222222222222223
CryoSleep 2.6342899190581313
Cabin 0.0
Destination 2.0897718910963947
Age 2.0456217807211186
VIP 2.251655629139073
RoomService 2.0309050772626933
FoodCourt 2.0161883738042676
ShoppingMall 2.369389256806475
Spa 2.0161883738042676
VRDeck 2.1486387049300957


In [7]:
# Configure the column Cabin. \d - any digit
def cabin_conf(input):
    input = str(input)
    cabin_conf_pattern = r"/\d+/"
    output = re.sub(cabin_conf_pattern, "/", input)
    return output

# Now I want to divide it into two different columns 
X_train["Cabin"] = X_train["Cabin"].apply(cabin_conf)
X_valid["Cabin"] = X_valid["Cabin"].apply(cabin_conf)
X_test["Cabin"] = X_test["Cabin"].apply(cabin_conf)


X_train[['Cabin_Deck', 'Cabin_Side']] = X_train['Cabin'].str.split('/', expand=True)
X_valid[['Cabin_Deck', 'Cabin_Side']] = X_valid['Cabin'].str.split('/', expand=True)
X_test[['Cabin_Deck', 'Cabin_Side']] = X_test['Cabin'].str.split('/', expand=True)

X_train.drop(["Cabin"], axis=1)
X_valid.drop(["Cabin"], axis=1)
X_test.drop(["Cabin"], axis=1)


print(X_train.sample(10))


     HomePlanet CryoSleep Cabin    Destination   Age    VIP  RoomService  \
1686       Mars     False   F/P    TRAPPIST-1e  73.0  False         27.0   
3134      Earth     False   F/P    TRAPPIST-1e  18.0  False          1.0   
519      Europa     False   C/P    TRAPPIST-1e  33.0  False          0.0   
7865      Earth      True   G/P  PSO J318.5-22  14.0  False          0.0   
1149      Earth     False   F/P    TRAPPIST-1e  51.0  False          0.0   
1596      Earth      True   G/S  PSO J318.5-22  31.0  False          0.0   
334       Earth     False   G/S    TRAPPIST-1e  22.0  False        116.0   
4240       Mars     False   D/S    TRAPPIST-1e  13.0  False         27.0   
501         NaN      True   C/P    55 Cancri e  30.0  False          0.0   
7702      Earth       NaN   G/P    TRAPPIST-1e  18.0  False          3.0   

      FoodCourt  ShoppingMall     Spa  VRDeck Cabin_Deck Cabin_Side  
1686        2.0          71.0  1208.0     0.0          F          P  
3134       56.0        

In [8]:

nunique_cnt = X_train.nunique()
print(nunique_cnt)
low_card_cols = nunique_cnt[nunique_cnt < 20].index.tolist()
print("The following columns will be translated into categorical: " + str(low_card_cols))

ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train[low_card_cols] = ordinal_encoder.fit_transform(X_train[low_card_cols])
X_valid[low_card_cols] = ordinal_encoder.transform(X_valid[low_card_cols])

X_test[low_card_cols] = ordinal_encoder.fit_transform(X_test[low_card_cols])

HomePlanet         3
CryoSleep          2
Cabin             15
Destination        3
Age               80
VIP                2
RoomService     1096
FoodCourt       1283
ShoppingMall     968
Spa             1162
VRDeck          1127
Cabin_Deck         8
Cabin_Side         2
dtype: int64
The following columns will be translated into categorical: ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Cabin_Deck', 'Cabin_Side']


In [9]:
# Use imputer for ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Cabin_Deck', 'Cabin_Side']] columns

cols_impute = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Cabin_Deck', 'Cabin_Side']

my_imputer = SimpleImputer(strategy='most_frequent')
X_train_imputed = pd.DataFrame(my_imputer.fit_transform(X_train))
X_valid_imputed = pd.DataFrame(my_imputer.transform(X_valid))

X_test_imputed = pd.DataFrame(my_imputer.transform(X_test))

X_train_imputed.columns = X_train.columns
X_valid_imputed.columns = X_valid.columns
X_test_imputed.columns = X_test.columns


X_train = X_train_imputed
X_valid = X_valid_imputed
X_test = X_test_imputed

count_nans(X_train)

print(X_train.sample(10))

HomePlanet 0.0
CryoSleep 0.0
Cabin 0.0
Destination 0.0
Age 0.0
VIP 0.0
RoomService 0.0
FoodCourt 0.0
ShoppingMall 0.0
Spa 0.0
VRDeck 0.0
Cabin_Deck 0.0
Cabin_Side 0.0
      HomePlanet  CryoSleep  Cabin  Destination   Age  VIP  RoomService  \
758          0.0        1.0   13.0          2.0  14.0  0.0          0.0   
6248         0.0        1.0    9.0          0.0  49.0  0.0          0.0   
2167         1.0        1.0    2.0          2.0  18.0  0.0          0.0   
5469         0.0        0.0   12.0          2.0  37.0  0.0         18.0   
4583         0.0        0.0   13.0          2.0  12.0  0.0          0.0   
3223         0.0        0.0   13.0          2.0  20.0  0.0          0.0   
4590         0.0        1.0   13.0          0.0  26.0  0.0          0.0   
738          0.0        0.0   13.0          0.0  21.0  0.0        770.0   
2203         0.0        0.0   12.0          2.0  73.0  0.0          0.0   
4284         0.0        0.0   11.0          2.0  16.0  0.0          0.0   

      F

In [10]:
model = XGBRegressor(n_estimators=800,
                         learning_rate=0.05)

print(mae(X_train, X_valid, y_train, y_valid, model))

model.fit(X_train, y_train)
predictions = model.predict(X_test)

predictions = predictions.astype(bool)

NameError: name 'model_rfc' is not defined

In [None]:
# Create the model
# model = XGBRegressor(n_estimators=800,
#                          learning_rate=0.05) 
output = pd.DataFrame({'PassengerId': df_test.PassengerId, 'Transported': predictions})
output.to_csv('submission.csv', index=False)

print(len(output.index))

4277
