In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import copy 

In [2]:
# https://www.kaggle.com/competitions/spaceship-titanic/overview
def load_data():
    train_path = os.path.join('Datasets', 'train.csv')
    test_path = os.path.join('Datasets', 'train.csv')
    
    return pd.read_csv(train_path), pd.read_csv(test_path)

def split_passengerId(df):
    df['Position'] = df['PassengerId'].str[-2:]
    df['Group'] = df['PassengerId'].str[:4]
    df.drop(['PassengerId'], axis=1, inplace=True)  
    
def clean_cryosleep(df):
    indicies = df.loc[(df['CryoSleep'].isnull()) & ((df['FoodCourt']!=0) | (df['RoomService']!=0)
                                     | (df['ShoppingMall']!=0) | (df['Spa']!=0)
                                    | (df['VRDeck']!=0) )].index
    df.loc[indicies, 'CryoSleep'] = False

def split_cabin(df):
    df['Deck'] = df['Cabin'].str.split('/', expand=True)[0]
    df['CabinNumber'] = df['Cabin'].str.split('/', expand=True)[1]
    df['CabinSide'] = df['Cabin'].str.split('/', expand=True)[2]
    df.drop(['Cabin'], axis=1, inplace=True)

def fill_homeplanet(df):   
    df2 = df.loc[df['HomePlanet'].notnull()]
    homeplanet_dict = dict(zip(df2['Group'],df2['HomePlanet']))

    df['New_HomePlanet'] = np.where(df['HomePlanet'].isnull(),df['Group'].map(dict_df2),df['HomePlanet'])
    df.drop(['HomePlanet'], axis=1, inplace=True)
    df.rename(columns={'New_HomePlanet':'HomePlanet'}, inplace=True)
    df['HomePlanet'] = df[['HomePlanet']].fillna('Unknown')    


In [3]:
def clean_data(df):
    split_passengerId(df)
    split_cabin(df)
    clean_cryosleep(df)
    fill_homeplanet(df)
    
    df['TotalSpending'] = df['RoomService'] + df['FoodCourt'] + df['ShoppingMall'] + df['Spa'] + df['VRDeck']
    df.drop(['Name'], axis=1, inplace=True)

In [7]:

train_data, test_data = load_data()

X_train = train_data.loc[:, train_data.columns!='Transported']
y_train = train_data['Transported']
X_test = test_data.loc[:, test_data.columns!='Transported']
y_test = test_data['Transported']

paceship = X_train.copy()
spaceship_labels = y_train.copy()

In [126]:
X_train.head()

Unnamed: 0,PassengerId,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Position,Group,Deck,CabinNumber,CabinSide,TotalSpending,HomePlanet
0,0001_01,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,1,1,B,0,P,0.0,Europa
1,0002_01,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,2,F,0,S,736.0,Earth
2,0003_01,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,1,3,A,0,S,10383.0,Europa
3,0003_02,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,2,3,A,0,S,5176.0,Europa
4,0004_01,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,4,F,1,S,1091.0,Earth


In [5]:
# Need to remove all nulls
spaceship.isna().sum()

NameError: name 'spaceship' is not defined

In [8]:
# Pipeline constructor used to run transformation steps in order
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

spaceship_num = list(spaceship[['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck',
                                'TotalSpending','Position','Group','CabinNumber']])
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler())   
])



NameError: name 'spaceship' is not defined

In [182]:
# Pipeline constructor used to run transformation steps in order
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


# Use column transformer when you need to do preprocessing on different type of columns
# such as standardizing numerical columns and hot encoding categories
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder



clean_data(spaceship)

spaceship_num = list(spaceship[['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck',
                                'TotalSpending','Position','Group','CabinNumber']])
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler())   
])


spaceship_cats = list(spaceship[['CryoSleep','Destination','VIP','Deck','CabinSide', 'HomePlanet']])

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, spaceship_num),
    ('encode', OneHotEncoder(), spaceship_cats)
],remainder='passthrough')

spaceship_prepared = full_pipeline.fit_transform(spaceship)
# As you can see it returns as dtype 'sparse matrix' which can be converted to
# a NumPy array with toarray()

In [186]:
full_pipeline.get_feature_names_out()

array(['num__Age', 'num__RoomService', 'num__FoodCourt',
       'num__ShoppingMall', 'num__Spa', 'num__VRDeck',
       'num__TotalSpending', 'num__Position', 'num__Group',
       'num__CabinNumber', 'encode__CryoSleep_False',
       'encode__CryoSleep_True', 'encode__CryoSleep_nan',
       'encode__Destination_55 Cancri e',
       'encode__Destination_PSO J318.5-22',
       'encode__Destination_TRAPPIST-1e', 'encode__Destination_nan',
       'encode__VIP_False', 'encode__VIP_True', 'encode__VIP_nan',
       'encode__Deck_A', 'encode__Deck_B', 'encode__Deck_C',
       'encode__Deck_D', 'encode__Deck_E', 'encode__Deck_F',
       'encode__Deck_G', 'encode__Deck_T', 'encode__Deck_nan',
       'encode__CabinSide_P', 'encode__CabinSide_S',
       'encode__CabinSide_nan', 'encode__HomePlanet_Earth',
       'encode__HomePlanet_Europa', 'encode__HomePlanet_Mars',
       'encode__HomePlanet_Unknown'], dtype=object)

In [184]:
# Need to take care of all Nulls first

from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(spaceship_prepared, spaceship_labels)


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
X_train = train_data.loc[:, df.columns!='Transported']
y_train = train_data['Transported']
X_test = test_data.loc[:, df.columns!='Transported']
y_test = test_data['Transported']

X_train.head()