# Spaceship Titanic Competition

### Goals: do some fine tuning with the model, features, etc using st_3

To Do List - check if model can absorb nulls, if not convert floats to int (can't do that until there are no nulls)

In [1]:
import pandas as pd

final_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')

In [2]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


setting features

In [3]:
cabin = train_data['Cabin'].str.split('/', 3, expand=True)
cabin.columns = ['Deck', 'Num', 'Side']
train_data_mod = pd.concat([train_data, cabin], axis=1)
train_data_mod.drop(['Cabin'], axis=1, inplace=True)

In [4]:
features = train_data_mod.columns.drop('Transported').to_list()
features

['PassengerId',
 'HomePlanet',
 'CryoSleep',
 'Destination',
 'Age',
 'VIP',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'Name',
 'Deck',
 'Num',
 'Side']

In [5]:
from sklearn.model_selection import train_test_split

X = train_data_mod[features]
y = train_data_mod.Transported

X_train, X_valid, y_train, y_valid = train_test_split(X, y)

list of all categorical columns

In [6]:
cat_cols = X_train.select_dtypes(include='object').columns.to_list()
cat_cols

['PassengerId',
 'HomePlanet',
 'CryoSleep',
 'Destination',
 'VIP',
 'Name',
 'Deck',
 'Num',
 'Side']

(code from the kaggle tutorial - checking to see how many unique values are in each categorical column)

In [7]:
# Get number of unique entries in each column with categorical data
object_nunique = list(map(lambda col: X_train[col].nunique(), cat_cols))
d = dict(zip(cat_cols, object_nunique))

# Print number of unique entries by column, in ascending order
sorted(d.items(), key=lambda x: x[1])

[('CryoSleep', 2),
 ('VIP', 2),
 ('Side', 2),
 ('HomePlanet', 3),
 ('Destination', 3),
 ('Deck', 8),
 ('Num', 1712),
 ('Name', 6350),
 ('PassengerId', 6519)]

In [8]:
usecat = ['CryoSleep', 'HomePlanet', 'Destination', 'Deck', 'Side']

In [9]:
from sklearn.preprocessing import OneHotEncoder

In [10]:
# Apply one-hot encoder to each column with categorical data
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
ohe_train = pd.DataFrame(ohe.fit_transform(X_train[usecat]))
ohe_valid = pd.DataFrame(ohe.transform(X_valid[usecat]))

ohe_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6519 entries, 0 to 6518
Data columns (total 23 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       6519 non-null   float64
 1   1       6519 non-null   float64
 2   2       6519 non-null   float64
 3   3       6519 non-null   float64
 4   4       6519 non-null   float64
 5   5       6519 non-null   float64
 6   6       6519 non-null   float64
 7   7       6519 non-null   float64
 8   8       6519 non-null   float64
 9   9       6519 non-null   float64
 10  10      6519 non-null   float64
 11  11      6519 non-null   float64
 12  12      6519 non-null   float64
 13  13      6519 non-null   float64
 14  14      6519 non-null   float64
 15  15      6519 non-null   float64
 16  16      6519 non-null   float64
 17  17      6519 non-null   float64
 18  18      6519 non-null   float64
 19  19      6519 non-null   float64
 20  20      6519 non-null   float64
 21  21      6519 non-null   float64
 22  

In [11]:
# One-hot encoding removed index; put it back
ohe_train.index = X_train.index
ohe_valid.index = X_valid.index

-------------------------------------------------------------------------------------------------------------------------------

setting up for imputation

In [12]:
num = train_data._get_numeric_data()
num.drop(['Transported'], axis=1, inplace=True)
numdata = num.columns.to_list()

slightly modified code from the kaggle tutorial

In [13]:
from sklearn.ensemble import RandomForestClassifier

(I'm just splitting up a cell here so I can see what is taking so long)

https://stackoverflow.com/questions/67361786/how-to-measure-random-forest-classifier-accuracy

In [14]:
from sklearn.impute import SimpleImputer
from sklearn import metrics

In [15]:
i = SimpleImputer(strategy='most_frequent')
i_X_train = pd.DataFrame(i.fit_transform(X_train[numdata]))
i_X_valid = pd.DataFrame(i.transform(X_valid[numdata]))

i_X_train.columns = X_train[numdata].columns
i_X_valid.columns = X_valid[numdata].columns

i_X_train.index = X_train.index
i_X_valid.index = X_valid.index

i_X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6519 entries, 4451 to 1995
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Age           6519 non-null   float64
 1   RoomService   6519 non-null   float64
 2   FoodCourt     6519 non-null   float64
 3   ShoppingMall  6519 non-null   float64
 4   Spa           6519 non-null   float64
 5   VRDeck        6519 non-null   float64
dtypes: float64(6)
memory usage: 356.5 KB


-------------------------------------------------------------------------------------------------------------------------------

In [16]:
# Add one-hot encoded columns to numerical features
both_X_train = pd.concat([i_X_train, ohe_train], axis=1)
both_X_valid = pd.concat([i_X_valid, ohe_valid], axis=1)
both_X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6519 entries, 4451 to 1995
Data columns (total 29 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Age           6519 non-null   float64
 1   RoomService   6519 non-null   float64
 2   FoodCourt     6519 non-null   float64
 3   ShoppingMall  6519 non-null   float64
 4   Spa           6519 non-null   float64
 5   VRDeck        6519 non-null   float64
 6   0             6519 non-null   float64
 7   1             6519 non-null   float64
 8   2             6519 non-null   float64
 9   3             6519 non-null   float64
 10  4             6519 non-null   float64
 11  5             6519 non-null   float64
 12  6             6519 non-null   float64
 13  7             6519 non-null   float64
 14  8             6519 non-null   float64
 15  9             6519 non-null   float64
 16  10            6519 non-null   float64
 17  11            6519 non-null   float64
 18  12            6519 non-nu

In [17]:
both_X_train.columns = both_X_train.columns.astype(str)
both_X_valid.columns = both_X_valid.columns.astype(str)

importing models

In [18]:
from sklearn import ensemble
from sklearn.ensemble import GradientBoostingClassifier

doing some loops to see which parameters are the best for the gradient boosting, the lists changed to various tings throughout the process as I ran the loop and did some tests

https://intellipaat.com/blog/gradient-boosting-in-machine-learning/

In [42]:
n_estimators = [100, 125, 150, 175, 250]
max_features = [1, 2, 5, 8]
max_depth = [1, 2, 3, 4, 5]

for n in n_estimators:
    model = GradientBoostingClassifier(n_estimators=n, max_features=8, max_depth=5, random_state=1)
    model.fit(both_X_train, y_train)
    preds = model.predict(both_X_valid)
    print("max_features: ", n)
    print("Accuracy:", metrics.accuracy_score(y_valid, preds))

max_features:  100
Accuracy: 0.8077276908923643
max_features:  125
Accuracy: 0.8058877644894205
max_features:  150
Accuracy: 0.8077276908923643
max_features:  175
Accuracy: 0.8054277828886844
max_features:  250
Accuracy: 0.8058877644894205


the model with the highest accuracy score shown above

In [49]:
model = GradientBoostingClassifier(n_estimators=150, max_features=5, max_depth=2, random_state=1)
model.fit(both_X_train, y_train)
preds = model.predict(both_X_valid)
print("n_estimators: ", n)
print("max_features: ", f)
print("Accuracy:", metrics.accuracy_score(y_valid, preds))

n_estimators:  250
max_features:  10
Accuracy: 0.7907083716651334


-------------------------------------------------------------------------------------------------------------------------------

In [44]:
# doing what I did to my data to all my training data
ohe_X = pd.DataFrame(ohe.transform(X[usecat]))
ohe_X.index = X.index

i_X = pd.DataFrame(i.transform(X[numdata]))
i_X.columns = X[numdata].columns
i_X.index = X.index

both_X = pd.concat([i_X, ohe_X], axis=1)
both_X.columns = both_X.columns.astype(str)

# i_X.info()

model.fit(both_X, y)

GradientBoostingClassifier(max_depth=5, max_features=8, n_estimators=150,
                           random_state=1)

In [45]:
cabin = final_data['Cabin'].str.split('/', 3, expand=True)
cabin.columns = ['Deck', 'Num', 'Side']
final_data_mod = pd.concat([final_data, cabin], axis=1)
final_data_mod.drop(['Cabin'], axis=1, inplace=True)
final_data_mod

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Deck,Num,Side
0,0013_01,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,G,3,S
1,0018_01,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,F,4,S
2,0019_01,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,C,0,S
3,0021_01,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,C,1,S
4,0023_01,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez,F,5,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter,G,1496,S
4273,9269_01,Earth,False,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron,,,
4274,9271_01,Mars,True,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore,D,296,P
4275,9273_01,Europa,False,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale,D,297,P


In [46]:
ff = final_data_mod[features]
ohe_final = pd.DataFrame(ohe.transform(ff[usecat]))
ohe_final.index = ff.index

i_X_final = pd.DataFrame(i.fit_transform(ff[numdata]))
i_X_final.columns = ff[numdata].columns
i_X_final.index = ff.index

both_X_final = pd.concat([i_X_final, ohe_final], axis=1)
both_X_final.columns = both_X.columns.astype(str)
len(final_data)

4277

In [47]:
final_preds = model.predict(both_X_final)

In [48]:
output = pd.DataFrame({'PassengerId': final_data.PassengerId, 'Transported': final_preds})
output.to_csv('submission6.csv', index=False)