# Spaceship Titanic Competition
--------------------------------------------------------
## by alexis emerson

In [39]:
import pandas as pd

final_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')

In [40]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


## setting features up

#### splitting cabin into 3 columns (Deck, Num, Side)

In [41]:
cabin = train_data['Cabin'].str.split('/', 3, expand=True)
cabin.columns = ['Deck', 'Num', 'Side']
train_data_mod = pd.concat([train_data, cabin], axis=1)
train_data_mod.drop(['Cabin'], axis=1, inplace=True)

#### my beginning features

In [42]:
features = train_data_mod.columns.drop('Transported').to_list()
features

['PassengerId',
 'HomePlanet',
 'CryoSleep',
 'Destination',
 'Age',
 'VIP',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'Name',
 'Deck',
 'Num',
 'Side']

## splitting my data

In [43]:
from sklearn.model_selection import train_test_split

X = train_data_mod[features]
y = train_data_mod.Transported

X_train, X_valid, y_train, y_valid = train_test_split(X, y)

## categorical columns - one hot encoding

#### Listing all the categorical columns

In [44]:
cat_cols = X_train.select_dtypes(include='object').columns.to_list()
cat_cols

['PassengerId',
 'HomePlanet',
 'CryoSleep',
 'Destination',
 'VIP',
 'Name',
 'Deck',
 'Num',
 'Side']

#### checking to see the number of unique entries in every column

(code from the kaggle tutorial)

In [45]:
# Get number of unique entries in each column with categorical data
object_nunique = list(map(lambda col: X_train[col].nunique(), cat_cols))
d = dict(zip(cat_cols, object_nunique))

# Print number of unique entries by column, in ascending order
sorted(d.items(), key=lambda x: x[1])

[('CryoSleep', 2),
 ('VIP', 2),
 ('Side', 2),
 ('HomePlanet', 3),
 ('Destination', 3),
 ('Deck', 8),
 ('Num', 1718),
 ('Name', 6364),
 ('PassengerId', 6519)]

#### the final categorical columns i'm using to one hot encode

In [46]:
usecat = ['CryoSleep', 'HomePlanet', 'Destination', 'Deck', 'Side']

#### one hot encoding!

In [47]:
from sklearn.preprocessing import OneHotEncoder

In [48]:
# Apply one-hot encoder to each column with categorical data
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
ohe_train = pd.DataFrame(ohe.fit_transform(X_train[usecat]))
ohe_valid = pd.DataFrame(ohe.transform(X_valid[usecat]))

ohe_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6519 entries, 0 to 6518
Data columns (total 23 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       6519 non-null   float64
 1   1       6519 non-null   float64
 2   2       6519 non-null   float64
 3   3       6519 non-null   float64
 4   4       6519 non-null   float64
 5   5       6519 non-null   float64
 6   6       6519 non-null   float64
 7   7       6519 non-null   float64
 8   8       6519 non-null   float64
 9   9       6519 non-null   float64
 10  10      6519 non-null   float64
 11  11      6519 non-null   float64
 12  12      6519 non-null   float64
 13  13      6519 non-null   float64
 14  14      6519 non-null   float64
 15  15      6519 non-null   float64
 16  16      6519 non-null   float64
 17  17      6519 non-null   float64
 18  18      6519 non-null   float64
 19  19      6519 non-null   float64
 20  20      6519 non-null   float64
 21  21      6519 non-null   float64
 22  

#### putting my index back

In [49]:
ohe_train.index = X_train.index
ohe_valid.index = X_valid.index

-------------------------------------------------------------------------------------------------------------------------------

## numerical columns - imputation

setting up my numerical columns

In [50]:
num = train_data._get_numeric_data()
num.drop(['Transported'], axis=1, inplace=True)
numdata = num.columns.to_list()

#### imputing!

In [51]:
from sklearn.impute import SimpleImputer
from sklearn import metrics

In [52]:
i = SimpleImputer(strategy='most_frequent')
i_X_train = pd.DataFrame(i.fit_transform(X_train[numdata]))
i_X_valid = pd.DataFrame(i.transform(X_valid[numdata]))

i_X_train.columns = X_train[numdata].columns
i_X_valid.columns = X_valid[numdata].columns

i_X_train.index = X_train.index
i_X_valid.index = X_valid.index

i_X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6519 entries, 1613 to 2350
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Age           6519 non-null   float64
 1   RoomService   6519 non-null   float64
 2   FoodCourt     6519 non-null   float64
 3   ShoppingMall  6519 non-null   float64
 4   Spa           6519 non-null   float64
 5   VRDeck        6519 non-null   float64
dtypes: float64(6)
memory usage: 356.5 KB


-------------------------------------------------------------------------------------------------------------------------------

## concatenating my numerical and categorical dataframes

In [53]:
both_X_train = pd.concat([i_X_train, ohe_train], axis=1)
both_X_valid = pd.concat([i_X_valid, ohe_valid], axis=1)
both_X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6519 entries, 1613 to 2350
Data columns (total 29 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Age           6519 non-null   float64
 1   RoomService   6519 non-null   float64
 2   FoodCourt     6519 non-null   float64
 3   ShoppingMall  6519 non-null   float64
 4   Spa           6519 non-null   float64
 5   VRDeck        6519 non-null   float64
 6   0             6519 non-null   float64
 7   1             6519 non-null   float64
 8   2             6519 non-null   float64
 9   3             6519 non-null   float64
 10  4             6519 non-null   float64
 11  5             6519 non-null   float64
 12  6             6519 non-null   float64
 13  7             6519 non-null   float64
 14  8             6519 non-null   float64
 15  9             6519 non-null   float64
 16  10            6519 non-null   float64
 17  11            6519 non-null   float64
 18  12            6519 non-nu

#### turning all my column titles into strings

In [54]:
both_X_train.columns = both_X_train.columns.astype(str)
both_X_valid.columns = both_X_valid.columns.astype(str)

## modelling!

#### importing models

In [55]:
from sklearn import ensemble
from sklearn.ensemble import GradientBoostingClassifier

some links! https://intellipaat.com/blog/gradient-boosting-in-machine-learning/, https://stackoverflow.com/questions/67361786/how-to-measure-random-forest-classifier-accuracy

#### my model, fitted to the train data

In [56]:
model = GradientBoostingClassifier(n_estimators=150, max_features=5, max_depth=2, random_state=1)
model.fit(both_X_train, y_train)
preds = model.predict(both_X_valid)
print("Accuracy:", metrics.accuracy_score(y_valid, preds))

Accuracy: 0.8035878564857406


-------------------------------------------------------------------------------------------------------------------------------

## refitting my model to ALL my training data

In [57]:
# doing what I did to my data to all my training data
ohe_X = pd.DataFrame(ohe.transform(X[usecat]))
ohe_X.index = X.index

i_X = pd.DataFrame(i.transform(X[numdata]))
i_X.columns = X[numdata].columns
i_X.index = X.index

both_X = pd.concat([i_X, ohe_X], axis=1)
both_X.columns = both_X.columns.astype(str)

# i_X.info()

model.fit(both_X, y)

GradientBoostingClassifier(max_depth=2, max_features=5, n_estimators=150,
                           random_state=1)

## modifying my final data

#### splitting final data Cabin column (as above)

In [58]:
cabin = final_data['Cabin'].str.split('/', 3, expand=True)
cabin.columns = ['Deck', 'Num', 'Side']
final_data_mod = pd.concat([final_data, cabin], axis=1)
final_data_mod.drop(['Cabin'], axis=1, inplace=True)

#### one hot encoding, imputing, concatenating final data (as above)

In [59]:
ff = final_data_mod[features]
ohe_final = pd.DataFrame(ohe.transform(ff[usecat]))
ohe_final.index = ff.index

i_X_final = pd.DataFrame(i.fit_transform(ff[numdata]))
i_X_final.columns = ff[numdata].columns
i_X_final.index = ff.index

both_X_final = pd.concat([i_X_final, ohe_final], axis=1)
both_X_final.columns = both_X.columns.astype(str)

#### predicting

In [60]:
final_preds = model.predict(both_X_final)

## code for submissions

In [61]:
output = pd.DataFrame({'PassengerId': final_data.PassengerId, 'Transported': final_preds})
output.to_csv('submission7.csv', index=False)