In [437]:

import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn as sk
import sklearn.pipeline as skp
import sklearn.preprocessing as skpp
import sklearn.impute as ski
import sklearn.compose as skc
import matplotlib.pyplot as plt

In [438]:
print(tf.__version__)

2.6.0


In [439]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [440]:
train_df = pd.read_csv('dataset/train.csv')
test_df = pd.read_csv('dataset/test.csv')
dataset_df = pd.concat([train_df, test_df], ignore_index=True)


print("Full train dataset shape is {}".format(dataset_df.shape))

Full train dataset shape is (12970, 14)


In [441]:
dataset_df.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [442]:
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   12970 non-null  object 
 1   HomePlanet    12682 non-null  object 
 2   CryoSleep     12660 non-null  object 
 3   Cabin         12671 non-null  object 
 4   Destination   12696 non-null  object 
 5   Age           12700 non-null  float64
 6   VIP           12674 non-null  object 
 7   RoomService   12707 non-null  float64
 8   FoodCourt     12681 non-null  float64
 9   ShoppingMall  12664 non-null  float64
 10  Spa           12686 non-null  float64
 11  VRDeck        12702 non-null  float64
 12  Name          12676 non-null  object 
 13  Transported   8693 non-null   object 
dtypes: float64(6), object(8)
memory usage: 1.4+ MB


In [443]:
dataset_df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,12700.0,12707.0,12681.0,12664.0,12686.0,12702.0
mean,28.771969,222.897852,451.961675,174.906033,308.476904,306.789482
std,14.387261,647.596664,1584.370747,590.55869,1130.279641,1180.097223
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,49.0,77.0,29.0,57.0,42.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [444]:
dataset_df.isnull().sum().sort_values(ascending=False)

Transported     4277
CryoSleep        310
ShoppingMall     306
Cabin            299
VIP              296
Name             294
FoodCourt        289
HomePlanet       288
Spa              284
Destination      274
Age              270
VRDeck           268
RoomService      263
PassengerId        0
dtype: int64

### Delete useless column 'Name'


In [445]:
dataset_df = dataset_df.drop('Name', axis=1)

### If a passenger is in cryosleep then he did not spend any money. Moreover , we can assume that if a passenger did not spent any money , he was in cryosleep


In [446]:
columns_money = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

dataset_df.loc[dataset_df["CryoSleep"] == 1,
               columns_money] = dataset_df.loc[dataset_df["CryoSleep"] == 1, columns_money].fillna(0)
dataset_df.loc[dataset_df["CryoSleep"] == 0,
               "VIP"] = dataset_df.loc[dataset_df["CryoSleep"] == 0, "VIP"].fillna(0)

dataset_df.loc[dataset_df[columns_money].sum(
    axis=1) == 0, "CryoSleep"] = dataset_df.loc[dataset_df[columns_money].sum(axis=1) == 0, "CryoSleep"].fillna(1)
dataset_df["CryoSleep"] = dataset_df["CryoSleep"].fillna(0)

### Dealing with AGE null values .


In [447]:
grouped = dataset_df.groupby(['HomePlanet', 'CryoSleep'])['Age']
dataset_df = dataset_df.merge(grouped.mean().reset_index(
    name="Age_means"), how='left', on=['HomePlanet', 'CryoSleep'])
dataset_df['Age'] = dataset_df['Age'].fillna(dataset_df['Age_means'])
dataset_df.drop('Age_means', axis=1)

  output = repr(obj)
  return method()


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.000000,False,0.0,0.0,0.0,0.0,0.0,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.000000,False,109.0,9.0,25.0,549.0,44.0,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.000000,True,43.0,3576.0,0.0,6715.0,49.0,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.000000,False,0.0,1283.0,371.0,3329.0,193.0,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.000000,False,303.0,70.0,151.0,565.0,2.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.000000,False,0.0,0.0,0.0,0.0,0.0,
12966,9269_01,Earth,False,,TRAPPIST-1e,42.000000,False,0.0,847.0,17.0,10.0,144.0,
12967,9271_01,Mars,True,D/296/P,55 Cancri e,28.876089,False,0.0,0.0,0.0,0.0,0.0,
12968,9273_01,Europa,False,D/297/P,,35.590855,False,0.0,2680.0,0.0,0.0,523.0,


### Format of the Cabin si Deck/Cabin_num/Side so we will split it in 3 different columns


In [448]:
dataset_df[["Deck", "Cabin_num", "Side"]
           ] = dataset_df["Cabin"].str.split("/", expand=True)

In [449]:
dataset_df.drop('Cabin', axis=1, inplace=True)

In [450]:
dataset_df.isnull().sum()

PassengerId        0
HomePlanet       288
CryoSleep          0
Destination      274
Age                5
VIP              113
RoomService      170
FoodCourt        180
ShoppingMall     175
Spa              177
VRDeck           177
Transported     4277
Age_means        288
Deck             299
Cabin_num        299
Side             299
dtype: int64

### Split categorical and numerical features


In [451]:
numerical_features = ['RoomService', 'FoodCourt', 'Cabin_num',
                      'ShoppingMall', 'Spa', 'VRDeck', 'Age']
category_features = ['HomePlanet', 'Deck',
                     'Side', 'Destination', 'CryoSleep', 'VIP']

In [452]:
numerical_pipeline = skp.Pipeline([('imputer', ski.SimpleImputer(
    strategy='mean')), ('scaler', skpp.StandardScaler())])
categorical_pipeline = skp.Pipeline([('imputer', ski.SimpleImputer(
    strategy='most_frequent')), ('encoder', skpp.OneHotEncoder(handle_unknown='ignore'))])
preprocessing = skc.ColumnTransformer(
    [('numerical', numerical_pipeline, numerical_features), ('categorical', categorical_pipeline, category_features)])

### Split test and train data


In [453]:
train_set = dataset_df.iloc[:len(train_df)]
test_set = dataset_df.iloc[len(train_df):len(train_df)+len(test_df)]
y = train_set["Transported"]
train_set.drop(columns=["Transported"], inplace=True)
test_set.drop(columns=["Transported"], inplace=True)
y = y.astype('bool')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set.drop(columns=["Transported"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set.drop(columns=["Transported"], inplace=True)


In [454]:


# Shape before transformation
print('X_train shape: {}'.format(train_set.shape))
print('y_train shape: {}'.format(y.shape))
print('X_test shape: {}'.format(test_set.shape))

X_train shape: (8693, 15)
y_train shape: (8693,)
X_test shape: (4277, 15)


In [455]:
train_set_prepared = preprocessing.fit_transform(train_set)
test_set_prepared = preprocessing.transform(test_set)

feature_names = preprocessing.named_transformers_[
    'categorical'].named_steps['encoder'].get_feature_names_out(input_features=category_features)
all_feature_names = np.concatenate([numerical_features, feature_names])

In [456]:
transormed_train_set = pd.DataFrame(
    train_set_prepared, columns=all_feature_names)
transformed_test_set = pd.DataFrame(
    test_set_prepared, columns=all_feature_names)
transormed_train_set.head(5)

Unnamed: 0,RoomService,FoodCourt,Cabin_num,ShoppingMall,Spa,VRDeck,Age,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,...,Deck_T,Side_P,Side_S,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,CryoSleep_False,CryoSleep_True,VIP_False,VIP_True
0,-0.337739,-0.284876,-1.186627,-0.287431,-0.274488,-0.267007,0.709094,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
1,-0.172587,-0.279233,-1.186627,-0.245601,0.213542,-0.228189,-0.336252,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2,-0.272587,1.957315,-1.186627,-0.287431,5.694769,-0.223778,2.033198,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
3,-0.337739,0.519579,-1.186627,0.333325,2.684806,-0.096736,0.290955,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
4,0.121354,-0.240985,-1.184651,-0.034778,0.227765,-0.265243,-0.89377,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0


In [457]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
Seed = 42
# First GBM Model
gbm_model_1 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_features='sqrt', max_depth=5, random_state=Seed,
                                         min_samples_split=2, min_samples_leaf=3, loss='exponential', subsample=0.5)

# Second GBM Model
gbm_model_2 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_features='log2', max_depth=5, random_state=Seed,
                                         min_samples_split=2, min_samples_leaf=3, subsample=0.5,)

# Third GBM Model
gbm_model_3 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_features='log2', max_depth=5, random_state=Seed,
                                         min_samples_split=2, min_samples_leaf=3, subsample=0.5, loss='exponential')

In [458]:
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(
    transormed_train_set, y, test_size=0.2, random_state=Seed)  # Splitting the data into training and validation sets

In [459]:
from sklearn.metrics import accuracy_score

# Fit both models
gbm_model_1.fit(X_train, y_train)
gbm_model_2.fit(X_train, y_train)
gbm_model_3.fit(X_train, y_train)

# Run through predictions initially to train the Logistic Regression for predicting
gbm_1_predictions = gbm_model_1.predict(X_test)
gbm_2_predictions = gbm_model_2.predict(X_test)
gbm_3_predictions = gbm_model_3.predict(X_test)


# Stack both predictions to form a training set for the Logistic Regression
stacked_features = np.column_stack(
    (gbm_1_predictions, gbm_2_predictions, gbm_3_predictions))
print(stacked_features)

# Fit the meta model to stacked features
meta_model = LogisticRegression()
meta_model.fit(stacked_features, y_test)

# Predict yet again to get a new metric
gbm_1_base_preds = gbm_model_1.predict(X_test)
gbm_2_base_preds = gbm_model_2.predict(X_test)
gbm_3_base_preds = gbm_model_3.predict(X_test)


# Stack both predictions yet again
stacked_base_preds = np.column_stack(
    (gbm_1_base_preds, gbm_2_base_preds, gbm_3_base_preds))

# Get final predictions to gauge overall performance
ensemble_predictions = meta_model.predict(stacked_base_preds)

# Score the predictions and print them
ensemble_accuracy = accuracy_score(y_test, ensemble_predictions)
print("Ensemble Accuracy:", ensemble_accuracy)

[[False False False]
 [ True  True  True]
 [ True  True  True]
 ...
 [False False False]
 [ True  True  True]
 [False False False]]
Ensemble Accuracy: 0.7912593444508338


In [460]:

# Get new base predictions for actual test set
gbm_1_base_preds = gbm_model_1.predict(transformed_test_set)
gbm_2_base_preds = gbm_model_2.predict(transformed_test_set)
gbm_3_base_preds = gbm_model_3.predict(transformed_test_set)

# Combine both predictions for the meta model to predict
stacked_base_preds = np.column_stack(
    (gbm_1_base_preds, gbm_2_base_preds, gbm_3_base_preds))
ensemble_predictions = meta_model.predict(stacked_base_preds)

output = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': ensemble_predictions
})
output.to_csv('submission.csv', index=False)
print('Submission Saved')

Submission Saved
