# LAB | Ensemble Methods

**Load the data**

In this challenge, we will be working with the same Spaceship Titanic data, like the previous Lab. The data can be found here:

https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv

Metadata

https://github.com/data-bootcamp-v4/data/blob/main/spaceship_titanic.md

In this Lab, you should try different ensemble methods in order to see if can obtain a better model than before. In order to do a fair comparison, you should perform the same feature scaling, engineering applied in previous Lab.

In [6]:
#Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [10]:
spaceship = pd.read_csv("https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv")
spaceship.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [11]:
spaceship.isna().sum()
spaceship = spaceship.dropna()

In [13]:
#split by '/' 
spaceship[['Cabin', 'Deck', 'Side']] = spaceship["Cabin"].str.split("/", expand = True)


In [14]:
spaceship.drop(columns = ["PassengerId", "Name"], inplace = True)

In [15]:
# label HomePlanet
# Earth = 0
# Mars = 1
# Europa = 2

def label_planets(planet):
        if planet == "Earth":
                return 0
        elif planet == "Mars":
                return 1
        else:
                return 2

spaceship["HomePlanet"] = spaceship["HomePlanet"].apply(label_planets)

In [16]:
#label CryoSleep 
# False = 0
# True = 1

def label_cryo(cryosleep):
        if cryosleep == False:
                return 0
        elif cryosleep == True:
                return 1
        
spaceship["CryoSleep"] = spaceship["CryoSleep"].apply(label_cryo)

In [17]:
# label Cabin
# A = 0
# B = 1
# C = 2
# ...
#  T = 7

def label_cabin(cabin):
        if cabin == "A":
                return 0
        elif cabin == "B":
                return 1
        elif cabin == "C":
                return 2
        elif cabin == "D":
                return 3
        elif cabin == "E":
                return 4
        elif cabin == "F":
                return 5
        elif cabin == "G":
                return 6
        else:
                return 7

spaceship["Cabin"] = spaceship["Cabin"].apply(label_cabin)


In [18]:
# label Destination
# TRAPPIST-1e = 10
# PSO J318.5-22 = 11
# 55 Cancri e = 12

def label_destiny(planet):
        if planet == "TRAPPIST-1e":
                return 10
        elif planet == "PSO J318.5-22":
                return 11
        else:
                return 12

spaceship["Destination"] = spaceship["Destination"].apply(label_destiny)

In [19]:
# label Side
# S = 0
# P = 1

def label_side(side):
        if side == "S":
                return 0
        else:
                return 1

spaceship["Side"] = spaceship["Side"].apply(label_side)

In [None]:
#define VIP and Desck as int
spaceship["Deck"] = spaceship["Deck"].astype(int)
spaceship["VIP"] = spaceship["VIP"].astype(int)

In [None]:
spaceship.head(10)

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Side
0,2,0,1,10,39.0,False,0.0,0.0,0.0,0.0,0.0,False,0,1
1,0,0,5,10,24.0,False,109.0,9.0,25.0,549.0,44.0,True,0,0
2,2,0,0,10,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,0,0
3,2,0,0,10,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,0,0
4,0,0,5,10,16.0,False,303.0,70.0,151.0,565.0,2.0,True,1,0
5,0,0,5,11,44.0,False,0.0,483.0,0.0,291.0,0.0,True,0,1
6,0,0,5,10,26.0,False,42.0,1539.0,3.0,0.0,0.0,True,2,0
8,0,0,5,10,35.0,False,0.0,785.0,17.0,216.0,0.0,True,3,0
9,2,1,1,12,14.0,False,0.0,0.0,0.0,0.0,0.0,True,1,1
11,2,0,1,12,45.0,False,39.0,7295.0,589.0,110.0,124.0,True,1,1


Now perform the same as before:
- Feature Scaling
- Feature Selection


In [25]:
#creating the normalizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler

#MixMax will scale values between 0 and 1
normalizer = MinMaxScaler()
normalizer.fit(X_train)

In [26]:
#transform training and testing data - needed to transform both so the results are comparable
X_train_norm = normalizer.transform(X_train)
X_test_norm = normalizer.transform(X_test)

In [27]:
#creating a df for this data
X_train_norm = pd.DataFrame(X_train_norm, columns=X_train.columns)
X_test_norm = pd.DataFrame(X_test_norm, columns=X_test.columns)

display(X_train_norm)
display(X_test_norm)

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Side
0,0.0,0.0,0.714286,0.0,0.227848,0.0,0.000000,0.009660,0.003270,0.000000,0.021391,0.603485,1.0
1,0.0,0.0,0.714286,0.0,0.430380,0.0,0.000000,0.000000,0.000000,0.047060,0.000000,0.076558,0.0
2,1.0,0.0,0.142857,1.0,0.443038,0.0,0.008770,0.057659,0.256889,0.000000,0.003885,0.085533,1.0
3,0.0,1.0,0.857143,0.0,0.037975,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.737064,1.0
4,1.0,0.0,0.428571,0.0,0.746835,0.0,0.000000,0.074129,0.000000,0.219955,0.000541,0.035903,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5279,0.0,0.0,0.857143,0.0,0.037975,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.013728,1.0
5280,1.0,0.0,0.285714,1.0,0.569620,0.0,0.006653,0.200382,0.000000,0.000000,0.030734,0.127244,1.0
5281,1.0,0.0,0.142857,1.0,0.367089,0.0,0.008065,0.124342,0.000000,0.078882,0.064910,0.153643,1.0
5282,0.0,0.0,0.857143,0.0,0.544304,0.0,0.000907,0.000000,0.100794,0.000000,0.035848,0.206441,0.0


Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Side
0,0.0,0.0,0.571429,1.0,0.329114,0.0,0.000101,0.000000,0.000000,0.000000,0.035307,0.248680,1.0
1,0.0,0.0,0.857143,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.612988,1.0
2,0.5,0.0,0.714286,0.0,0.556962,0.0,0.000101,0.000000,0.194302,0.000592,0.000000,0.897571,0.0
3,1.0,0.0,0.142857,1.0,0.405063,0.0,0.000000,0.005166,0.000000,0.295714,0.094611,0.096093,0.0
4,0.0,0.0,0.714286,0.0,0.265823,0.0,0.016431,0.004897,0.005044,0.013192,0.004672,0.910771,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1317,0.0,1.0,0.857143,0.0,0.303797,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.230729,1.0
1318,0.0,0.0,0.857143,0.0,0.215190,0.0,0.000302,0.002784,0.086595,0.000000,0.000049,0.346885,1.0
1319,0.0,1.0,0.857143,0.0,0.025316,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.167371,0.0
1320,1.0,0.0,0.142857,0.0,0.544304,0.0,0.000000,0.300138,0.000000,0.008238,0.042880,0.158395,0.0


**Perform Train Test Split**

In [23]:
# split df in features and target
features = spaceship.drop("Transported", axis=1)
target = spaceship["Transported"]

display(features)
display(target)

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Side
0,2,0,1,10,39.0,False,0.0,0.0,0.0,0.0,0.0,0,1
1,0,0,5,10,24.0,False,109.0,9.0,25.0,549.0,44.0,0,0
2,2,0,0,10,58.0,True,43.0,3576.0,0.0,6715.0,49.0,0,0
3,2,0,0,10,33.0,False,0.0,1283.0,371.0,3329.0,193.0,0,0
4,0,0,5,10,16.0,False,303.0,70.0,151.0,565.0,2.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,2,0,0,12,41.0,True,0.0,6819.0,0.0,1643.0,74.0,98,1
8689,0,1,6,11,18.0,False,0.0,0.0,0.0,0.0,0.0,1499,0
8690,0,0,6,10,26.0,False,0.0,0.0,1872.0,1.0,0.0,1500,0
8691,2,0,4,12,32.0,False,0.0,1049.0,0.0,353.0,3235.0,608,0


0       False
1        True
2       False
3       False
4        True
        ...  
8688    False
8689    False
8690     True
8691    False
8692     True
Name: Transported, Length: 6606, dtype: bool

In [24]:
#split into test and train data
X_train, X_test, y_train, y_test =  train_test_split(features, target, test_size=0.20)

**Model Selection** - now you will try to apply different ensemble methods in order to get a better model

In [29]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor

from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error

- Bagging and Pasting

In [49]:
bagging_reg = BaggingRegressor(DecisionTreeRegressor(max_depth=20),
                               n_estimators=2000,
                               max_samples = 1000,
                               bootstrap=False)

In [50]:
bagging_reg.fit(X_train_norm, y_train)

In [51]:
pred = bagging_reg.predict(X_test_norm)

print("R2 score", r2_score(y_test, pred))
print("RMSE", root_mean_squared_error(y_test, pred))
print("MAE", mean_absolute_error(y_test, pred))

R2 score 0.44387032222097766
RMSE 0.37283594847338886
MAE 0.27746598670261774


In [52]:
pred = bagging_reg.predict(X_train_norm)

print("R2 score", r2_score(y_train, pred))
print("RMSE", root_mean_squared_error(y_train, pred))
print("MAE", mean_absolute_error(y_train, pred))

R2 score 0.6600386253812123
RMSE 0.2915082914969647
MAE 0.21561358441589434


In [None]:
#0.444 on test_data
#0.660 on train_data

# big difference it overfits everytime with bagging

In [None]:
### MISSING PASTING !!! 

- Random Forests

In [101]:
forest = RandomForestRegressor(n_estimators=800, max_depth=6)

In [102]:
forest.fit(X_train_norm, y_train)

In [103]:
pred = forest.predict(X_test_norm)

print("R2 score", r2_score(y_test, pred))
print("RMSE", root_mean_squared_error(y_test, pred))
print("MAE", mean_absolute_error(y_test, pred))

R2 score 0.4247359479063654
RMSE 0.37919566351802686
MAE 0.2847206584862691


In [104]:
pred = forest.predict(X_train_norm)

print("R2 score", r2_score(y_train, pred))
print("RMSE", root_mean_squared_error(y_train, pred))
print("MAE", mean_absolute_error(y_train, pred))

R2 score 0.5230002569788883
RMSE 0.3452989011744551
MAE 0.25820306921280456


In [None]:
# generaly more estimators increase R2 score on train_data making the model overfit
# 

- Gradient Boosting

In [145]:
gb_reg = GradientBoostingRegressor(max_depth=6, n_estimators=30)
gb_reg.fit(X_train_norm, y_train)

In [146]:
pred = gb_reg.predict(X_test_norm)

print("R2 score", r2_score(y_test, pred))
print("RMSE", root_mean_squared_error(y_test, pred))
print("MAE", mean_absolute_error(y_test, pred))

R2 score 0.4503081707034494
RMSE 0.37067166162080345
MAE 0.2804973274320405


In [147]:
pred = gb_reg.predict(X_train_norm)

print("R2 score", r2_score(y_train, pred))
print("RMSE", root_mean_squared_error(y_train, pred))
print("MAE", mean_absolute_error(y_train, pred))

R2 score 0.6128342018382649
RMSE 0.3110889724260125
MAE 0.23392829109661106


- Adaptive Boosting

In [154]:
gb_reg = GradientBoostingRegressor(max_depth=6, n_estimators=70)
gb_reg.fit(X_train_norm, y_train)

In [155]:
pred = gb_reg.predict(X_test_norm)

print("R2 score", r2_score(y_test, pred))
print("RMSE", root_mean_squared_error(y_test, pred))
print("MAE", mean_absolute_error(y_test, pred))

R2 score 0.45693389316600985
RMSE 0.36843093952026634
MAE 0.2659670894512934


In [156]:
pred = gb_reg.predict(X_train_norm)

print("R2 score", r2_score(y_train, pred))
print("RMSE", root_mean_squared_error(y_train, pred))
print("MAE", mean_absolute_error(y_train, pred))

R2 score 0.6902504984430217
RMSE 0.2782540100541956
MAE 0.19710279230239985


Which model is the best and why?

In [None]:
# best model seems to be Adaptive Boosting
# it has the highest value for R2 squared and the lowest value for errors
# Note: it does overfit and that should be upgraded