In [4]:
# Set up environment with libraries & data

# Importing Packages/Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
import itertools

from tqdm.notebook import tqdm

from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix,classification_report,roc_auc_score,roc_curve,make_scorer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.preprocessing import RobustScaler
from imblearn.over_sampling import SMOTE
from six import StringIO
from IPython.display import Image
import statsmodels.formula.api as smf
import pydotplus
# Ignore Warnings
import warnings
warnings.filterwarnings('ignore')
# Accessing Dataset
r_train = pd.read_csv("data/train.csv")
r_test = pd.read_csv("data/test.csv")

In [5]:
#Cleaning data
le = preprocessing.LabelEncoder()

# Cleaning and Nulls
train = r_train[['HomePlanet',
                 'CryoSleep',
                 'Cabin',
                 'Destination',
                 'Age',
                 'VIP',
                 'RoomService',
                 'FoodCourt',
                 'ShoppingMall',
                 'Spa',
                 'VRDeck',
                 'Transported']]

train[['CabinFloor','CabinNumber','CabinType']] = train['Cabin'].str.split(pat='/',expand=True)

train = train.astype({'HomePlanet':'category',
                      'CryoSleep':'string',
                      'Cabin':'string',
                      'Destination':'category',
                      'Age':'float',
                      'VIP':'string',
                      'RoomService':'float',
                      'FoodCourt':'float',
                      'ShoppingMall':'float',
                      'Spa':'float',
                      'VRDeck':'float',
                      'Transported':'string',
                      'CabinFloor':'category',
                      'CabinNumber':'float',
                      'CabinType':'category'})

# 2
train['CryoSleep_N'] = train['CryoSleep'].map({'True':1,'False':0})
train['VIP_N'] = train['VIP'].map({'True':1,'False':0})
train['Transported_N'] = train['Transported'].map({'True':1,'False':0})
train['CabinType_N'] = le.fit_transform(train['CabinType'])

fillnalist_1 = ['CryoSleep_N','VIP_N','Transported_N','CabinType_N']

for column in fillnalist_1:
    prob = (train[column].sum() / train[column].notnull().sum())
    train[column] = train[column].apply(lambda x:
                                        1 if x == 1
                                        else(0 if x == 0
                                             else(1 if random.random() < prob
                                                  else 0)))

train['HomePlanet_N'] = le.fit_transform(train['HomePlanet'])
train['Destination_N'] = le.fit_transform(train['Destination'])

fillnalist_2 = ['HomePlanet_N','Destination_N']

for column in fillnalist_2:
    prob2 = (len(train[train[column]==2]) / train[column].notnull().sum())
    prob1 = (len(train[train[column]==1]) / train[column].notnull().sum())
    train[column] = train[column].apply(lambda x:
                                        2 if x == 2
                                        else(1 if x == 1
                                             else(0 if x == 0
                                                  else(2 if random.random() < prob2
                                                       else (1 if random.random() < (prob1+prob2)
                                                             else 0)))))

train['CabinFloor_N'] = le.fit_transform(train['CabinFloor'])
prob7 = (len(train[train['CabinFloor_N']==7]) / train['CabinFloor_N'].notnull().sum())
prob6 = (len(train[train['CabinFloor_N']==6]) / train['CabinFloor_N'].notnull().sum())
prob5 = (len(train[train['CabinFloor_N']==5]) / train['CabinFloor_N'].notnull().sum())
prob4 = (len(train[train['CabinFloor_N']==4]) / train['CabinFloor_N'].notnull().sum())
prob3 = (len(train[train['CabinFloor_N']==3]) / train['CabinFloor_N'].notnull().sum())
prob2 = (len(train[train['CabinFloor_N']==2]) / train['CabinFloor_N'].notnull().sum())
prob1 = (len(train[train['CabinFloor_N']==1]) / train['CabinFloor_N'].notnull().sum())
train['CabinFloor_N'] = train['CabinFloor_N'].apply(lambda x:
                                                    7 if x == 7 else(6 if x == 6 else(5 if x == 5 else(4 if x == 4 else(3 if x == 3 else(2 if x == 2 else(1 if x == 1 else(0 if x == 0
                                                    else(7 if random.random() < prob7
                                                        else(6 if random.random() < (prob7+prob6)
                                                             else(5 if random.random() < (prob7+prob6+prob5)
                                                                 else(4 if random.random() < (prob7+prob6+prob5+prob4)
                                                                     else(3 if random.random() < (prob7+prob6+prob5+prob4+prob3)
                                                                         else(2 if random.random() < (prob7+prob6+prob5+prob4+prob3+prob2)
                                                                         else(1 if random.random() < (prob7+prob6+prob5+prob4+prob3+prob2+prob1)
                                                                         else 0)))))))))))))))

# Dummies
dum_HomePlanet = pd.get_dummies(train['HomePlanet_N'],prefix='HomePlanet',prefix_sep='_')
dum_Destination = pd.get_dummies(train['Destination_N'],prefix='Destination',prefix_sep='_')
dum_CabinFloor = pd.get_dummies(train['CabinFloor_N'],prefix='CabinFloor',prefix_sep='_')
dum_CabinType = pd.get_dummies(train['CabinType_N'],prefix='CabinType',prefix_sep='_')
dummies_list = [dum_HomePlanet,dum_Destination,dum_CabinFloor,dum_CabinType]

for df in dummies_list:
    train = train.merge(df,
                        how="left",
                        left_index=True,
                        right_index=True)

train['Age'] = train['Age'].astype('float').fillna(train['Age'].median()).astype('float')
train['RoomService'] = train['RoomService'].astype('float').fillna(train['RoomService'].median()).astype('float')
train['FoodCourt'] = train['FoodCourt'].astype('float').fillna(train['FoodCourt'].median()).astype('float')
train['ShoppingMall'] = train['ShoppingMall'].astype('float').fillna(train['ShoppingMall'].median()).astype('float')
train['Spa'] = train['Spa'].astype('float').fillna(train['Spa'].median()).astype('float')
train['VRDeck'] = train['VRDeck'].astype('float').fillna(train['VRDeck'].median()).astype('float')
train['CabinNumber'] = train['CabinNumber'].astype('float').fillna(train['CabinNumber'].median()).astype('float')

# Bins
train['AgeBin'] = pd.cut(train['Age'], bins=5,labels=(1,2,3,4,5))
train['RoomServiceBin'] = pd.cut(train['RoomService'], bins=5,labels=(1,2,3,4,5))
train['FoodCourtBin'] = pd.cut(train['FoodCourt'], bins=5,labels=(1,2,3,4,5))
train['ShoppingMallBin'] = pd.cut(train['ShoppingMall'], bins=5,labels=(1,2,3,4,5))
train['SpaBin'] = pd.cut(train['Spa'], bins=5,labels=(1,2,3,4,5))
train['VRDeckBin'] = pd.cut(train['VRDeck'], bins=5,labels=(1,2,3,4,5))
train['CabinNumberBin'] = pd.cut(train['CabinNumber'], bins=5,labels=(1,2,3,4,5))

In [6]:
train_final = train[['HomePlanet_N',
                      'HomePlanet_0',
                      'HomePlanet_1',
                      'HomePlanet_2',
                      'CryoSleep_N',
                      'CabinFloor_N',
                      'CabinFloor_0',
                      'CabinFloor_1',
                      'CabinFloor_2',
                      'CabinFloor_3',
                      'CabinFloor_4',
                      'CabinFloor_5',
                      'CabinFloor_6',
                      'CabinFloor_7',
                      'CabinNumber',
                      'CabinNumberBin',
                      'CabinType_N',
                      'CabinType_0',
                      'CabinType_1',
                      'Destination_N',
                      'Destination_0',
                      'Destination_1',
                      'Destination_2',
                      'Age',
                      'AgeBin',
                      'VIP_N',
                      'RoomService',
                      'RoomServiceBin',
                      'FoodCourt',
                      'FoodCourtBin',
                      'ShoppingMall',
                      'ShoppingMallBin',
                      'Spa',
                      'SpaBin',
                      'VRDeck',
                      'VRDeckBin',
                      'Transported',
                      'Transported_N']]

train_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 38 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   HomePlanet_N     8693 non-null   int64   
 1   HomePlanet_0     8693 non-null   uint8   
 2   HomePlanet_1     8693 non-null   uint8   
 3   HomePlanet_2     8693 non-null   uint8   
 4   CryoSleep_N      8693 non-null   int64   
 5   CabinFloor_N     8693 non-null   int64   
 6   CabinFloor_0     8693 non-null   uint8   
 7   CabinFloor_1     8693 non-null   uint8   
 8   CabinFloor_2     8693 non-null   uint8   
 9   CabinFloor_3     8693 non-null   uint8   
 10  CabinFloor_4     8693 non-null   uint8   
 11  CabinFloor_5     8693 non-null   uint8   
 12  CabinFloor_6     8693 non-null   uint8   
 13  CabinFloor_7     8693 non-null   uint8   
 14  CabinNumber      8693 non-null   float64 
 15  CabinNumberBin   8693 non-null   category
 16  CabinType_N      8693 non-null   int64   


In [7]:
le = preprocessing.LabelEncoder()

# Cleaning and Nulls
test = r_test[['PassengerId',
               'HomePlanet',
               'CryoSleep',
               'Cabin',
               'Destination',
               'Age',
               'VIP',
               'RoomService',
               'FoodCourt',
               'ShoppingMall',
               'Spa',
               'VRDeck']]

test[['CabinFloor','CabinNumber','CabinType']] = test['Cabin'].str.split(pat='/',expand=True)

test = test.astype({'HomePlanet':'category',
                      'CryoSleep':'string',
                      'Cabin':'string',
                      'Destination':'category',
                      'Age':'float',
                      'VIP':'string',
                      'RoomService':'float',
                      'FoodCourt':'float',
                      'ShoppingMall':'float',
                      'Spa':'float',
                      'VRDeck':'float',
                      'CabinFloor':'category',
                      'CabinNumber':'float',
                      'CabinType':'category'})

# 2
test['CryoSleep_N'] = test['CryoSleep'].map({'True':1,'False':0})
test['VIP_N'] = test['VIP'].map({'True':1,'False':0})
test['CabinType_N'] = le.fit_transform(test['CabinType'])

fillnalist_1 = ['CryoSleep_N','VIP_N','CabinType_N']

for column in fillnalist_1:
    prob = (test[column].sum() / test[column].notnull().sum())
    test[column] = test[column].apply(lambda x:
                                        1 if x == 1
                                        else(0 if x == 0
                                             else(1 if random.random() < prob
                                                  else 0)))

test['HomePlanet_N'] = le.fit_transform(test['HomePlanet'])
test['Destination_N'] = le.fit_transform(test['Destination'])

fillnalist_2 = ['HomePlanet_N','Destination_N']

for column in fillnalist_2:
    prob2 = (len(test[test[column]==2]) / test[column].notnull().sum())
    prob1 = (len(test[test[column]==1]) / test[column].notnull().sum())
    test[column] = test[column].apply(lambda x:
                                        2 if x == 2
                                        else(1 if x == 1
                                             else(0 if x == 0
                                                  else(2 if random.random() < prob2
                                                       else (1 if random.random() < (prob1+prob2)
                                                             else 0)))))

test['CabinFloor_N'] = le.fit_transform(test['CabinFloor'])
prob7 = (len(test[test['CabinFloor_N']==7]) / test['CabinFloor_N'].notnull().sum())
prob6 = (len(test[test['CabinFloor_N']==6]) / test['CabinFloor_N'].notnull().sum())
prob5 = (len(test[test['CabinFloor_N']==5]) / test['CabinFloor_N'].notnull().sum())
prob4 = (len(test[test['CabinFloor_N']==4]) / test['CabinFloor_N'].notnull().sum())
prob3 = (len(test[test['CabinFloor_N']==3]) / test['CabinFloor_N'].notnull().sum())
prob2 = (len(test[test['CabinFloor_N']==2]) / test['CabinFloor_N'].notnull().sum())
prob1 = (len(test[test['CabinFloor_N']==1]) / test['CabinFloor_N'].notnull().sum())
test['CabinFloor_N'] = test['CabinFloor_N'].apply(lambda x:
                                                    7 if x == 7 else(6 if x == 6 else(5 if x == 5 else(4 if x == 4 else(3 if x == 3 else(2 if x == 2 else(1 if x == 1 else(0 if x == 0
                                                    else(7 if random.random() < prob7
                                                        else(6 if random.random() < (prob7+prob6)
                                                             else(5 if random.random() < (prob7+prob6+prob5)
                                                                 else(4 if random.random() < (prob7+prob6+prob5+prob4)
                                                                     else(3 if random.random() < (prob7+prob6+prob5+prob4+prob3)
                                                                         else(2 if random.random() < (prob7+prob6+prob5+prob4+prob3+prob2)
                                                                         else(1 if random.random() < (prob7+prob6+prob5+prob4+prob3+prob2+prob1)
                                                                         else 0)))))))))))))))

# Dummies
dum_HomePlanet = pd.get_dummies(test['HomePlanet_N'],prefix='HomePlanet',prefix_sep='_')
dum_Destination = pd.get_dummies(test['Destination_N'],prefix='Destination',prefix_sep='_')
dum_CabinFloor = pd.get_dummies(test['CabinFloor_N'],prefix='CabinFloor',prefix_sep='_')
dum_CabinType = pd.get_dummies(test['CabinType_N'],prefix='CabinType',prefix_sep='_')
dummies_list = [dum_HomePlanet,dum_Destination,dum_CabinFloor,dum_CabinType]

for df in dummies_list:
    test = test.merge(df,
                        how="left",
                        left_index=True,
                        right_index=True)

test['Age'] = test['Age'].astype('float').fillna(test['Age'].median()).astype('float')
test['RoomService'] = test['RoomService'].astype('float').fillna(test['RoomService'].median()).astype('float')
test['FoodCourt'] = test['FoodCourt'].astype('float').fillna(test['FoodCourt'].median()).astype('float')
test['ShoppingMall'] = test['ShoppingMall'].astype('float').fillna(test['ShoppingMall'].median()).astype('float')
test['Spa'] = test['Spa'].astype('float').fillna(test['Spa'].median()).astype('float')
test['VRDeck'] = test['VRDeck'].astype('float').fillna(test['VRDeck'].median()).astype('float')
test['CabinNumber'] = test['CabinNumber'].astype('float').fillna(test['CabinNumber'].median()).astype('float')
test['spa*age'] = test['Spa']*test['Age']
# Bins
test['AgeBin'] = pd.cut(test['Age'], bins=5,labels=(1,2,3,4,5))
test['RoomServiceBin'] = pd.cut(test['RoomService'], bins=5,labels=(1,2,3,4,5))
test['FoodCourtBin'] = pd.cut(test['FoodCourt'], bins=5,labels=(1,2,3,4,5))
test['ShoppingMallBin'] = pd.cut(test['ShoppingMall'], bins=5,labels=(1,2,3,4,5))
test['SpaBin'] = pd.cut(test['Spa'], bins=5,labels=(1,2,3,4,5))
test['VRDeckBin'] = pd.cut(test['VRDeck'], bins=5,labels=(1,2,3,4,5))
test['CabinNumberBin'] = pd.cut(test['CabinNumber'], bins=5,labels=(1,2,3,4,5))
test['total_spending']=test['RoomService']+test['ShoppingMall']+test['RoomService'] 
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 46 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   PassengerId      4277 non-null   object  
 1   HomePlanet       4190 non-null   category
 2   CryoSleep        4184 non-null   string  
 3   Cabin            4177 non-null   string  
 4   Destination      4185 non-null   category
 5   Age              4277 non-null   float64 
 6   VIP              4184 non-null   string  
 7   RoomService      4277 non-null   float64 
 8   FoodCourt        4277 non-null   float64 
 9   ShoppingMall     4277 non-null   float64 
 10  Spa              4277 non-null   float64 
 11  VRDeck           4277 non-null   float64 
 12  CabinFloor       4177 non-null   category
 13  CabinNumber      4277 non-null   float64 
 14  CabinType        4177 non-null   category
 15  CryoSleep_N      4277 non-null   int64   
 16  VIP_N            4277 non-null   int64   


In [8]:
test_final = test[['PassengerId',
                   'HomePlanet_N',
                   'HomePlanet_0',
                   'HomePlanet_1',
                   'HomePlanet_2',
                   'CryoSleep_N',
                   'CabinFloor_N',
                   'CabinFloor_0',
                   'CabinFloor_1',
                   'CabinFloor_2',
                   'CabinFloor_3',
                   'CabinFloor_4',
                   'CabinFloor_5',
                   'CabinFloor_6',
                   'CabinFloor_7',
                   'CabinNumber',
                   'CabinNumberBin',
                   'CabinType_N',
                   'CabinType_0',
                   'CabinType_1',
                   'Destination_N',
                   'Destination_0',
                   'Destination_1',
                   'Destination_2',
                   'Age',
                   'AgeBin',
                   'VIP_N',
                   'RoomService',
                   'RoomServiceBin',
                   'FoodCourt',
                   'FoodCourtBin',
                   'ShoppingMall',
                   'ShoppingMallBin',
                   'Spa',
                   'SpaBin',
                   'VRDeck',
                   'VRDeckBin']]

In [9]:
x_var = ['HomePlanet_N',
         'HomePlanet_0',
         'HomePlanet_1',
         'HomePlanet_2',
         'CryoSleep_N',
         'CabinFloor_N',
         'CabinFloor_0',
         'CabinFloor_1',
         'CabinFloor_2',
         'CabinFloor_3',
         'CabinFloor_4',
         'CabinFloor_5',
         'CabinFloor_6',
         'CabinFloor_7',
         'CabinNumber',
         'CabinNumberBin',
         'CabinType_N',
         'CabinType_0',
         'CabinType_1',
         'Destination_N',
         'Destination_0',
         'Destination_1',
         'Destination_2',
         'Age',
         'AgeBin',
         'VIP_N',
         'RoomService',
         'RoomServiceBin',
         'FoodCourt',
         'FoodCourtBin',
         'ShoppingMall',
         'ShoppingMallBin',
         'Spa',
         'SpaBin',
         'VRDeck',
         'VRDeckBin']

In [10]:
# DEFINING X AND Y VARIABLES
x = train_final.loc[:, x_var]
y = train_final.loc[:, 'Transported_N']
col_names = x.columns

# SCALING DATA
scaler = RobustScaler()
scaler.fit(x)
x_scaled = scaler.transform(x)

# SPLITTING DATA - with stratification
x_train, x_test, y_train, y_test = train_test_split(
            x_scaled,
            y,
            test_size    = 0.25,
            random_state = 219,
            stratify     = y)

#x_train = x_scaled
#y_train = y

# BALANCING DATA
os = SMOTE(random_state=0)

x_smote,y_smote = os.fit_resample(x_train, y_train)
x_smote = pd.DataFrame(data = x_smote,
                       columns = x_var)
y_smote= pd.Series(data=y_smote)

# RANDOM FOREST
randomf = RandomForestClassifier()
randomf = randomf.fit(x_smote, y_smote)
y_predicted = randomf.predict(x_test)

# Model Performance Analysis
con_matrix = confusion_matrix(y_test, y_predicted)
class_report = classification_report(y_test,y_predicted)
AUC = roc_auc_score(y_test, y_predicted).round(4)
MSE = mean_squared_error(y_test, y_predicted).round(4)
Score = randomf.score(x_test,y_test).round(4)

# 6. Printing results
# Accuracy
print(f"Score: {Score}")
print(f"""{"-"*50}""")
# Confusion Matrix
print(f"""Confusion Matrix 
{con_matrix}""")
print(f"""{"-"*50}""")
# Clasification Matrix
print(f"""Classification Report
{class_report}""")
print(f"""{"-"*50}""")
# AUC
print(f"AUC: {AUC}")
print(f"""{"-"*50}""")
# MSE
print(f"MSE: {MSE}")

Score: 0.8004
--------------------------------------------------
Confusion Matrix 
[[910 169]
 [265 830]]
--------------------------------------------------
Classification Report
              precision    recall  f1-score   support

           0       0.77      0.84      0.81      1079
           1       0.83      0.76      0.79      1095

    accuracy                           0.80      2174
   macro avg       0.80      0.80      0.80      2174
weighted avg       0.80      0.80      0.80      2174

--------------------------------------------------
AUC: 0.8007
--------------------------------------------------
MSE: 0.1996


In [12]:
# RANDOM FOREST TUNING (Daoud, 2023 and Koehrsen, 2018)

# DEFINING X AND Y VARIABLES
xt = train_final.loc[:, x_var]
yt = train_final.loc[:, 'Transported_N']
col_names = xt.columns

# SCALING
scaler = RobustScaler()
scaler.fit(xt.values)
xt_scaled = scaler.transform(xt.values)

# Train-test split with stratification
xt_train, xt_test, yt_train, yt_test = train_test_split(
            xt.values,
            yt,
            test_size    = 0.25,
            random_state = 219,
            stratify     = yt)

# BALANCING
os = SMOTE(random_state=0)

xt_smote,yt_smote = os.fit_resample(xt_train, yt_train)
xt_smote = pd.DataFrame(data = xt_smote,
                       columns = x_var)
yt_smote= pd.Series(data=yt_smote)

# RANDOM FOREST TUNING USING RANDOMIZED SEARCH CV
# Tuning Parameters
estimator_space  = np.arange(140, 200, 100) # number of trees you want to build before taking the maximum voting or averages of predictions, higher the better
criterion_space  = ['gini', 'gini'] # measures quality of the split
    # gini = measure of variance; higher the more mis-classification
depth_space      = np.arange(2, 16, 1) # max number of levels in each decision tree
#leaf_space       = np.arange(1, 1, 1) # smaller leaf makes the model more prone to capturing noise in train data, >50, higher the better but needs balance
#bootstrap_space  = [True, False] # bootstraps the data for each tree, then grows a decision tree that can only use a random subset of features at each split.
warm_start_space = [True, True] # model learns with every test
#split_space      = np.arange(1, 525, 25)
#features_space   = np.arange(1, 14, 1)
#max_features: [0.1, 0.3, 0.6] # maximum number of features Random Forest is allowed to try in individual tree, higher the better but needs balance

# Creating parameter grids
param_grid = {'n_estimators'     : estimator_space,
              'criterion'        : criterion_space,
              'max_depth'        : depth_space,
              #'min_samples_leaf' : leaf_space,
              #'bootstrap'        : bootstrap_space,
              'warm_start'       : warm_start_space,
              #"max_features": max_features
             }

# Setting the default random forest model
rf_default = RandomForestClassifier(random_state = 219)

# Randomized search and choosing of parameters
rf_tuned_cv = RandomizedSearchCV(estimator           = rf_default,
                                 param_distributions = param_grid,
                                 cv           = 3,
                                 n_iter       = 10,
                                 random_state = 219,
                                 scoring      = make_scorer(roc_auc_score,
                                                            needs_threshold = False))

# Fitting tuned dataset to original dataset for cross validation
rf_tuned_cv.fit(xt.values, yt)

# MODELING WITH TUNED PARAMETERS
rf_tuned = rf_tuned_cv.best_estimator_

# Fit the model
rf_tuned = rf_tuned.fit(xt_smote.values, yt_smote)

# Predicting
y_predicted_tuned = rf_tuned.predict(xt_test)

# Model performance analysis
con_matrix_tuned = confusion_matrix(yt_test, y_predicted_tuned)
class_report_tuned = classification_report(yt_test,y_predicted_tuned)
AUC_tuned = roc_auc_score(yt_test, y_predicted_tuned).round(4)

# 6. Printing results
# Accuracy
print(f"Score: {(rf_tuned.score(xt_test, yt_test))}")
print(f"""{"-"*50}""")
# Confusion Matrix
print(f"""Confusion Matrix 
{con_matrix_tuned}""")
print(f"""{"-"*50}""")
# Clasification Matrix
print(f"""Classification Report
{class_report_tuned}""")
print(f"""{"-"*50}""")
# AUC
print(f"AUC: {AUC_tuned}")

Score: 0.8353265869365225
--------------------------------------------------
Confusion Matrix 
[[901 178]
 [180 915]]
--------------------------------------------------
Classification Report
              precision    recall  f1-score   support

           0       0.83      0.84      0.83      1079
           1       0.84      0.84      0.84      1095

    accuracy                           0.84      2174
   macro avg       0.84      0.84      0.84      2174
weighted avg       0.84      0.84      0.84      2174

--------------------------------------------------
AUC: 0.8353


In [13]:
x_test2 = test_final.loc[:,x_var]
y_predicted_test = rf_tuned.predict(x_test2)

predictions = pd.DataFrame(data = {'PassengerId':test_final['PassengerId'],
                                   'Transported':y_predicted_test.astype(bool)})

predictions.head(5)

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False


In [14]:
predictions.to_csv(path_or_buf = 'submission-rfc-807-2.csv',
                   index=False)