<a href="https://colab.research.google.com/github/bksat90kc/KaggleChallenge/blob/main/KaggleChallenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [88]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

warnings.filterwarnings('ignore')

In [89]:
# URL's for test and train data
train_url = 'https://raw.githubusercontent.com/bksat90kc/KaggleChallenge/main/train.csv'
test_url = 'https://raw.githubusercontent.com/bksat90kc/KaggleChallenge/main/test.csv'

**Processing Train Data**

In [90]:
# read the train data
train_df = pd.read_csv(train_url)
train_df.drop('Name', axis=1, inplace=True)

In [91]:
train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


In [92]:
# extract the cabin details
train_df[['Deck', 'CabinNum', 'Side']] = train_df['Cabin'].str.split('/', expand=True)

In [93]:
# replace NaN values in dataframe
values = {'HomePlanet': 'unknown', 'CryoSleep': 0, 'CabinNum': 0,
          'Destination': 'unknown', 'VIP': 'False', 'RoomService': 0,
          'FoodCourt': 0, 'ShoppingMall': 0, 'Spa':0, 'VRDeck': 0, 'Age': 0}
train_df.fillna(value=values, inplace=True)

In [94]:
# one hot encoding for HomePlanet, Destination, Deck, and Side
oh1 = OneHotEncoder()
oh1.fit(train_df[['HomePlanet', 'Destination', 'Deck', 'Side']])

In [95]:
# categories in HomePlanet
oh1.categories_

[array(['Earth', 'Europa', 'Mars', 'unknown'], dtype=object),
 array(['55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e', 'unknown'],
       dtype=object),
 array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', nan], dtype=object),
 array(['P', 'S', nan], dtype=object)]

In [96]:
# transform the one hot encoder object for train data
oh_train = oh1.transform(train_df[['HomePlanet', 'Destination',
                                   'Deck', 'Side']]).toarray()

In [97]:
# shape of the output
print('Shape of one-hot encoded HomePlanet data:', oh_train.shape)

Shape of one-hot encoded HomePlanet data: (8693, 20)


In [98]:
oh_train

array([[0., 1., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       ...,
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.]])

In [99]:
# new transformed data for the HomePlanet
oh_train_df = pd.DataFrame(oh_train, columns=['Earth', 'Europa', 'Mars', 'UnSrc',
                                           '55 Cancri e', 'PSO J318.5-22',
                                           'TRAPPIST-1e', 'UnDes', 'Deck_A',
                                           'Deck_B', 'Deck_C', 'Deck_D',
                                           'Deck_E', 'Deck_F', 'Deck_G',
                                           'Deck_T', 'Deck_Unk', 'Side_P',
                                           'Side_S', 'Side_Unk'])
oh_train_df.head()

Unnamed: 0,Earth,Europa,Mars,UnSrc,55 Cancri e,PSO J318.5-22,TRAPPIST-1e,UnDes,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_Unk,Side_P,Side_S,Side_Unk
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [100]:
# concatenating two dataframes
train_df = pd.concat([train_df, oh_train_df], axis=1, join='inner')

In [101]:
# dataframe information
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 36 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PassengerId    8693 non-null   object 
 1   HomePlanet     8693 non-null   object 
 2   CryoSleep      8693 non-null   object 
 3   Cabin          8494 non-null   object 
 4   Destination    8693 non-null   object 
 5   Age            8693 non-null   float64
 6   VIP            8693 non-null   object 
 7   RoomService    8693 non-null   float64
 8   FoodCourt      8693 non-null   float64
 9   ShoppingMall   8693 non-null   float64
 10  Spa            8693 non-null   float64
 11  VRDeck         8693 non-null   float64
 12  Transported    8693 non-null   bool   
 13  Deck           8494 non-null   object 
 14  CabinNum       8693 non-null   object 
 15  Side           8494 non-null   object 
 16  Earth          8693 non-null   float64
 17  Europa         8693 non-null   float64
 18  Mars    

In [102]:
# convert Transported column to numeric values : 0 for False and 1 for True
train_df.loc[train_df["Transported"] == False, "Transported"] = 0
train_df.loc[train_df["Transported"] == True, "Transported"] = 1

In [103]:
# convert VIP column to numeric values : 0 for False and 1 for True
train_df.loc[train_df["VIP"] == False, "VIP"] = 0
train_df.loc[train_df["VIP"] == 'False', "VIP"] = 0
train_df.loc[train_df["VIP"] == True, "VIP"] = 1

In [104]:
# convert CryoSleep column to numeric values : 0 for False and 1 for True
train_df.loc[train_df["CryoSleep"] == False, "CryoSleep"] = 0
train_df.loc[train_df["CryoSleep"] == 'False', "CryoSleep"] = 0
train_df.loc[train_df["CryoSleep"] == True, "CryoSleep"] = 1

In [105]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 36 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PassengerId    8693 non-null   object 
 1   HomePlanet     8693 non-null   object 
 2   CryoSleep      8693 non-null   object 
 3   Cabin          8494 non-null   object 
 4   Destination    8693 non-null   object 
 5   Age            8693 non-null   float64
 6   VIP            8693 non-null   object 
 7   RoomService    8693 non-null   float64
 8   FoodCourt      8693 non-null   float64
 9   ShoppingMall   8693 non-null   float64
 10  Spa            8693 non-null   float64
 11  VRDeck         8693 non-null   float64
 12  Transported    8693 non-null   object 
 13  Deck           8494 non-null   object 
 14  CabinNum       8693 non-null   object 
 15  Side           8494 non-null   object 
 16  Earth          8693 non-null   float64
 17  Europa         8693 non-null   float64
 18  Mars    

In [106]:
# col = 'CryoSleep'
# train_df[col] = train_df[col].astype('int64')

In [107]:
# convert the data type to integer
columns = ['CryoSleep', 'Transported', 'VIP', 'Earth', 'Europa', 'Mars',
           'UnSrc', '55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e', 'UnDes',
           'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G',
           'Deck_T', 'Deck_Unk', 'Side_P', 'Side_S', 'Side_Unk', 'CabinNum']

for col in columns:
  train_df[col] = train_df[col].astype('int64')

In [108]:
# Change the index to PassengerId
train_df.index = train_df['PassengerId']

In [110]:
train_df.drop(['HomePlanet', 'Destination', 'Cabin', 'PassengerId', 'Deck', 'Side'], axis=1, inplace=True)

In [111]:
train_df.head()

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,CabinNum,...,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_Unk,Side_P,Side_S,Side_Unk
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,1,0,0
0002_01,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,0,...,0,0,0,1,0,0,0,0,1,0
0003_01,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,0,...,0,0,0,0,0,0,0,0,1,0
0003_02,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,0,...,0,0,0,0,0,0,0,0,1,0
0004_01,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,1,...,0,0,0,1,0,0,0,0,1,0


In [113]:
# extract required train data
X_train = train_df.drop('Transported', axis=1)
y_train = train_df[['Transported']]

In [114]:
X_train.head()

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CabinNum,Earth,...,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_Unk,Side_P,Side_S,Side_Unk
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,1,0,0
0002_01,0,24.0,0,109.0,9.0,25.0,549.0,44.0,0,1,...,0,0,0,1,0,0,0,0,1,0
0003_01,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,0,...,0,0,0,0,0,0,0,0,1,0
0003_02,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,0,...,0,0,0,0,0,0,0,0,1,0
0004_01,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,1,...,0,0,0,1,0,0,0,0,1,0


In [115]:
columns = train_df.columns
for col in columns:
  check_nan = train_df[col].isnull().values.any()
  print(col, check_nan)

CryoSleep False
Age False
VIP False
RoomService False
FoodCourt False
ShoppingMall False
Spa False
VRDeck False
Transported False
CabinNum False
Earth False
Europa False
Mars False
UnSrc False
55 Cancri e False
PSO J318.5-22 False
TRAPPIST-1e False
UnDes False
Deck_A False
Deck_B False
Deck_C False
Deck_D False
Deck_E False
Deck_F False
Deck_G False
Deck_T False
Deck_Unk False
Side_P False
Side_S False
Side_Unk False


**Processing Test Data**

In [116]:
# read the test data
test_df = pd.read_csv(test_url)
test_df.drop('Name', axis=1, inplace=True)

In [117]:
# extract the cabin details
test_df[['Deck', 'CabinNum', 'Side']] = test_df['Cabin'].str.split('/', expand=True)

In [118]:
# replace NaN values in dataframe
values = {'HomePlanet': 'unknown', 'CryoSleep': 0, 'CabinNum': 0,
          'Destination': 'unknown', 'VIP': 'False', 'RoomService': 0,
          'FoodCourt': 0, 'ShoppingMall': 0, 'Spa':0, 'VRDeck': 0, 'Age': 0}
test_df.fillna(value=values, inplace=True)

In [119]:
# one hot encoding for test data
oh2 = OneHotEncoder()
oh2.fit(test_df[['HomePlanet', 'Destination', 'Deck', 'Side']])

In [120]:
# categories in HomePlanet
oh2.categories_

[array(['Earth', 'Europa', 'Mars', 'unknown'], dtype=object),
 array(['55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e', 'unknown'],
       dtype=object),
 array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', nan], dtype=object),
 array(['P', 'S', nan], dtype=object)]

In [121]:
# transform the one hot encoder object for train data
oh_test = oh2.transform(test_df[['HomePlanet', 'Destination',
                                   'Deck', 'Side']]).toarray()

In [122]:
# shape of the output
print('Shape of one-hot encoded HomePlanet data:', oh_test.shape)

Shape of one-hot encoded HomePlanet data: (4277, 20)


In [123]:
oh_test

array([[1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 1., ..., 1., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 1., 0.]])

In [124]:
# new transformed test data
oh_test_df = pd.DataFrame(oh_test, columns=['Earth', 'Europa', 'Mars', 'UnSrc',
                                           '55 Cancri e', 'PSO J318.5-22',
                                           'TRAPPIST-1e', 'UnDes', 'Deck_A',
                                           'Deck_B', 'Deck_C', 'Deck_D',
                                           'Deck_E', 'Deck_F', 'Deck_G',
                                           'Deck_T', 'Deck_Unk', 'Side_P',
                                           'Side_S', 'Side_Unk'])
oh_test_df.head()

Unnamed: 0,Earth,Europa,Mars,UnSrc,55 Cancri e,PSO J318.5-22,TRAPPIST-1e,UnDes,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_Unk,Side_P,Side_S,Side_Unk
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [125]:
# concatenating two dataframes
test_df = pd.concat([test_df, oh_train_df], axis=1, join='inner')

In [132]:
# convert VIP column to numeric values : 0 for False and 1 for True
test_df.loc[test_df["VIP"] == False, "VIP"] = 0
test_df.loc[test_df["VIP"] == 'False', "VIP"] = 0
test_df.loc[test_df["VIP"] == True, "VIP"] = 1


# convert CryoSleep column to numeric values : 0 for False and 1 for True
test_df.loc[test_df["CryoSleep"] == False, "CryoSleep"] = 0
test_df.loc[test_df["CryoSleep"] == 'False', "CryoSleep"] = 0
test_df.loc[test_df["CryoSleep"] == True, "CryoSleep"] = 1

In [135]:
# convert the data type to integer
columns = ['CryoSleep', 'VIP', 'Earth', 'Europa', 'Mars',
           'UnSrc', '55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e', 'UnDes',
           'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G',
           'Deck_T', 'Deck_Unk', 'Side_P', 'Side_S', 'Side_Unk', 'CabinNum']

for col in columns:
  test_df[col] = test_df[col].astype('int64')


In [136]:
# Change the index to PassengerId
test_df.index = test_df['PassengerId']

KeyError: 'PassengerId'

In [130]:
test_df.drop(['HomePlanet', 'Destination', 'Cabin', 'PassengerId',
              'Deck', 'Side'], axis=1, inplace=True)

In [149]:
test_df.head()

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CabinNum,Earth,...,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_Unk,Side_P,Side_S,Side_Unk
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0013_01,1,27.0,0,0.0,0.0,0.0,0.0,0.0,3,0,...,0,0,0,0,0,0,0,1,0,0
0018_01,0,19.0,0,0.0,9.0,0.0,2823.0,0.0,4,1,...,0,0,0,1,0,0,0,0,1,0
0019_01,1,31.0,0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,1,0
0021_01,0,38.0,0,0.0,6652.0,0.0,181.0,585.0,1,0,...,0,0,0,0,0,0,0,0,1,0
0023_01,0,20.0,0,10.0,0.0,635.0,0.0,0.0,5,1,...,0,0,0,1,0,0,0,0,1,0


In [138]:
columns = test_df.columns
for col in columns:
  check_nan = test_df[col].isnull().values.any()
  print(col, check_nan)

CryoSleep False
Age False
VIP False
RoomService False
FoodCourt False
ShoppingMall False
Spa False
VRDeck False
CabinNum False
Earth False
Europa False
Mars False
UnSrc False
55 Cancri e False
PSO J318.5-22 False
TRAPPIST-1e False
UnDes False
Deck_A False
Deck_B False
Deck_C False
Deck_D False
Deck_E False
Deck_F False
Deck_G False
Deck_T False
Deck_Unk False
Side_P False
Side_S False
Side_Unk False


**Following Classification are conducted:**


1.   Logistic Regression
2.   Decision Tree Classifier
3.   Random Forrest



In [139]:
# logistic regression
lrcl = LogisticRegression()
lrcl.fit(X_train, y_train)

In [140]:
# random forrest classifier
rfcl = RandomForestClassifier()
rfcl.fit(X_train, y_train)

In [141]:
# decision tree classifier
dtcl = DecisionTreeClassifier()
dtcl.fit(X_train, y_train)

In [144]:
# finding predictions for each algorithm
lr_y_pred = lrcl.predict(test_df)
rf_y_pred = rfcl.predict(test_df)
dt_y_pred = dtcl.predict(test_df)

In [145]:
# reshaping the predictions
lr_y_pred = lr_y_pred.reshape(-1, 1)
rf_y_pred = rf_y_pred.reshape(-1, 1)
dt_y_pred = dt_y_pred.reshape(-1, 1)

In [146]:
# creating the dataframes for the predictions
lr_pred_df = pd.DataFrame(lr_y_pred, columns=['Transported'])
rf_pred_df = pd.DataFrame(rf_y_pred, columns=['Transported'])
dt_pred_df = pd.DataFrame(dt_y_pred, columns=['Transported'])

In [150]:
test_df.reset_index(inplace=True)

In [153]:
# creating the outcomes
lr_outcome_df = pd.concat([test_df[['PassengerId']], lr_pred_df], axis=1, join='inner')
rf_outcome_df = pd.concat([test_df[['PassengerId']], rf_pred_df], axis=1, join='inner')
dt_outcome_df = pd.concat([test_df[['PassengerId']], dt_pred_df], axis=1, join='inner')

In [155]:
# convert Transported column to numeric values : 0 for False and 1 for True
lr_outcome_df.loc[lr_outcome_df["Transported"] == 0, "Transported"] = False
lr_outcome_df.loc[lr_outcome_df["Transported"] == 1, "Transported"] = True

rf_outcome_df.loc[rf_outcome_df["Transported"] == 0, "Transported"] = False
rf_outcome_df.loc[rf_outcome_df["Transported"] == 1, "Transported"] = True

dt_outcome_df.loc[dt_outcome_df["Transported"] == 0, "Transported"] = False
dt_outcome_df.loc[dt_outcome_df["Transported"] == 1, "Transported"] = True

In [159]:
lr_outcome_df.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [160]:
# save the outcomes into the files
lr_outcome_df.to_csv('lr_submission.csv', index=False)
rf_outcome_df.to_csv('rf_submission.csv', index=False)
dt_outcome_df.to_csv('dt_submission.csv', index=False)