In [1]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

train_data_raw = pd.read_csv("train.csv") 
test_data_raw = pd.read_csv("test.csv") 

#### Steps
- Have Test and Train Data
- Train Test Split
- Impute
- Test Models

### Test and Train Data

In [2]:
train_data_raw

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [3]:
train_data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


##### Deleting Name Column

In [4]:
train_data = train_data_raw.drop(["Name", "PassengerId"], axis='columns')
test_data = test_data_raw.drop(["Name", 'PassengerId'], axis='columns')

In [5]:
train_data

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False
8689,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False
8690,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True
8691,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False


##### Getting Rid of HomePlanet Nulls

In [6]:
train_data['HomePlanet'].mode()

0    Earth
Name: HomePlanet, dtype: object

In [7]:
train_data["HomePlanet"].fillna("Earth", inplace = True)
test_data["HomePlanet"].fillna("Earth", inplace = True)

In [8]:
train_data['HomePlanet'].isna().sum()

0

##### Getting Rid of CryoSleep Nulls and Changing to numbers

In [9]:
train_data['CryoSleep'].mode()

0    False
Name: CryoSleep, dtype: object

In [10]:
train_data['CryoSleep']

0       False
1       False
2       False
3       False
4       False
        ...  
8688    False
8689     True
8690    False
8691    False
8692    False
Name: CryoSleep, Length: 8693, dtype: object

In [11]:
train_data["CryoSleep"].fillna(False, inplace = True)
test_data["CryoSleep"].fillna(False, inplace = True)

In [12]:
train_data["CryoSleep"] = train_data["CryoSleep"].astype(int)

In [13]:
test_data["CryoSleep"] = test_data["CryoSleep"].astype(int)

In [14]:
train_data['CryoSleep'].isna().sum()

0

##### Checking on Nulls Again

In [15]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8693 non-null   object 
 1   CryoSleep     8693 non-null   int64  
 2   Cabin         8494 non-null   object 
 3   Destination   8511 non-null   object 
 4   Age           8514 non-null   float64
 5   VIP           8490 non-null   object 
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
 11  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), int64(1), object(4)
memory usage: 755.7+ KB


##### Splitting up Cabin

In [16]:
td = train_data.Cabin.str.split('/').tolist()
td2 = test_data.Cabin.str.split('/').tolist()

In [17]:
type(td[0][0])

str

In [18]:
for cabin in td:
    print(cabin)

['B', '0', 'P']
['F', '0', 'S']
['A', '0', 'S']
['A', '0', 'S']
['F', '1', 'S']
['F', '0', 'P']
['F', '2', 'S']
['G', '0', 'S']
['F', '3', 'S']
['B', '1', 'P']
['B', '1', 'P']
['B', '1', 'P']
['F', '1', 'P']
['G', '1', 'S']
['F', '2', 'P']
nan
['F', '3', 'P']
['F', '4', 'P']
['F', '5', 'P']
['G', '0', 'P']
['F', '6', 'P']
['E', '0', 'S']
['E', '0', 'S']
['E', '0', 'S']
['E', '0', 'S']
['E', '0', 'S']
['E', '0', 'S']
['D', '0', 'P']
['C', '2', 'S']
['F', '6', 'S']
['C', '0', 'P']
['F', '8', 'P']
['G', '4', 'S']
['F', '9', 'P']
['F', '9', 'P']
['F', '9', 'P']
['D', '1', 'S']
['D', '1', 'P']
['F', '8', 'S']
['F', '10', 'S']
['G', '1', 'P']
['G', '2', 'P']
['B', '3', 'P']
['G', '3', 'P']
['G', '3', 'P']
['G', '3', 'P']
['F', '10', 'P']
['F', '10', 'P']
['E', '1', 'S']
['E', '2', 'S']
['G', '6', 'S']
['F', '11', 'S']
['A', '1', 'S']
['A', '1', 'S']
['A', '1', 'S']
['G', '7', 'S']
['F', '12', 'S']
['F', '13', 'S']
['F', '14', 'S']
['E', '3', 'S']
['G', '6', 'P']
['G', '10', 'S']
['G', '10', 

['F', '379', 'S']
['F', '380', 'S']
['E', '125', 'P']
['G', '314', 'S']
['F', '381', 'S']
['G', '315', 'S']
['F', '383', 'S']
['C', '76', 'S']
['F', '396', 'P']
['G', '323', 'P']
['F', '386', 'S']
['G', '317', 'S']
['G', '317', 'S']
['A', '25', 'S']
['B', '59', 'P']
['B', '59', 'P']
['B', '59', 'P']
['G', '324', 'P']
['G', '318', 'S']
['B', '60', 'P']
['F', '398', 'P']
['F', '387', 'S']
['B', '83', 'S']
['B', '83', 'S']
['F', '390', 'S']
['G', '322', 'S']
['E', '150', 'S']
['G', '323', 'S']
['G', '323', 'S']
['G', '323', 'S']
['F', '399', 'P']
['F', '391', 'S']
['F', '400', 'P']
['F', '402', 'P']
['F', '393', 'S']
['F', '404', 'P']
['B', '85', 'S']
['B', '85', 'S']
['G', '327', 'P']
['F', '394', 'S']
['F', '395', 'S']
['F', '396', 'S']
['G', '324', 'S']
['G', '328', 'P']
['E', '126', 'P']
['E', '151', 'S']
['C', '68', 'P']
['C', '69', 'P']
['G', '330', 'P']
['G', '330', 'P']
['G', '330', 'P']
['G', '330', 'P']
['G', '330', 'P']
['G', '330', 'P']
['G', '331', 'P']
['F', '407', 'P']
['F'

['F', '852', 'P']
['C', '149', 'S']
['C', '149', 'S']
['G', '682', 'P']
['F', '780', 'S']
['G', '683', 'P']
['F', '853', 'P']
['F', '781', 'S']
['G', '684', 'P']
['E', '273', 'S']
['F', '782', 'S']
['F', '783', 'S']
['G', '683', 'S']
['G', '685', 'P']
['B', '132', 'P']
['B', '132', 'P']
nan
['F', '855', 'P']
['E', '260', 'P']
['D', '140', 'P']
['D', '140', 'P']
['B', '133', 'P']
['G', '686', 'P']
['F', '856', 'P']
['F', '856', 'P']
['F', '856', 'P']
['F', '856', 'P']
['F', '856', 'P']
['F', '856', 'P']
['D', '141', 'P']
['E', '261', 'P']
['F', '857', 'P']
['F', '858', 'P']
['F', '859', 'P']
['F', '859', 'P']
['F', '859', 'P']
['F', '859', 'P']
['D', '142', 'P']
['F', '859', 'P']
['F', '859', 'P']
['B', '161', 'S']
['B', '161', 'S']
['B', '161', 'S']
['G', '687', 'P']
['A', '50', 'S']
['A', '50', 'S']
['C', '133', 'P']
['C', '133', 'P']
nan
['C', '133', 'P']
['C', '133', 'P']
['C', '133', 'P']
['A', '51', 'S']
['A', '51', 'S']
['A', '51', 'S']
['F', '784', 'S']
['F', '860', 'P']
['G', '

['F', '1280', 'P']
['B', '239', 'S']
['B', '239', 'S']
['F', '1282', 'P']
['F', '1188', 'S']
['E', '413', 'S']
['F', '1283', 'P']
['F', '1284', 'P']
['F', '1190', 'S']
['E', '398', 'P']
['F', '1191', 'S']
['G', '1006', 'S']
['G', '1007', 'P']
['F', '1192', 'S']
['G', '1007', 'S']
['F', '1286', 'P']
['E', '414', 'S']
['B', '241', 'S']
['F', '1287', 'P']
['F', '1194', 'S']
['F', '1195', 'S']
['F', '1196', 'S']
['F', '1197', 'S']
['F', '1198', 'S']
['G', '1010', 'S']
['G', '1011', 'S']
['F', '1199', 'S']
['G', '1012', 'S']
['G', '1008', 'P']
['G', '1013', 'S']
['F', '1200', 'S']
['F', '1201', 'S']
['F', '1289', 'P']
['F', '1289', 'P']
['B', '243', 'S']
['B', '243', 'S']
['G', '1009', 'P']
['F', '1293', 'P']
['D', '195', 'P']
['B', '213', 'P']
['B', '213', 'P']
['F', '1294', 'P']
nan
['F', '1296', 'P']
['F', '1202', 'S']
['F', '1297', 'P']
['F', '1297', 'P']
['A', '62', 'P']
['D', '196', 'P']
['D', '196', 'P']
['F', '1203', 'S']
['G', '1011', 'P']
['G', '1014', 'S']
['A', '74', 'S']
['G', 

['G', '1370', 'P']
['F', '1610', 'S']
['F', '1725', 'P']
['B', '276', 'P']
['D', '260', 'P']
['G', '1371', 'P']
['B', '329', 'S']
['B', '329', 'S']
['C', '315', 'S']
['C', '316', 'S']
['G', '1346', 'S']
['G', '1346', 'S']
['G', '1372', 'P']
['D', '250', 'S']
['F', '1613', 'S']
['G', '1373', 'P']
['F', '1615', 'S']
['F', '1726', 'P']
['G', '1374', 'P']
['E', '553', 'S']
['B', '330', 'S']
['B', '330', 'S']
['F', '1618', 'S']
['G', '1349', 'S']
['E', '554', 'S']
['F', '1619', 'S']
['F', '1620', 'S']
['G', '1376', 'P']
['G', '1351', 'S']
['F', '1621', 'S']
['B', '277', 'P']
['B', '277', 'P']
['G', '1377', 'P']
['F', '1729', 'P']
nan
['F', '1624', 'S']
['G', '1352', 'S']
['E', '540', 'P']
['F', '1733', 'P']
['F', '1734', 'P']
['B', '278', 'P']
['C', '317', 'S']
['C', '317', 'S']
['F', '1625', 'S']
['F', '1625', 'S']
['C', '279', 'P']
['C', '279', 'P']
['C', '279', 'P']
['C', '279', 'P']
['G', '1354', 'S']
['C', '280', 'P']
['F', '1627', 'S']
['C', '319', 'S']
['C', '319', 'S']
['C', '319', 

In [19]:
#train_data['Deck'] = [person[0] for person in td]
# train_data['Num'] = [person[1] for person in td]
# train_data['Side'] = [person[2] for person in td]
# test_data['Deck'] = [person[0] for person in td2]
# test_data['Num'] = [person[1] for person in td2]
# train_data['Side'] = [person[2] for person in td2]
train_data

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,0,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,Earth,0,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,Europa,0,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,Europa,0,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,Earth,0,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,0,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False
8689,Earth,1,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False
8690,Earth,0,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True
8691,Europa,0,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False


##### Cabin: I need to get info about each section of the Cabin before I can replace the nulls and split the column

In [20]:
cabindf = train_data['Cabin']
cabindf

0          B/0/P
1          F/0/S
2          A/0/S
3          A/0/S
4          F/1/S
          ...   
8688      A/98/P
8689    G/1499/S
8690    G/1500/S
8691     E/608/S
8692     E/608/S
Name: Cabin, Length: 8693, dtype: object

In [21]:
cabindftest = test_data['Cabin']
cabindftest

0          G/3/S
1          F/4/S
2          C/0/S
3          C/1/S
4          F/5/S
          ...   
4272    G/1496/S
4273         NaN
4274     D/296/P
4275     D/297/P
4276    G/1498/S
Name: Cabin, Length: 4277, dtype: object

In [22]:
cabindf.info()

<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: Cabin
Non-Null Count  Dtype 
--------------  ----- 
8494 non-null   object
dtypes: object(1)
memory usage: 68.0+ KB


In [23]:
cabindftest.info()

<class 'pandas.core.series.Series'>
RangeIndex: 4277 entries, 0 to 4276
Series name: Cabin
Non-Null Count  Dtype 
--------------  ----- 
4177 non-null   object
dtypes: object(1)
memory usage: 33.5+ KB


In [24]:
cabindf.dropna(inplace = True)

In [25]:
cabindftest.dropna(inplace = True)

In [26]:
cabindf.info()

<class 'pandas.core.series.Series'>
Int64Index: 8494 entries, 0 to 8692
Series name: Cabin
Non-Null Count  Dtype 
--------------  ----- 
8494 non-null   object
dtypes: object(1)
memory usage: 132.7+ KB


In [27]:
cabindftest.info()

<class 'pandas.core.series.Series'>
Int64Index: 4177 entries, 0 to 4276
Series name: Cabin
Non-Null Count  Dtype 
--------------  ----- 
4177 non-null   object
dtypes: object(1)
memory usage: 65.3+ KB


In [28]:
cabtd = cabindf.str.split('/').tolist()
cabtd

[['B', '0', 'P'],
 ['F', '0', 'S'],
 ['A', '0', 'S'],
 ['A', '0', 'S'],
 ['F', '1', 'S'],
 ['F', '0', 'P'],
 ['F', '2', 'S'],
 ['G', '0', 'S'],
 ['F', '3', 'S'],
 ['B', '1', 'P'],
 ['B', '1', 'P'],
 ['B', '1', 'P'],
 ['F', '1', 'P'],
 ['G', '1', 'S'],
 ['F', '2', 'P'],
 ['F', '3', 'P'],
 ['F', '4', 'P'],
 ['F', '5', 'P'],
 ['G', '0', 'P'],
 ['F', '6', 'P'],
 ['E', '0', 'S'],
 ['E', '0', 'S'],
 ['E', '0', 'S'],
 ['E', '0', 'S'],
 ['E', '0', 'S'],
 ['E', '0', 'S'],
 ['D', '0', 'P'],
 ['C', '2', 'S'],
 ['F', '6', 'S'],
 ['C', '0', 'P'],
 ['F', '8', 'P'],
 ['G', '4', 'S'],
 ['F', '9', 'P'],
 ['F', '9', 'P'],
 ['F', '9', 'P'],
 ['D', '1', 'S'],
 ['D', '1', 'P'],
 ['F', '8', 'S'],
 ['F', '10', 'S'],
 ['G', '1', 'P'],
 ['G', '2', 'P'],
 ['B', '3', 'P'],
 ['G', '3', 'P'],
 ['G', '3', 'P'],
 ['G', '3', 'P'],
 ['F', '10', 'P'],
 ['F', '10', 'P'],
 ['E', '1', 'S'],
 ['E', '2', 'S'],
 ['G', '6', 'S'],
 ['F', '11', 'S'],
 ['A', '1', 'S'],
 ['A', '1', 'S'],
 ['A', '1', 'S'],
 ['G', '7', 'S'],
 ['F',

In [29]:
cabtest = cabindftest.str.split('/').tolist()
cabtest

[['G', '3', 'S'],
 ['F', '4', 'S'],
 ['C', '0', 'S'],
 ['C', '1', 'S'],
 ['F', '5', 'S'],
 ['F', '7', 'P'],
 ['B', '2', 'P'],
 ['D', '0', 'S'],
 ['D', '0', 'S'],
 ['F', '7', 'S'],
 ['F', '9', 'S'],
 ['D', '2', 'P'],
 ['D', '2', 'P'],
 ['E', '0', 'P'],
 ['G', '4', 'P'],
 ['F', '11', 'P'],
 ['F', '11', 'P'],
 ['B', '0', 'S'],
 ['B', '0', 'S'],
 ['G', '5', 'S'],
 ['E', '1', 'P'],
 ['B', '4', 'P'],
 ['B', '4', 'P'],
 ['B', '4', 'P'],
 ['G', '5', 'P'],
 ['C', '1', 'P'],
 ['E', '2', 'P'],
 ['F', '12', 'P'],
 ['G', '8', 'S'],
 ['G', '9', 'S'],
 ['G', '11', 'S'],
 ['G', '12', 'S'],
 ['G', '7', 'P'],
 ['G', '8', 'P'],
 ['E', '3', 'P'],
 ['F', '19', 'S'],
 ['G', '10', 'P'],
 ['G', '17', 'S'],
 ['F', '21', 'S'],
 ['F', '18', 'P'],
 ['F', '22', 'S'],
 ['G', '13', 'P'],
 ['G', '13', 'P'],
 ['F', '25', 'S'],
 ['F', '26', 'S'],
 ['F', '22', 'P'],
 ['E', '4', 'P'],
 ['G', '14', 'P'],
 ['F', '26', 'P'],
 ['G', '15', 'P'],
 ['B', '6', 'P'],
 ['B', '6', 'P'],
 ['G', '16', 'P'],
 ['F', '28', 'S'],
 ['D', 

In [30]:
cabindf = pd.DataFrame(cabindf)

In [31]:
cabindftest = pd.DataFrame(cabtest)

In [32]:
cabindf['Deck'] = [cabin[0] for cabin in cabtd]
cabindf['Num'] = [cabin[1] for cabin in cabtd]
cabindf['Side'] = [cabin[2] for cabin in cabtd]

In [33]:
cabindftest['Deck'] = [cabin[0] for cabin in cabtest]
cabindftest['Num'] = [cabin[1] for cabin in cabtest]
cabindftest['Side'] = [cabin[2] for cabin in cabtest]

In [34]:
cabindf = pd.DataFrame(cabindf)
cabindf

Unnamed: 0,Cabin,Deck,Num,Side
0,B/0/P,B,0,P
1,F/0/S,F,0,S
2,A/0/S,A,0,S
3,A/0/S,A,0,S
4,F/1/S,F,1,S
...,...,...,...,...
8688,A/98/P,A,98,P
8689,G/1499/S,G,1499,S
8690,G/1500/S,G,1500,S
8691,E/608/S,E,608,S


In [35]:
cabindftest = pd.DataFrame(cabindftest)
cabindftest

Unnamed: 0,0,1,2,Deck,Num,Side
0,G,3,S,G,3,S
1,F,4,S,F,4,S
2,C,0,S,C,0,S
3,C,1,S,C,1,S
4,F,5,S,F,5,S
...,...,...,...,...,...,...
4172,F,1796,S,F,1796,S
4173,G,1496,S,G,1496,S
4174,D,296,P,D,296,P
4175,D,297,P,D,297,P


In [36]:
cabindf['Deck'].mode()

0    F
Name: Deck, dtype: object

In [37]:
cabindf['Num'].mode()

0    82
Name: Num, dtype: object

In [38]:
cabindf['Side'].mode()

0    S
Name: Side, dtype: object

In [39]:
cabindftest['Deck'].mode()

0    F
Name: Deck, dtype: object

In [40]:
cabindftest['Num'].mode()

0    4
Name: Num, dtype: object

In [41]:
cabindftest['Side'].mode()

0    S
Name: Side, dtype: object

##### Now I know that the mode of the train data cabin is "F/82/S" and the mode of the test is "F/4/S" so I can replace the nulls with this in the real Dataframe

In [42]:
train_data['Cabin'] = train_data['Cabin'].fillna('F/82/S')

In [43]:
train_data['Cabin'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: Cabin
Non-Null Count  Dtype 
--------------  ----- 
8693 non-null   object
dtypes: object(1)
memory usage: 68.0+ KB


In [44]:
test_data['Cabin'] = test_data['Cabin'].fillna('F/4/S')

In [45]:
test_data['Cabin'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 4277 entries, 0 to 4276
Series name: Cabin
Non-Null Count  Dtype 
--------------  ----- 
4277 non-null   object
dtypes: object(1)
memory usage: 33.5+ KB


In [46]:
train_data

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,0,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,Earth,0,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,Europa,0,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,Europa,0,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,Earth,0,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,0,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False
8689,Earth,1,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False
8690,Earth,0,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True
8691,Europa,0,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False


In [47]:
td = train_data.Cabin.str.split('/').tolist()
td2 = test_data.Cabin.str.split('/').tolist()

In [48]:
train_data['Deck'] = [cabin[0] for cabin in td]
train_data['Num'] = [cabin[1] for cabin in td]
train_data['Side'] = [cabin[2] for cabin in td]
test_data['Deck'] = [cabin[0] for cabin in td2]
test_data['Num'] = [cabin[1] for cabin in td2]
test_data['Side'] = [cabin[2] for cabin in td2]

In [49]:
train_data.drop(columns=['Cabin', 'Num'], inplace=True)

In [50]:
train_data

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Side
0,Europa,0,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,P
1,Earth,0,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,S
2,Europa,0,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,S
3,Europa,0,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,S
4,Earth,0,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,0,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False,A,P
8689,Earth,1,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False,G,S
8690,Earth,0,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True,G,S
8691,Europa,0,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False,E,S


In [51]:
test_data.drop(columns=['Cabin', 'Num'], inplace=True)

In [52]:
test_data

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Side
0,Earth,1,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,G,S
1,Earth,0,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,F,S
2,Europa,1,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,C,S
3,Europa,0,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,C,S
4,Earth,0,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,F,S
...,...,...,...,...,...,...,...,...,...,...,...,...
4272,Earth,1,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,G,S
4273,Earth,0,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,F,S
4274,Mars,1,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,D,P
4275,Europa,0,,,False,0.0,2680.0,0.0,0.0,523.0,D,P


##### I have finished with the Cabin Information

In [53]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8693 non-null   object 
 1   CryoSleep     8693 non-null   int64  
 2   Destination   8511 non-null   object 
 3   Age           8514 non-null   float64
 4   VIP           8490 non-null   object 
 5   RoomService   8512 non-null   float64
 6   FoodCourt     8510 non-null   float64
 7   ShoppingMall  8485 non-null   float64
 8   Spa           8510 non-null   float64
 9   VRDeck        8505 non-null   float64
 10  Transported   8693 non-null   bool   
 11  Deck          8693 non-null   object 
 12  Side          8693 non-null   object 
dtypes: bool(1), float64(6), int64(1), object(5)
memory usage: 823.6+ KB


##### I'm going to work on the Destination column now

In [54]:
train_data['Destination'].unique()

array(['TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e', nan], dtype=object)

In [55]:
train_data['HomePlanet'].unique()

array(['Europa', 'Earth', 'Mars'], dtype=object)

In [56]:
train_data['Destination'].loc[train_data['HomePlanet'] == 'Europa'].value_counts()

TRAPPIST-1e      1189
55 Cancri e       886
PSO J318.5-22      19
Name: Destination, dtype: int64

In [57]:
train_data['Destination'].loc[train_data['HomePlanet'] == 'Earth'].value_counts()

TRAPPIST-1e      3251
PSO J318.5-22     728
55 Cancri e       721
Name: Destination, dtype: int64

In [58]:
train_data['Destination'].loc[train_data['HomePlanet'] == 'Mars'].value_counts()

TRAPPIST-1e      1475
55 Cancri e       193
PSO J318.5-22      49
Name: Destination, dtype: int64

##### I was going to fill Destination Nulls based on HomePlanet but TRAPPIST-1e seems to be the most common every time so I will just fill the Nulls with that

In [59]:
train_data["Destination"].fillna("TRAPPIST-1e", inplace = True)
test_data["Destination"].fillna("TRAPPIST-1e", inplace = True)

In [60]:
train_data['Destination'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: Destination
Non-Null Count  Dtype 
--------------  ----- 
8693 non-null   object
dtypes: object(1)
memory usage: 68.0+ KB


In [61]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8693 non-null   object 
 1   CryoSleep     8693 non-null   int64  
 2   Destination   8693 non-null   object 
 3   Age           8514 non-null   float64
 4   VIP           8490 non-null   object 
 5   RoomService   8512 non-null   float64
 6   FoodCourt     8510 non-null   float64
 7   ShoppingMall  8485 non-null   float64
 8   Spa           8510 non-null   float64
 9   VRDeck        8505 non-null   float64
 10  Transported   8693 non-null   bool   
 11  Deck          8693 non-null   object 
 12  Side          8693 non-null   object 
dtypes: bool(1), float64(6), int64(1), object(5)
memory usage: 823.6+ KB


##### VIP is the last object dtype column that I individually need to deal with

In [62]:
train_data['VIP'].mode()

0    False
Name: VIP, dtype: object

In [63]:
test_data['VIP'].mode()

0    False
Name: VIP, dtype: object

In [64]:
train_data["VIP"].fillna(False, inplace = True)
test_data["VIP"].fillna(False, inplace = True)

In [65]:
train_data["VIP"] = train_data["VIP"].astype(int)
test_data["VIP"] = test_data["VIP"].astype(int)

In [66]:
train_data['VIP'].isna().sum()

0

##### Test Train Split

In [67]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8693 non-null   object 
 1   CryoSleep     8693 non-null   int64  
 2   Destination   8693 non-null   object 
 3   Age           8514 non-null   float64
 4   VIP           8693 non-null   int64  
 5   RoomService   8512 non-null   float64
 6   FoodCourt     8510 non-null   float64
 7   ShoppingMall  8485 non-null   float64
 8   Spa           8510 non-null   float64
 9   VRDeck        8505 non-null   float64
 10  Transported   8693 non-null   bool   
 11  Deck          8693 non-null   object 
 12  Side          8693 non-null   object 
dtypes: bool(1), float64(6), int64(2), object(4)
memory usage: 823.6+ KB


In [68]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    4277 non-null   object 
 1   CryoSleep     4277 non-null   int64  
 2   Destination   4277 non-null   object 
 3   Age           4186 non-null   float64
 4   VIP           4277 non-null   int64  
 5   RoomService   4195 non-null   float64
 6   FoodCourt     4171 non-null   float64
 7   ShoppingMall  4179 non-null   float64
 8   Spa           4176 non-null   float64
 9   VRDeck        4197 non-null   float64
 10  Deck          4277 non-null   object 
 11  Side          4277 non-null   object 
dtypes: float64(6), int64(2), object(4)
memory usage: 401.1+ KB


In [69]:
X = train_data[list(train_data.columns)]
X.drop(columns='Transported', inplace=True)
y = train_data.Transported

In [70]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1)

##### One Hot Encoding HomePlanet

In [71]:
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)

Categorical variables:
['HomePlanet', 'Destination', 'Deck', 'Side']


In [72]:
train_data['Deck'].value_counts()

F    2993
G    2559
E     876
B     779
C     747
D     478
A     256
T       5
Name: Deck, dtype: int64

In [73]:
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

In [74]:
OH_cols_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6514,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
6515,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
6516,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
6517,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [75]:
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

In [76]:
OH_cols_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
1554,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
5941,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4194,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
7837,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1232,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
7813,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
905,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5192,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [77]:
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

In [78]:
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

In [79]:
OH_X_valid

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,0,1,...,6,7,8,9,10,11,12,13,14,15
1454,0,32.0,0,54.0,3782.0,0.0,21.0,5.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
218,1,27.0,0,0.0,,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
7866,0,24.0,0,86.0,669.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
7622,1,38.0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4108,0,32.0,0,192.0,0.0,441.0,18.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8442,0,41.0,0,3.0,1320.0,0.0,0.0,12392.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7120,1,19.0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8517,0,30.0,0,11.0,495.0,0.0,2049.0,2064.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6147,0,24.0,0,0.0,0.0,53.0,652.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [80]:
#train_data.drop(['PassengerId'], axis=1, inplace = True)
#test_data.drop(['PassengerId'], axis=1, inplace = True)

In [81]:
OH_X_valid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2174 entries, 1454 to 1256
Data columns (total 24 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   CryoSleep     2174 non-null   int64  
 1   Age           2134 non-null   float64
 2   VIP           2174 non-null   int64  
 3   RoomService   2124 non-null   float64
 4   FoodCourt     2127 non-null   float64
 5   ShoppingMall  2124 non-null   float64
 6   Spa           2133 non-null   float64
 7   VRDeck        2131 non-null   float64
 8   0             2174 non-null   float64
 9   1             2174 non-null   float64
 10  2             2174 non-null   float64
 11  3             2174 non-null   float64
 12  4             2174 non-null   float64
 13  5             2174 non-null   float64
 14  6             2174 non-null   float64
 15  7             2174 non-null   float64
 16  8             2174 non-null   float64
 17  9             2174 non-null   float64
 18  10            2174 non-nu

In [82]:
def score_dataset(OH_X_train, OH_X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=10, random_state=1)
    model.fit(OH_X_train, y_train)
    preds = model.predict(OH_X_valid)
    return accuracy_score(y_valid, preds)

In [83]:
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(OH_X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(OH_X_valid))

imputed_X_train.columns = OH_X_train.columns
imputed_X_valid.columns = OH_X_valid.columns

#print("MAE from Approach 2 (Imputation):")
#print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))



##### Testing Models

In [87]:
X_train, X_valid, y_train, y_valid = imputed_X_train, imputed_X_valid, y_train, y_valid

In [91]:
y_valid

1454     True
218     False
7866    False
7622     True
4108    False
        ...  
8442    False
7120     True
8517    False
6147     True
1256    False
Name: Transported, Length: 2174, dtype: object

In [85]:
X_train = X_train.astype('str')
X_valid = X_valid.astype('str')
y_train = y_train.astype('str')
y_valid = y_valid.astype('str')

In [93]:
forest_model = RandomForestClassifier(random_state=1)
forest_model.fit(X_train, X_valid)



ValueError: Found input variables with inconsistent numbers of samples: [6519, 2174]

In [96]:
cols = list(train_data.columns)
cols.remove('Transported')

In [97]:
X_test = test_data[cols]

In [98]:
X_test

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Side
0,Earth,1,TRAPPIST-1e,27.0,0,0.0,0.0,0.0,0.0,0.0,G,S
1,Earth,0,TRAPPIST-1e,19.0,0,0.0,9.0,0.0,2823.0,0.0,F,S
2,Europa,1,55 Cancri e,31.0,0,0.0,0.0,0.0,0.0,0.0,C,S
3,Europa,0,TRAPPIST-1e,38.0,0,0.0,6652.0,0.0,181.0,585.0,C,S
4,Earth,0,TRAPPIST-1e,20.0,0,10.0,0.0,635.0,0.0,0.0,F,S
...,...,...,...,...,...,...,...,...,...,...,...,...
4272,Earth,1,TRAPPIST-1e,34.0,0,0.0,0.0,0.0,0.0,0.0,G,S
4273,Earth,0,TRAPPIST-1e,42.0,0,0.0,847.0,17.0,10.0,144.0,F,S
4274,Mars,1,55 Cancri e,,0,0.0,0.0,0.0,0.0,0.0,D,P
4275,Europa,0,TRAPPIST-1e,,0,0.0,2680.0,0.0,0.0,523.0,D,P


In [None]:
predictions = forest_model.predict(X_test)
predictions

In [None]:
output = pd.DataFrame({'Transported': predictions})

output.to_csv('submission1.csv', index=False)