In [40]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

# Read both CSV files from the working directory into their raw DataFrames.
train_data_raw, test_data_raw = map(pd.read_csv, ("train.csv", "test.csv"))

In [27]:
# Preview the raw training frame; the rendered table is truncated to the first and last rows.
train_data_raw

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [28]:
# Summarize dtype and non-null count per training column; a count below the
# total number of entries means that column has missing values.
train_data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [29]:
# Same summary for the test set, which has no Transported column.
test_data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB


In [42]:
# Work on an independent copy: plain assignment would only alias the frame, so
# the Group/GroupNumber columns added later would also appear in test_data_raw.
test_data = test_data_raw.copy()

### Handling missing values and splitting compound columns

#### Splitting up PassengerId

In [44]:
# Work on an independent copy: plain assignment would only alias the frame, so
# columns added to train_data later would also appear in train_data_raw.
train_data = train_data_raw.copy()

# PassengerId values look like "0001_01"; split each one on "_" into a list of
# two strings. td holds the training pairs, td2 the test pairs.
td = train_data.PassengerId.str.split('_').tolist()
td2 = test_data.PassengerId.str.split('_').tolist()

# Only the last expression of a cell is displayed, so show the test-set pairs.
td2

[['0013', '01'],
 ['0018', '01'],
 ['0019', '01'],
 ['0021', '01'],
 ['0023', '01'],
 ['0027', '01'],
 ['0029', '01'],
 ['0032', '01'],
 ['0032', '02'],
 ['0033', '01'],
 ['0037', '01'],
 ['0040', '01'],
 ['0040', '02'],
 ['0042', '01'],
 ['0046', '01'],
 ['0046', '02'],
 ['0046', '03'],
 ['0047', '01'],
 ['0047', '02'],
 ['0047', '03'],
 ['0048', '01'],
 ['0049', '01'],
 ['0054', '01'],
 ['0054', '02'],
 ['0054', '03'],
 ['0055', '01'],
 ['0057', '01'],
 ['0059', '01'],
 ['0060', '01'],
 ['0063', '01'],
 ['0065', '01'],
 ['0075', '01'],
 ['0079', '01'],
 ['0080', '01'],
 ['0083', '01'],
 ['0087', '01'],
 ['0089', '01'],
 ['0093', '01'],
 ['0094', '01'],
 ['0094', '02'],
 ['0095', '01'],
 ['0096', '01'],
 ['0100', '01'],
 ['0100', '02'],
 ['0104', '01'],
 ['0106', '01'],
 ['0109', '01'],
 ['0117', '01'],
 ['0118', '01'],
 ['0121', '01'],
 ['0124', '01'],
 ['0125', '01'],
 ['0125', '02'],
 ['0130', '01'],
 ['0131', '01'],
 ['0132', '01'],
 ['0135', '01'],
 ['0137', '01'],
 ['0142', '01'

In [45]:
# Store the two PassengerId parts (first -> Group, second -> GroupNumber) as
# new string columns, on the training frame first and then the test frame.
for frame, id_parts in ((train_data, td), (test_data, td2)):
    frame['Group'] = [parts[0] for parts in id_parts]
    frame['GroupNumber'] = [parts[1] for parts in id_parts]

train_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group,GroupNumber
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0001,01
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,0002,01
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,0003,01
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,0003,02
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,0004,01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,9276,01
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,9278,01
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,9279,01
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,9280,01


#### Making the train/validation split

In [68]:
# Features: every column except the target. drop() returns a new frame, so
# train_data is left untouched and no in-place edit of a derived frame (which
# can trigger SettingWithCopyWarning) is needed.
X = train_data.drop(columns='Transported')

# Target: the boolean Transported column.
y = train_data.Transported

In [83]:
# Split features and target into training and validation parts. random_state
# pins the shuffle so the same rows land in each part on every run; with no
# test_size given, scikit-learn's documented default holds out 25% of the rows.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=1,
)

#### Starting to replace nulls

In [74]:
# Count the missing values in each training column.
train_data.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
Group             0
GroupNumber       0
dtype: int64

In [75]:
# Count the missing values in each test column.
test_data.isna().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
Group             0
GroupNumber       0
dtype: int64

In [78]:
# List the distinct HomePlanet values (NaN included) to see which categories exist.
train_data.loc[:, 'HomePlanet'].unique()

array(['Europa', 'Earth', 'Mars', nan], dtype=object)

In [98]:
# Most frequent HomePlanet value(s); mode() returns a Series because ties are possible.
train_data.loc[:, 'HomePlanet'].mode()

0    Earth
Name: HomePlanet, dtype: object