# Getting Started

In [1]:
import pandas as pd
import numpy as np

In [2]:
dataset_train = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")

In [3]:
dataset_train.shape

(8693, 14)

In [4]:
dataset_train.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [5]:
dataset_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [6]:
dataset_test = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")

In [7]:
dataset_test.shape

(4277, 13)

In [8]:
dataset_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB


In [9]:
dataset_test.isnull().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

# Merging Data

In [10]:
y = dataset_train['Transported']
dataset_train = dataset_train.drop(["Transported"],axis = 1)
y

0       False
1        True
2       False
3       False
4        True
        ...  
8688    False
8689    False
8690     True
8691    False
8692     True
Name: Transported, Length: 8693, dtype: bool

In [11]:
final_dataset = pd.concat([dataset_train,dataset_test])

# Preparing Data

In [12]:
final_dataset.shape

(12970, 13)

In [13]:
final_dataset.isnull().sum()

PassengerId       0
HomePlanet      288
CryoSleep       310
Cabin           299
Destination     274
Age             270
VIP             296
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
Name            294
dtype: int64

In [14]:
final_dataset['HomePlanet'].unique()

array(['Europa', 'Earth', 'Mars', nan], dtype=object)

In [15]:
final_dataset[['VIP','RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']] = final_dataset[['VIP','RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].fillna(0)

In [16]:
final_dataset.isnull().sum()

PassengerId       0
HomePlanet      288
CryoSleep       310
Cabin           299
Destination     274
Age             270
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            294
dtype: int64

In [17]:
final_dataset.drop(['PassengerId','Name'],axis = 1,inplace= True)

In [18]:
final_dataset.isnull().sum()

HomePlanet      288
CryoSleep       310
Cabin           299
Destination     274
Age             270
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
dtype: int64

In [19]:
final_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12970 entries, 0 to 4276
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    12682 non-null  object 
 1   CryoSleep     12660 non-null  object 
 2   Cabin         12671 non-null  object 
 3   Destination   12696 non-null  object 
 4   Age           12700 non-null  float64
 5   VIP           12970 non-null  object 
 6   RoomService   12970 non-null  float64
 7   FoodCourt     12970 non-null  float64
 8   ShoppingMall  12970 non-null  float64
 9   Spa           12970 non-null  float64
 10  VRDeck        12970 non-null  float64
dtypes: float64(6), object(5)
memory usage: 1.2+ MB


In [20]:
final_dataset['HomePlanet']=final_dataset['HomePlanet'].fillna(final_dataset['HomePlanet'].mode()[0])
final_dataset['CryoSleep']=final_dataset['CryoSleep'].fillna(final_dataset['CryoSleep'].mode()[0])
final_dataset['Cabin']=final_dataset['Cabin'].fillna(final_dataset['Cabin'].mode()[0])
final_dataset['Destination']=final_dataset['Destination'].fillna(final_dataset['Destination'].mode()[0])
final_dataset['Age']=final_dataset['Age'].fillna(final_dataset['Age'].mean())

In [21]:
final_dataset.isnull().sum()

HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64

In [22]:
final_dataset[['Deck','Cabin_num','Side']]=final_dataset.Cabin.str.split('/',expand=True)
final_dataset.drop(['Cabin'],axis = 1,inplace=True)

In [23]:
final_dataset.Cabin_num = final_dataset.Cabin_num.astype(float)

In [24]:
final_dataset = pd.get_dummies(final_dataset)

# Again Splitting Data

In [25]:
X = final_dataset.iloc[:8693,:]
Test = final_dataset.iloc[8693:,:]

In [26]:
X

Unnamed: 0,CryoSleep,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Cabin_num,HomePlanet_Earth,HomePlanet_Europa,...,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_P,Side_S
0,False,39.0,0.0,0.0,0.0,0.0,0.0,0.0,False,True,...,False,True,False,False,False,False,False,False,True,False
1,False,24.0,109.0,9.0,25.0,549.0,44.0,0.0,True,False,...,False,False,False,False,False,True,False,False,False,True
2,False,58.0,43.0,3576.0,0.0,6715.0,49.0,0.0,False,True,...,True,False,False,False,False,False,False,False,False,True
3,False,33.0,0.0,1283.0,371.0,3329.0,193.0,0.0,False,True,...,True,False,False,False,False,False,False,False,False,True
4,False,16.0,303.0,70.0,151.0,565.0,2.0,1.0,True,False,...,False,False,False,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,False,41.0,0.0,6819.0,0.0,1643.0,74.0,98.0,False,True,...,True,False,False,False,False,False,False,False,True,False
8689,True,18.0,0.0,0.0,0.0,0.0,0.0,1499.0,True,False,...,False,False,False,False,False,False,True,False,False,True
8690,False,26.0,0.0,0.0,1872.0,1.0,0.0,1500.0,True,False,...,False,False,False,False,False,False,True,False,False,True
8691,False,32.0,0.0,1049.0,0.0,353.0,3235.0,608.0,False,True,...,False,False,False,False,True,False,False,False,False,True


In [27]:
Test

Unnamed: 0,CryoSleep,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Cabin_num,HomePlanet_Earth,HomePlanet_Europa,...,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_P,Side_S
0,True,27.000000,0.0,0.0,0.0,0.0,0.0,3.0,True,False,...,False,False,False,False,False,False,True,False,False,True
1,False,19.000000,0.0,9.0,0.0,2823.0,0.0,4.0,True,False,...,False,False,False,False,False,True,False,False,False,True
2,True,31.000000,0.0,0.0,0.0,0.0,0.0,0.0,False,True,...,False,False,True,False,False,False,False,False,False,True
3,False,38.000000,0.0,6652.0,0.0,181.0,585.0,1.0,False,True,...,False,False,True,False,False,False,False,False,False,True
4,False,20.000000,10.0,0.0,635.0,0.0,0.0,5.0,True,False,...,False,False,False,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,True,34.000000,0.0,0.0,0.0,0.0,0.0,1496.0,True,False,...,False,False,False,False,False,False,True,False,False,True
4273,False,42.000000,0.0,847.0,17.0,10.0,144.0,160.0,True,False,...,False,False,False,False,False,False,True,False,True,False
4274,True,28.771969,0.0,0.0,0.0,0.0,0.0,296.0,False,False,...,False,False,False,True,False,False,False,False,True,False
4275,False,28.771969,0.0,2680.0,0.0,0.0,523.0,297.0,False,True,...,False,False,False,True,False,False,False,False,True,False


# Creating Model and Prediction

In [28]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

model.fit(X,y)

In [29]:
pred = model.predict(X)

In [30]:
from sklearn.metrics import accuracy_score
accuracy_score(y,pred)

0.9993097894857932

In [31]:
pred_test = model.predict(Test)
print(pred_test)

[ True False  True ...  True  True False]


# Creating Output

In [32]:
submission = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")
submission.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name'],
      dtype='object')

In [33]:
submission.drop(['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age','VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','Name'],axis = 1,inplace = True)

In [34]:
submission['Transported'] = pred_test

In [35]:
submission

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [36]:
submission.to_csv('submission.csv', index=False)

In [37]:
submission

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True
