# EDA

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
df = df.drop(['PassengerId', 'Name', 'Cabin'], axis=1)
df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Destination   8511 non-null   object 
 3   Age           8514 non-null   float64
 4   VIP           8490 non-null   object 
 5   RoomService   8512 non-null   float64
 6   FoodCourt     8510 non-null   float64
 7   ShoppingMall  8485 non-null   float64
 8   Spa           8510 non-null   float64
 9   VRDeck        8505 non-null   float64
 10  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(4)
memory usage: 687.8+ KB


In [5]:
df.isna().sum()

HomePlanet      201
CryoSleep       217
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
dtype: int64

# Preprocessing

In [6]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer

In [7]:
encoder = LabelEncoder()
df['HomePlanet'] = encoder.fit_transform(df['HomePlanet'])
df['CryoSleep'] = encoder.fit_transform(df['CryoSleep']) 
df['Destination'] = encoder.fit_transform(df['Destination']) 
df['VIP'] = encoder.fit_transform(df['VIP']) 
df['Transported'] = encoder.fit_transform(df['Transported']) 

In [8]:
scalar = MinMaxScaler()
df[['Age',
    'FoodCourt',
    'RoomService',
    'ShoppingMall',
    'Spa',
    'VRDeck']] = scalar.fit_transform(df[['Age',
                                          'FoodCourt',
                                          'RoomService',
                                          'ShoppingMall',
                                          'Spa',
                                          'VRDeck']])

In [9]:
columns = df.columns
imputer = KNNImputer(n_neighbors=5)
df = imputer.fit_transform(df)
df = pd.DataFrame(df, columns= columns)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8693 non-null   float64
 1   CryoSleep     8693 non-null   float64
 2   Destination   8693 non-null   float64
 3   Age           8693 non-null   float64
 4   VIP           8693 non-null   float64
 5   RoomService   8693 non-null   float64
 6   FoodCourt     8693 non-null   float64
 7   ShoppingMall  8693 non-null   float64
 8   Spa           8693 non-null   float64
 9   VRDeck        8693 non-null   float64
 10  Transported   8693 non-null   float64
dtypes: float64(11)
memory usage: 747.2 KB


# Models

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from tpot import TPOTClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [12]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [13]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
acc_log = round(logreg.score(X_train, y_train) * 100, 3)
log = round(logreg.score(X_test, y_test) * 100, 3)
print("Train: ", acc_log)
print("Test: ", log)

Train:  75.812
Test:  75.963


In [14]:
SVC = SVC(kernel= 'linear', C= 1)
SVC.fit(X_train, y_train)
acc_svc = round(SVC.score(X_train, y_train) * 100, 3)
svc = round(SVC.score(X_test, y_test) * 100, 3)
print("Train: ", acc_svc)
print("Test: ", svc)

Train:  74.633
Test:  74.468


In [15]:
RandomForestClassifier = RandomForestClassifier(n_estimators=100)
RandomForestClassifier.fit(X_train, y_train)
acc_random_forest = round(RandomForestClassifier.score(X_train, y_train) * 100, 3) 
random_forest = round(RandomForestClassifier.score(X_test, y_test) * 100, 3) 
print("Train: ", acc_random_forest)
print("Test: ", random_forest)

Train:  94.061
Test:  77.976


In [16]:
DecisionTreeClassifier = DecisionTreeClassifier()
DecisionTreeClassifier.fit(X_train, y_train)
acc_decision_tree = round(DecisionTreeClassifier.score(X_train, y_train) * 100, 3)
decision_tree = round(DecisionTreeClassifier.score(X_test, y_test) * 100, 3)
print("Train: ",acc_decision_tree)
print("Test: ", decision_tree)

Train:  94.075
Test:  73.145


In [17]:
KNeighborsClassifier = KNeighborsClassifier(n_neighbors = 3)
KNeighborsClassifier.fit(X_train, y_train)
acc_knn = round(KNeighborsClassifier.score(X_train, y_train) * 100, 3)
knn = round(KNeighborsClassifier.score(X_test, y_test) * 100, 3)
print("Train: ", acc_knn)
print("Test: ", knn)

Train:  83.966
Test:  74.296


In [18]:
#tpot = TPOTClassifier(verbosity=2, max_time_mins=10)
#tpot.fit(X_train, y_train)
#print("Train: ", tpot.score(X_train, y_train))
#print("Test: ", tpot.score(X_test, y_test))

In [19]:
GradientBoostingClassifier = GradientBoostingClassifier(
    learning_rate=0.01,
    max_depth=8,
    max_features=0.5,
    min_samples_leaf=17,
    min_samples_split=6,
    n_estimators=100,
    subsample=0.6,
    random_state=42
)
GradientBoostingClassifier.fit(X_train, y_train)
acc_GBC = round(GradientBoostingClassifier.score(X_train, y_train) * 100, 3)
GBC = round(GradientBoostingClassifier.score(X_test, y_test) * 100, 3)
print("Train: ", acc_GBC)
print("Test: ", GBC)

Train:  82.183
Test:  78.723


In [20]:
ExtraTreesClassifier = ExtraTreesClassifier(
    bootstrap=True,
    criterion='gini',
    max_features=0.65,
    min_samples_leaf=1,
    min_samples_split=6,
    n_estimators=100,
    random_state=42
)
ExtraTreesClassifier.fit(X_train, y_train)
acc_extra_tree = round(ExtraTreesClassifier.score(X_train, y_train) * 100, 3)
extra_tree = round(ExtraTreesClassifier.score(X_test, y_test) * 100, 3)
print("Train: ", acc_extra_tree)
print("Test: ", extra_tree)

Train:  89.819
Test:  78.723


In [21]:
test = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")

In [22]:
test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [23]:
test = test.drop(['PassengerId', 'Name', 'Cabin'], axis='columns')

In [24]:
test['HomePlanet'] = encoder.fit_transform(test['HomePlanet'])
test['CryoSleep'] = encoder.fit_transform(test['CryoSleep']) 
test['Destination'] = encoder.fit_transform(test['Destination']) 
test['VIP'] = encoder.fit_transform(test['VIP']) 

In [25]:
test[['Age',
      'FoodCourt',
      'RoomService',
      'ShoppingMall',
      'Spa',
      'VRDeck']] = scalar.fit_transform(test[['Age',
                                              'FoodCourt',
                                              'RoomService',
                                              'ShoppingMall',
                                              'Spa',
                                              'VRDeck']])

In [26]:
from sklearn.impute import KNNImputer
columns = test.columns
imputer = KNNImputer(n_neighbors=2)
test = imputer.fit_transform(test)
test = pd.DataFrame(test, columns= columns)

In [28]:
predictions = ExtraTreesClassifier.predict(test)
predictions = predictions.astype(bool)

In [None]:
test_df = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Transported': predictions})
output.to_csv('/kaggle/working/submission.csv', index=False)