In [1]:
# FUNCTIONS
def check_missing(df, drop_list):
  """Report columns of ``df`` that contain missing values and flag sparse ones.

  For every column with at least one null value a line of the form
  ``<column> - <null count> - <dtype>`` is printed. Columns whose null count
  is greater than half the number of rows are appended to ``drop_list``.

  Args:
    df: pandas DataFrame to inspect.
    drop_list: list mutated in place; receives the names of columns that are
      more than 50% null. A name already in the list is not appended again,
      so one list can be shared across repeated calls.

  Returns:
    None. Results are delivered through printing and through ``drop_list``.
  """
  half_rows = 0.5 * len(df)
  for col in df.columns:
    # Count nulls once per column instead of once per use.
    n_missing = df[col].isnull().sum()
    if n_missing > 0:
      if n_missing > half_rows and col not in drop_list:
        drop_list.append(col)
      print(f"{col} - {n_missing} - {df[col].dtype}")


def _rate_fit(train, test):
  """Map one model's train/test accuracy (fractions) to a coarse text label."""
  # NOTE(review): some combinations (e.g. train < 0.80 with test >= 0.80, or a
  # perfect 1.00 train score) match none of the rules and are labelled 'unknown'.
  if train <= 0.65 and test <= 0.65:
    return 'bad'
  if train > test * 1.10:
    return 'overfite'
  if 0.65 < train < 0.80 and 0.65 < test < 0.80:
    return 'middle'
  if 0.80 <= train < 1.00 and 0.80 <= test < 1.00:
    return 'good'
  if train >= 0.80 and test < 0.80:
    return 'high train, low test'
  return 'unknown'


def ML_models(models, models_names, data=None, target='Transported',
              test_size=0.2, random_state=42):
  """Fit each model on a train/test split and summarise its accuracy.

  Args:
    models: iterable of estimators exposing ``fit`` and ``predict``. Each one
      is fitted in place on the training part of the split, so the caller can
      reuse the fitted objects afterwards.
    models_names: display names, one per entry of ``models``.
    data: DataFrame holding the features and the target column. When omitted,
      the notebook-level ``df`` is used (the original behaviour).
    target: name of the target column in ``data``.
    test_size: fraction of rows held out for testing.
    random_state: seed passed to ``train_test_split``.

  Returns:
    pandas DataFrame with one row per model and the columns ``Model``,
    ``Train score``, ``Test score``, ``Ratio difference`` (train minus test
    accuracy, in percentage points) and ``Evaluate model``.

  Raises:
    ValueError: if ``models`` and ``models_names`` differ in length. Checked
      before any fitting so the mismatch is not discovered only after all
      models have been trained.
  """
  models = list(models)
  models_names = list(models_names)
  if len(models) != len(models_names):
    raise ValueError(
        f"models and models_names must have the same length "
        f"(got {len(models)} and {len(models_names)})"
    )

  if data is None:
    data = df  # fall back to the notebook-level training frame

  x = data.drop(target, axis=1)
  y = data[target]
  x_train, x_test, y_train, y_test = train_test_split(
      x, y, test_size=test_size, random_state=random_state)

  # Train models and record accuracy on both parts of the split.
  train_score = []
  test_score = []
  for model in models:
    model.fit(x_train, y_train)
    train_score.append(accuracy_score(y_train, model.predict(x_train)))
    test_score.append(accuracy_score(y_test, model.predict(x_test)))

  # Gap between training and testing accuracy, plus a coarse label per model.
  ratio = []
  rate = []
  for train, test in zip(train_score, test_score):
    ratio.append(f'{(train - test) * 100:.2f}%')
    rate.append(_rate_fit(train, test))

  # Create DataFrame
  model_score = pd.DataFrame({
      'Model': models_names,
      'Train score': [f'{round(score * 100, 2)}%' for score in train_score],
      'Test score': [f'{round(score * 100, 2)}%' for score in test_score],
      'Ratio difference': ratio,
      'Evaluate model': rate
  })
  return model_score

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import LinearRegression, SGDRegressor, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
sns.set()

In [3]:
# Read the training and test CSV files into the two frames used by every later cell.
train_csv = '/kaggle/input/spaceship-titanic/train.csv'
test_csv = '/kaggle/input/spaceship-titanic/test.csv'
df, df_test = pd.read_csv(train_csv), pd.read_csv(test_csv)
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [4]:
# `dropping` collects the columns check_missing flags as more than half empty.
dropping = []
check_missing(df=df, drop_list=dropping)
print(f"Features that can be droped: {dropping}")

HomePlanet - 201 - object
CryoSleep - 217 - object
Cabin - 199 - object
Destination - 182 - object
Age - 179 - float64
VIP - 203 - object
RoomService - 181 - float64
FoodCourt - 183 - float64
ShoppingMall - 208 - float64
Spa - 183 - float64
VRDeck - 188 - float64
Name - 200 - object
Features that can be droped: []


In [5]:
# Remove the columns that are not used as model features from both frames.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell safe
# to re-run: a second execution no longer raises KeyError for the missing columns.
UNUSED_COLUMNS = ['PassengerId', 'Name', 'Cabin']
df = df.drop(columns=UNUSED_COLUMNS, errors='ignore')
df_test = df_test.drop(columns=UNUSED_COLUMNS, errors='ignore')
df_test

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0
1,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0
2,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0
3,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0
4,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
4272,Earth,True,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0
4273,Earth,False,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0
4274,Mars,True,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0
4275,Europa,False,,,False,0.0,2680.0,0.0,0.0,523.0


In [6]:
# Print each training column's name followed by its distinct values.
for name, column in df.items():
    print(name)
    print(pd.Series(column.unique()))

HomePlanet
0    Europa
1     Earth
2      Mars
3       NaN
dtype: object
CryoSleep
0    False
1     True
2      NaN
dtype: object
Destination
0      TRAPPIST-1e
1    PSO J318.5-22
2      55 Cancri e
3              NaN
dtype: object
Age
0     39.0
1     24.0
2     58.0
3     33.0
4     16.0
      ... 
76    73.0
77    66.0
78    69.0
79    72.0
80    77.0
Length: 81, dtype: float64
VIP
0    False
1     True
2      NaN
dtype: object
RoomService
0          0.0
1        109.0
2         43.0
3        303.0
4         42.0
         ...  
1269     459.0
1270    1003.0
1271    1569.0
1272    8586.0
1273     745.0
Length: 1274, dtype: float64
FoodCourt
0          0.0
1          9.0
2       3576.0
3       1283.0
4         70.0
         ...  
1503    1015.0
1504    1146.0
1505    3208.0
1506    6819.0
1507    4688.0
Length: 1508, dtype: float64
ShoppingMall
0          0.0
1         25.0
2        371.0
3        151.0
4          3.0
         ...  
1111     918.0
1112     205.0
1113    1085.0
1114   

In [7]:
# Fill missing values in object-dtype columns with each frame's own most
# frequent value. The filled column is assigned back to the frame: calling
# fillna(inplace=True) on df[col] is chained assignment, which pandas warns
# about and which leaves the frame unchanged under Copy-on-Write.
for frame in (df, df_test):
    for col in frame.columns:
        if frame[col].dtype == 'object':
            frame[col] = frame[col].fillna(frame[col].mode()[0])
check_missing(df_test, dropping)

Age - 91 - float64
RoomService - 82 - float64
FoodCourt - 106 - float64
ShoppingMall - 98 - float64
Spa - 101 - float64
VRDeck - 80 - float64


In [8]:
# Discard every training row that still contains a missing value, then
# re-run the missing-value report on the training frame.
df.dropna(axis=0, how='any', inplace=True)
check_missing(df=df, drop_list=dropping)

In [9]:
# Fill the remaining gaps in the test frame with each column's median.
# Assign the result back instead of using fillna(inplace=True) on df_test[col]:
# the in-place form is chained assignment and does not update the frame under
# pandas Copy-on-Write.
for col in df_test.columns:
    if df_test[col].isnull().any():
        df_test[col] = df_test[col].fillna(df_test[col].median())

In [10]:
# Encode object/bool columns as integers. For a column present in both frames a
# single encoder is fitted on the train and test values together, so a category
# gets the same integer code in both. Fitting each frame separately can assign
# different codes when the two frames do not hold the same set of categories.
for col in df.columns:
    if df[col].dtype == 'object' or df[col].dtype == 'bool':
        le = LabelEncoder()
        if col in df_test.columns:
            le.fit(pd.concat([df[col], df_test[col]]))
            df_test[col] = le.transform(df_test[col])
        else:
            # Column exists only in the training frame (e.g. the target).
            le.fit(df[col])
        df[col] = le.transform(df[col])

# Any test column that is still object/bool was not handled above; encode it
# on its own, as before.
for col in df_test.columns:
    if df_test[col].dtype == 'object' or df_test[col].dtype == 'bool':
        df_test[col] = LabelEncoder().fit_transform(df_test[col])

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


# Candidate classifiers with library defaults. Estimators that randomise their
# fitting get a fixed seed so the score table is reproducible across runs; the
# value matches the seed used for the train/test split.
SEED = 42

models = [
    LogisticRegression(),
    SVC(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=SEED),
    RandomForestClassifier(random_state=SEED),
    BaggingClassifier(random_state=SEED),
    ExtraTreesClassifier(random_state=SEED),
    AdaBoostClassifier(random_state=SEED),
    XGBClassifier(random_state=SEED),
    LGBMClassifier(random_state=SEED),
]

# Derive the display names from the estimators themselves so the two lists
# cannot drift apart; each name is the estimator's class name.
models_names = [type(model).__name__ for model in models]

In [12]:
# Fit every candidate model and show the resulting train/test accuracy table.
model_scores = ML_models(models=models, models_names=models_names)
model_scores

[LightGBM] [Info] Number of positive: 3065, number of negative: 3031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004202 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1364
[LightGBM] [Info] Number of data points in the train set: 6096, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502789 -> initscore=0.011155
[LightGBM] [Info] Start training from score 0.011155


Unnamed: 0,Model,Train score,Test score,Ratio difference,Evaluate model
0,LogisticRegression,78.3%,79.86%,-1.56%,middle
1,SVC,78.89%,81.89%,-3.00%,unknown
2,KNeighborsClassifier,82.09%,80.18%,1.90%,good
3,DecisionTreeClassifier,93.01%,74.61%,18.41%,overfite
4,RandomForestClassifier,93.01%,80.45%,12.57%,overfite
5,BaggingClassifier,91.63%,78.74%,12.89%,overfite
6,ExtraTreesClassifier,93.01%,78.48%,14.53%,overfite
7,AdaBoostClassifier,78.74%,80.45%,-1.71%,unknown
8,XGBClassifier,88.73%,81.04%,7.69%,good
9,LGBMClassifier,85.58%,81.63%,3.95%,good


In [13]:
# Predict with the LightGBM model that ML_models fitted in place on the training
# part of the split. Looking it up by name rather than by position keeps this
# cell correct if the `models` list is reordered or extended.
final_model = models[models_names.index("LGBMClassifier")]
y_pred = final_model.predict(df_test)
pred = y_pred == 1  # convert the 0/1 predictions to booleans

submission = pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')
submission['Transported'] = pred
submission.to_csv('submission.csv', index=False)