In [28]:
import logging
import os
import pickle
import argparse
import optuna
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


In [29]:
passenger_data = pd.read_csv(
   "../data/train.csv", 
   true_values = ["True"], 
   false_values = ["False"]
)

In [30]:
passenger_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [31]:
passenger_data.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

In [32]:
passenger_data.HomePlanet.value_counts(dropna = False)
passenger_data.CryoSleep.value_counts(dropna = False)
passenger_data.VIP.value_counts(dropna = False)
destinations = passenger_data.Destination.value_counts().sort_values(ascending = True)

In [33]:
num_passengers = passenger_data.PassengerId \
    .str.split("_", expand = True)[0] \
    .value_counts().value_counts()

In [34]:
passenger_attributes = passenger_data.drop(columns = ["Transported"]) # X
passenger_labels = passenger_data["Transported"] # y

In [35]:
passenger_attributes.drop(columns = ["Name", "Cabin", "PassengerId"], inplace = True)
passenger_attributes = pd.get_dummies(passenger_attributes, drop_first = True)
passenger_attributes = passenger_attributes.dropna()
passenger_labels = passenger_labels[passenger_attributes.index]

In [36]:
len(passenger_attributes), len(passenger_labels)

(7620, 7620)

In [37]:
passenger_attributes

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_True
0,39.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,True,False
1,24.0,109.0,9.0,25.0,549.0,44.0,False,False,False,False,True,False
2,58.0,43.0,3576.0,0.0,6715.0,49.0,True,False,False,False,True,True
3,33.0,0.0,1283.0,371.0,3329.0,193.0,True,False,False,False,True,False
4,16.0,303.0,70.0,151.0,565.0,2.0,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8688,41.0,0.0,6819.0,0.0,1643.0,74.0,True,False,False,False,False,True
8689,18.0,0.0,0.0,0.0,0.0,0.0,False,False,True,True,False,False
8690,26.0,0.0,0.0,1872.0,1.0,0.0,False,False,False,False,True,False
8691,32.0,0.0,1049.0,0.0,353.0,3235.0,True,False,False,False,False,False


In [38]:
from sklearn.preprocessing import MinMaxScaler


In [39]:
model = LogisticRegression()
model.fit(passenger_attributes, passenger_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [40]:
model.score(passenger_attributes, passenger_labels)

0.7896325459317586

In [41]:
test_passenger_data = pd.read_csv("../data/test.csv", true_values = ["True"], false_values = ["False"])

In [42]:
test_passenger_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


In [43]:
test_passenger_data = test_passenger_data.set_index("PassengerId")
test_passenger_data.drop(columns = ["Name", "Cabin"], inplace=True)
test_passenger_data = pd.get_dummies(test_passenger_data, drop_first = True)

In [44]:
test_passenger_data

Unnamed: 0_level_0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_True
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0013_01,27.0,0.0,0.0,0.0,0.0,0.0,False,False,True,False,True,False
0018_01,19.0,0.0,9.0,0.0,2823.0,0.0,False,False,False,False,True,False
0019_01,31.0,0.0,0.0,0.0,0.0,0.0,True,False,True,False,False,False
0021_01,38.0,0.0,6652.0,0.0,181.0,585.0,True,False,False,False,True,False
0023_01,20.0,10.0,0.0,635.0,0.0,0.0,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
9266_02,34.0,0.0,0.0,0.0,0.0,0.0,False,False,True,False,True,False
9269_01,42.0,0.0,847.0,17.0,10.0,144.0,False,False,False,False,True,False
9271_01,,0.0,0.0,0.0,0.0,0.0,False,True,True,False,False,False
9273_01,,0.0,2680.0,0.0,0.0,523.0,True,False,False,False,False,False


In [45]:
test_passenger_data = test_passenger_data.fillna(0)

In [46]:
test_passenger_data

Unnamed: 0_level_0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_True
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0013_01,27.0,0.0,0.0,0.0,0.0,0.0,False,False,True,False,True,False
0018_01,19.0,0.0,9.0,0.0,2823.0,0.0,False,False,False,False,True,False
0019_01,31.0,0.0,0.0,0.0,0.0,0.0,True,False,True,False,False,False
0021_01,38.0,0.0,6652.0,0.0,181.0,585.0,True,False,False,False,True,False
0023_01,20.0,10.0,0.0,635.0,0.0,0.0,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
9266_02,34.0,0.0,0.0,0.0,0.0,0.0,False,False,True,False,True,False
9269_01,42.0,0.0,847.0,17.0,10.0,144.0,False,False,False,False,True,False
9271_01,0.0,0.0,0.0,0.0,0.0,0.0,False,True,True,False,False,False
9273_01,0.0,0.0,2680.0,0.0,0.0,523.0,True,False,False,False,False,False


In [47]:
predictions = model.predict(test_passenger_data)

In [48]:
test_passenger_data["Transported"] = predictions

In [49]:
submission = test_passenger_data[["Transported"]]
submission.to_csv("submission.csv")