In [2]:
import pandas as pd

In [3]:
data = pd.read_csv("./train.csv")

In [4]:
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [5]:
data.HomePlanet.unique()

array(['Europa', 'Earth', 'Mars', nan], dtype=object)

In [6]:
data.Destination.unique()

array(['TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e', nan], dtype=object)

In [7]:
data = data.drop(["PassengerId","Cabin","Name"], axis=1)

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Destination   8511 non-null   object 
 3   Age           8514 non-null   float64
 4   VIP           8490 non-null   object 
 5   RoomService   8512 non-null   float64
 6   FoodCourt     8510 non-null   float64
 7   ShoppingMall  8485 non-null   float64
 8   Spa           8510 non-null   float64
 9   VRDeck        8505 non-null   float64
 10  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(4)
memory usage: 687.8+ KB


In [9]:
data["HomePlanet"].fillna(data["HomePlanet"].mode().values[0], inplace=True)
data["CryoSleep"].fillna(data["CryoSleep"].mode().values[0], inplace=True)
data["Destination"].fillna(data["Destination"].mode().values[0], inplace=True)
data["VIP"].fillna(data["VIP"].mode().values[0], inplace=True)

In [10]:
data["Age"].fillna(data["Age"].median(), inplace=True)
data["RoomService"].fillna(data["RoomService"].median(), inplace=True)
data["FoodCourt"].fillna(data["FoodCourt"].median(), inplace=True)
data["ShoppingMall"].fillna(data["ShoppingMall"].median(), inplace=True)
data["Spa"].fillna(data["Spa"].median(), inplace=True)
data["VRDeck"].fillna(data["VRDeck"].median(), inplace=True)

In [11]:
print("Missing values")
print(data.isna().sum())

Missing values
HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
dtype: int64


In [12]:
data.index.name = 'SNo'
data.to_csv("data.csv")

In [14]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
# from sklearn.preprocessing import LabelEncoder, StandardScaler
import pickle

class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
    def __len__(self):
        return len(self.features)
    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

In [15]:
import torch.nn as nn

class Model(nn.Module):
    def __init__(self, input_size, output_size):
        super(Model, self).__init__()
        self.layer1 = nn.Linear(input_size, 32)
        self.layer2 = nn.Linear(32, 64)
        self.layer3 = nn.Linear(64, 32)
        self.layer4 = nn.Linear(32, output_size)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        x = self.relu(x)
        x = self.layer3(x)
        x = self.relu(x)
        x = self.sigmoid(self.layer4(x))
        return x

In [16]:
model = Model(14, 1)

In [17]:
import torch

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [18]:
from sklearn.preprocessing import OneHotEncoder

In [19]:
data = pd.read_csv("data.csv")
y = data["Transported"].astype(int)
X = data.drop(["SNo","Transported"], axis=1)
encoder = OneHotEncoder(handle_unknown="ignore")
X["CryoSleep"] = X["CryoSleep"].astype(float)
X["VIP"] = X["VIP"].astype(float)
multicol_encoded = encoder.fit(X[["HomePlanet","Destination"]])
with open("utils/encoder_traindata.pickle", 'wb') as f:
    pickle.dump(multicol_encoded, f)
multicol_encoded = encoder.transform(X[["HomePlanet","Destination"]])
multicol_encoded = multicol_encoded.toarray()
multicol_encoded = pd.DataFrame(multicol_encoded, columns=encoder.get_feature_names_out())
X.drop(["HomePlanet","Destination"], axis=1, inplace=True)
X = pd.concat([X, multicol_encoded], axis=1)
X_tensor = torch.Tensor(X.values)
y_tensor = torch.Tensor(y.values)
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.20, random_state=21)
train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)


In [20]:
train_loader = DataLoader(train_dataset, batch_size= 512, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size= 512, shuffle=True)

In [21]:
for epoch in range(6):
    overall_loss = 0 
    for inputs, labels in train_loader:
        outputs = model(inputs)
        loss = criterion(outputs[:,0], labels)
        overall_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} completed loss {overall_loss}")
    correct_predictions = 0
    total_samples = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs_test = model(inputs)
            outputs_test = (outputs_test >= 0.3).float()
            correct_predictions += torch.sum(outputs_test[:,0]==labels)
            total_samples += labels.size(0)
    test_accuracy = correct_predictions/total_samples
    print(f"Test Accuracy : {test_accuracy*100:.2f}%")

Epoch 1 completed loss 74.58499646186829
Test Accuracy : 77.40%
Epoch 2 completed loss 38.35793387889862
Test Accuracy : 76.02%
Epoch 3 completed loss 100.10860443115234
Test Accuracy : 64.46%
Epoch 4 completed loss 80.17126035690308
Test Accuracy : 77.69%
Epoch 5 completed loss 41.76813292503357
Test Accuracy : 77.40%
Epoch 6 completed loss 40.82537758350372
Test Accuracy : 77.29%


In [22]:
import onnxruntime
import numpy as np

In [23]:
onnx_model_path = "models/spaceship.onnx"
dummy = torch.randn(14, requires_grad=True)
torch.onnx.export(model,
            dummy,
            onnx_model_path,
            export_params=True,
            opset_version=10,
            do_constant_folding=True,
            input_names = ['input'],
            output_names = ['output'],
            )
torch_out = model(dummy)
ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"])

def to_numpy(tensor):
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()

# compute ONNX Runtime output prediction
ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(dummy)}
ort_outs = ort_session.run(None, ort_inputs)

# compare ONNX Runtime and PyTorch results
np.testing.assert_allclose(to_numpy(torch_out), ort_outs[0], rtol=1e-03, atol=1e-05)

print("Exported model has been tested with ONNXRuntime, and the result looks good!")

Exported model has been tested with ONNXRuntime, and the result looks good!
