<a href="https://www.kaggle.com/code/ammar1almutairi/2022-11-30-assessment?scriptVersionId=114949376" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pathlib

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import compose, impute, linear_model, model_selection, pipeline, preprocessing 
import torch
from torch import nn, optim, utils
import torchmetrics
import torch.nn.functional as F

In [None]:
# Filesystem locations used by this notebook: INPUT_DIR holds the competition
# CSV files that are read, WORKING_DIR receives the generated submission file.
_KAGGLE_ROOT = pathlib.Path("/kaggle")
INPUT_DIR = _KAGGLE_ROOT / "input" / "kaust-academy-ai-week-november-2022"
WORKING_DIR = _KAGGLE_ROOT / "working"

# 1. Load the training data

In [None]:
# Read the labelled competition data, then hold out a stratified 10% of the
# rows as a validation set so both splits keep the same "Transported" ratio.
_seed = 42
_train_df = pd.read_csv(INPUT_DIR / "train.csv")

# A seeded generator keeps the split reproducible between runs.
_split_rng = np.random.RandomState(_seed)
train_df, val_df = model_selection.train_test_split(
    _train_df,
    test_size=0.1,
    stratify=_train_df["Transported"],
    random_state=_split_rng,
)

# 2. Separate the features from the target for both the training and validation sets

In [None]:
# Split each frame into the label column ("Transported") and the model inputs
# (every other column).
_TARGET_COLUMN = "Transported"

train_target = train_df[_TARGET_COLUMN]
train_features = train_df.drop(columns=_TARGET_COLUMN)

val_target = val_df[_TARGET_COLUMN]
val_features = val_df.drop(columns=_TARGET_COLUMN)

# 3. Data preprocessing

In [None]:
# Named helpers are used instead of lambdas so that the fitted pipelines can be
# pickled (lambdas cannot), and so each conversion step is self-describing.
def _as_float32(arr):
    """Cast a NumPy array to float32, torch's default floating-point dtype."""
    return arr.astype(np.float32)


def _as_tensor(arr):
    """Wrap a NumPy array in a torch tensor."""
    return torch.from_numpy(arr)


def _to_numpy(frame):
    """Convert a pandas object to its underlying NumPy array."""
    return frame.to_numpy()


# Boolean flags: only fill missing values with the most common value.
boolean_preprocessing = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="most_frequent"),
)

# Categorical columns: fill missing values, then one-hot encode.
# handle_unknown="ignore" encodes a category that was not seen during fit as an
# all-zero row instead of raising when validation/test data is transformed.
categorical_preprocessing = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="most_frequent"),
    preprocessing.OneHotEncoder(handle_unknown="ignore"),
)

# Numeric columns: fill missing values with the column mean.
numeric_preprocessing = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="mean"),
)

# Final conversion shared by features and target: float32 array -> tensor.
to_torch_tensor = pipeline.make_pipeline(
    preprocessing.FunctionTransformer(_as_float32),
    preprocessing.FunctionTransformer(_as_tensor),
)

# Route each group of columns to its pipeline; columns that are not listed
# (and are not float64) are dropped.
feature_column_transformer = compose.make_column_transformer(
    (boolean_preprocessing, ["CryoSleep", "VIP"]),
    (categorical_preprocessing, ["HomePlanet", "Destination"]),
    (numeric_preprocessing, compose.make_column_selector(dtype_include=np.float64)),
    remainder="drop",
)

feature_preprocessing = pipeline.make_pipeline(
    feature_column_transformer,
    to_torch_tensor,
)

target_preprocessing = pipeline.make_pipeline(
    preprocessing.FunctionTransformer(_to_numpy),
    to_torch_tensor,
)

# 4. Create your datasets and dataloaders

In [None]:
BATCH_SIZE = 32
NUM_WORKERS = 4

# Fit the preprocessing on the training split only, so that imputation values
# and one-hot categories are learned without looking at validation data.
train_features_tensor = feature_preprocessing.fit_transform(train_features)
train_target_tensor = target_preprocessing.fit_transform(train_target)

train_dataset = utils.data.TensorDataset(train_features_tensor, train_target_tensor)
# shuffle=True reshuffles the rows at every epoch; without it SGD sees the
# exact same mini-batches in the exact same order each epoch.
train_dataloader = utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
)

# The validation split is only transformed with the already-fitted pipelines.
val_features_tensor = feature_preprocessing.transform(val_features)
val_target_tensor = target_preprocessing.transform(val_target)

val_dataset = utils.data.TensorDataset(val_features_tensor, val_target_tensor)
# Evaluation order does not affect the averaged loss, so no shuffling here.
val_dataloader = utils.data.DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
)

# 5. Define a multi-layer perceptron classifier

In [None]:
# Width of the preprocessed feature matrix; it sizes the input layer so the
# model keeps working if the preprocessing produces a different column count.
_, in_features = train_features_tensor.shape
# NOTE(review): this list is not consumed by the model below and does not
# match its layer sizes; kept only so the name stays defined.
hidden_features = [14,256,128,64,32,1]

# Multi-layer perceptron ending in a sigmoid, so the output is a probability
# in [0, 1] as required by nn.BCELoss.
# NOTE(review): there is no activation between the first two Linear layers --
# confirm whether that is intentional.
model_fn = nn.Sequential(
    nn.Linear(in_features, 128),
    nn.Dropout(0.1),
    nn.Linear(128, 64),
    nn.LeakyReLU(0.1),
    nn.Linear(64, 32),
    nn.LeakyReLU(0.1),
    nn.Linear(32, 16),
    nn.Dropout(0.1),
    nn.LeakyReLU(0.1),
    nn.Linear(16, 10),
    nn.LeakyReLU(0.1),
    nn.Linear(10, 8),
    nn.LeakyReLU(0.1),
    nn.Linear(8, 4),
    nn.LeakyReLU(0.1),
    nn.Linear(4, 1),
    nn.Sigmoid(),
)

print(model_fn)

# Binary cross-entropy on probabilities (pairs with the final Sigmoid above).
loss_fn = nn.BCELoss()

# Plain SGD with classical (non-Nesterov) momentum.
_optimizer_kwargs = {
    "momentum": 0.9,
    "nesterov": False,
}
optimizer = optim.SGD(model_fn.parameters(), lr=0.001, **_optimizer_kwargs)

# 6. Train your classifier

In [None]:
epochs = 141
log_epochs = 20  # print the losses every `log_epochs` epochs

for epoch in range(epochs):

    # Switch back to training mode at the start of EVERY epoch: the validation
    # pass below puts the model in eval mode, which would otherwise leave the
    # Dropout layers disabled for all epochs after the first one.
    model_fn.train()

    train_losses = []
    for features, targets in train_dataloader:

        # forward pass; squeeze (batch, 1) -> (batch,) to match the targets
        predictions = model_fn(features).squeeze(1)
        train_loss = loss_fn(predictions, targets)
        # Store a detached copy: only the value is needed for logging, not the
        # autograd history.
        train_losses.append(train_loss.detach())

        # backward pass
        train_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Mean of the per-batch training losses for this epoch.
    train_loss = torch.stack(train_losses).mean()

    # Validation pass: eval mode disables Dropout, no_grad skips autograd.
    model_fn.eval()
    with torch.no_grad():

        val_losses = []
        for features, targets in val_dataloader:
            predictions = model_fn(features).squeeze(1)
            val_loss = loss_fn(predictions, targets)
            val_losses.append(val_loss)

        # Mean of the per-batch validation losses for this epoch.
        val_loss = torch.stack(val_losses).mean()

    if epoch % log_epochs == 0:
        print(f'Epoch {epoch}, Training Loss {train_loss.item():.4f}, Validation Loss {val_loss.item():.4f}')
        

# 7. Load the testing features

In [None]:
# Read the unlabelled competition rows and run them through the feature
# pipeline in transform-only mode (nothing is refitted here).
_test_csv_path = f"{INPUT_DIR}/test.csv"
test_features = pd.read_csv(_test_csv_path)
test_data = feature_preprocessing.transform(test_features)

# 8. Make predictions using the test features

In [None]:
# Convert the raw test frame into the tensor the model consumes, using the
# feature pipeline in transform-only mode (no refit on test data).
features_tensor = feature_preprocessing.transform(test_features)

In [None]:
# Put the model in eval mode explicitly so the Dropout layers are disabled at
# inference, regardless of the mode a previous cell left the model in.
model_fn.eval()
# No gradients are needed for inference.
with torch.no_grad():
    probas = model_fn(features_tensor)
# The model emits one probability per row (column 0); threshold it at 0.5 to
# obtain the boolean class prediction.
predictions = probas[:, 0] > 0.5

# 9. Build the submission index from the PassengerId column of the test features

In [None]:
# Re-index the test frame by passenger id; the submission is written with this
# same index. Both names end up bound to the re-indexed frame.
sample_submission_df = test_features = test_features.set_index("PassengerId")

# 10. Create the submission file

In [None]:
# Write one "Transported" prediction per passenger id to the working directory.
# The result is bound to `_` so the notebook cell displays no output.
_submission_path = WORKING_DIR / "submission.csv"
_ = pd.DataFrame(
    data={"Transported": predictions},
    index=sample_submission_df.index,
).to_csv(_submission_path)