In [None]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm

In [None]:
# Smoke test: make sure the tqdm progress bar renders in this environment
for step in tqdm.tqdm(np.arange(100)):
    time.sleep(0.01)

# 0: The Data

For basic classification we use Kaggle's Spaceship Titanic dataset. It's a simple dataset with a handful of features and a binary label.
Download from: https://www.kaggle.com/competitions/spaceship-titanic/data

### Explore the data briefly, see summary, histograms, ranges, correlations, etc.

The goal here is to get a feel for the data, and to see if there are any obvious issues with it.
Also we prepare the data for learning by doing some basic preprocessing like assigning numerical labels to categorical columns, removing NaNs, imputing values if needed, and normalizing ranges.

In [None]:
# Load the training CSV; the relative path is resolved against the kernel's working directory
train_df = pd.read_csv('../data/spaceship-titanic/train.csv')

In [None]:
# Preview the first rows to see the columns and the kind of values they hold
train_df.head()

From a common-sense point of view, `PassengerId` shouldn't affect whether a passenger was transported, unless it is a proxy for some feature that isn't in the dataset. Because that possibility can't be ruled out yet, we won't drop it for now.

Our 0 / 1 classification label here is Transported. Our objective is to predict whether a passenger was transported or not given the other features.

In [None]:
# Column dtypes and non-null counts: shows which columns are categorical and which have missing values
train_df.info()

In [None]:
# Summarize the object-dtype (string-like) columns: non-null count, number of
# distinct values, the most frequent value and how often it occurs
train_df.describe(include=['O'])

In [None]:
# Encode every column numerically so that correlations and distance-based
# models can consume the frame:
#   * bool and non-numeric (string/object) columns -> integer category codes;
#     missing values receive the code -1 (that is how `.cat.codes` marks NaN)
#   * numeric columns -> z-score standardized (zero mean, unit standard
#     deviation), then missing values are filled with 0.0, which is the
#     column mean after standardization (i.e. mean imputation)
corr_df = train_df.copy()
for col in corr_df.columns:
    column = corr_df[col]
    # The dtype has to be tested explicitly: an expression such as
    # `dtype == 'object' or 'bool'` is always truthy because the non-empty
    # string 'bool' is truthy, which would send every column down the
    # categorical branch. bool counts as numeric in pandas, hence the
    # separate bool check.
    is_categorical = (
        pd.api.types.is_bool_dtype(column)
        or not pd.api.types.is_numeric_dtype(column)
    )
    if is_categorical:
        corr_df[col] = column.astype('category').cat.codes
    else:
        # mean()/std() skip NaN; a constant column (std == 0) yields NaN here
        # and is therefore mapped to 0.0 by the fill below as well.
        standardized = (column - column.mean()) / column.std()
        corr_df[col] = standardized.fillna(0.0)

print(corr_df.head())


In [None]:
# Summary statistics of the numerically encoded frame (sanity check of the encoding above)
corr_df.describe()

In [None]:
# Pairwise Pearson correlation between all numerically encoded columns
correlation_matrix = corr_df.corr(method='pearson')

# Draw the matrix as an annotated heatmap; the diverging palette makes the
# sign of each correlation visible at a glance
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix Heatmap')
plt.show()


CryoSleep seems to be decently correlated with Transported, so we should expect it to be a major feature in our model.
RoomService seems to be negatively correlated with Transported, so we should expect it to be a major feature in our model.

Anyway, we're not hand-engineering features here. The goal is to try out some basic, classic ML approaches for classification and see how well they work. Let's go.


In [None]:
# Random 80:20 train / validation split of the encoded frame
df = corr_df.copy()

# Seed the global NumPy RNG so the shuffle below is repeatable
np.random.seed(42)

# Shuffle the row positions; the shuffled order decides which rows land where
indices = np.arange(len(df))
np.random.shuffle(indices)

# Position at which the shuffled rows are cut: first 80% train, rest validation
split_index = int(0.8 * len(df))

# Take each part by position and give it a fresh 0..n-1 index
train_df_split = df.iloc[indices[:split_index]].reset_index(drop=True)
val_df = df.iloc[indices[split_index:]].reset_index(drop=True)

# Pull the target column out of each part; what remains are the features
train_labels = train_df_split.pop('Transported')
val_labels = val_df.pop('Transported')

# Report the sizes of the four resulting objects
print("Train set shape:", train_df_split.shape)
print("Validation set shape:", val_df.shape)
print("train_labels", train_labels.shape)
print("val_labels", val_labels.shape)

# Good Ol' Machine Learning.

# 1: K Nearest Neighbours

K Nearest Neighbours classifies a given point in an N-dimensional space by looking at the labels of its k nearest neighbours under some distance metric, usually the Euclidean metric, i.e. the L2 norm. This can be problematic with categorical features, because it is unclear what distance means there. For example, the planet category takes values such as Earth or TRAPPIST-1e, and placing these on a linear numerical axis makes little sense. But let's see how our approaches handle them.

In [None]:
# k nearest neighbours
class KNNClassifier:
    """Brute-force k-nearest-neighbours classifier.

    The classifier is lazy: construction only stores the training data, and
    all distance computations happen in `predict`.

    Parameters
    ----------
    X_train : pd.DataFrame or 2-D array-like
        Training features, one row per sample. Every column must be
        convertible to float.
    y_train : pd.Series or 1-D array-like
        Training labels, aligned with the rows of `X_train` by position.
    k : int, default 3
        Number of neighbours that vote on each prediction.
    distance_metric : {'euclidean', 'manhattan'}, default 'euclidean'
        Metric used to rank the neighbours.

    Raises
    ------
    ValueError
        If `k` is smaller than 1.
    """

    def __init__(self, X_train: pd.DataFrame, y_train: pd.Series, k=3, distance_metric='euclidean'):
        if k < 1:
            raise ValueError("k must be at least 1")
        self.k = k
        self.distance_metric = distance_metric
        self.X_train = X_train
        self.y_train = y_train

    def _calculate_distance(self, point1, point2):
        """Return the distance between `point1` and `point2`.

        The reduction runs over the last axis, so two 1-D points give a
        scalar, while a 2-D matrix of points against a single 1-D point
        (NumPy broadcasting) gives one distance per matrix row.

        Raises
        ------
        ValueError
            If `self.distance_metric` is not a supported metric.
        """
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((point1 - point2) ** 2, axis=-1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(point1 - point2), axis=-1)
        else:
            raise ValueError("Unsupported distance metric")

    def predict(self, X_test):
        """Predict a label for every row of `X_test`.

        Parameters
        ----------
        X_test : pd.DataFrame or array-like
            Samples to classify, one row per sample, with the same columns
            as the training features. A single 1-D sample is also accepted.

        Returns
        -------
        np.ndarray
            One predicted label per test row. Ties in the vote are resolved
            in favour of the smallest label.

        Raises
        ------
        ValueError
            If the training set is empty, if features and labels differ in
            length, or if the distance metric is unsupported.
        """
        # Convert to plain arrays first: iterating a DataFrame yields column
        # labels rather than rows, and indexing a Series with integer
        # positions is label-based, so neither is safe to use directly.
        train_features = np.asarray(self.X_train, dtype=float)
        train_labels = np.asarray(self.y_train).ravel()
        test_features = np.atleast_2d(np.asarray(X_test, dtype=float))

        if len(train_features) == 0:
            raise ValueError("Training set is empty")
        if len(train_features) != len(train_labels):
            raise ValueError("X_train and y_train must have the same number of rows")

        predictions = []
        for test_point in test_features:
            # One vectorized call gives the distance to every training row
            distances = self._calculate_distance(train_features, test_point)

            # Stable sort: equidistant neighbours are taken in training order
            k_nearest_indices = np.argsort(distances, kind='stable')[:self.k]
            k_nearest_labels = train_labels[k_nearest_indices]

            # Majority vote; np.unique sorts the labels, so argmax picks the
            # smallest label when several labels share the top count
            labels, counts = np.unique(k_nearest_labels, return_counts=True)
            predictions.append(labels[np.argmax(counts)])

        return np.array(predictions)

# Build the classifier with Manhattan distance. KNN has no training phase:
# the constructor only stores the training split, and all the work happens
# in predict().
knn_classifier_manhattan = KNNClassifier(train_df_split, train_labels, k=3, distance_metric='manhattan')

# Predict a label for every row of the validation split
predictions_manhattan = knn_classifier_manhattan.predict(val_df)

# Print the predictions
print("Predictions with Manhattan distance:", predictions_manhattan)

# 2: Support Vector Machines

Basic operating principle: draw a hyperplane between the two classes that maximises the margin, i.e. the distance from the hyperplane to the closest points of each class (the support vectors). Applying kernel functions such as radial basis functions or polynomials allows more complex separations. Picture it as embedding the data points in a higher-dimensional space and cutting that space with a hyperplane to separate the points.

# 3: Decision Trees

# 3a: Random Forest