In [None]:
import numpy as np
import pandas as pd
import tqdm
import time
import matplotlib.pyplot as plt


# 0: The Data

For Basic classification we just use Kaggle's Spaceship Titanic dataset. It's a simple dataset with a few features and a binary label.
Download from: https://www.kaggle.com/competitions/spaceship-titanic/data

### Explore the data briefly, see summary, histograms, ranges, correlations, etc.

The goal here is to get a feel for the data, and to see if there are any obvious issues with it.
Also we prepare the data for learning by doing some basic preprocessing like assigning numerical labels to categorical columns, removing NaNs, imputing values if needed, and normalizing ranges.

In [None]:
# Load the training split of the Spaceship Titanic dataset. The path is relative to
# the notebook, so the Kaggle CSV must be placed under ../data/spaceship-titanic/.
train_df = pd.read_csv('../data/spaceship-titanic/train.csv')

In [None]:
# Preview the first few rows to see which columns exist and how values are formatted
train_df.head()

From a common-sense point of view, PassengerId shouldn't affect whether a passenger was transported, unless it is a proxy for some feature that isn't in the dataset. Since we can't rule that out yet, we won't drop it for now.

Our 0 / 1 classification label here is Transported. Our objective is to predict whether a passenger was transported or not given the other features.

In [None]:
# Column dtypes and non-null counts: a quick way to spot columns with missing values
train_df.info()

In [None]:
# Summarise the object-dtype (categorical / string) columns: non-null count, number of
# distinct values, the most frequent value and how often it occurs
train_df.describe(include=['O'])

In [None]:
# Build a fully numeric copy of the data:
#   * categorical columns (object / bool / anything non-numeric) -> integer category codes
#   * numeric columns -> z-score standardisation (zero mean, unit standard deviation)
corr_df = train_df.copy()
for col in corr_df.columns:
    column = corr_df[col]
    # NOTE: the previous test `dtype == 'object' or 'bool'` was always truthy (a non-empty
    # string is truthy), so every column was turned into category codes and the
    # normalisation branch never ran. bool is excluded explicitly here because pandas
    # counts it as a numeric dtype.
    is_plain_numeric = (
        pd.api.types.is_numeric_dtype(column)
        and not pd.api.types.is_bool_dtype(column)
    )
    if is_plain_numeric:
        # mean() and std() skip NaN, so missing entries stay NaN after this step
        standardized = (column - column.mean()) / column.std()
        # Mean imputation: the column mean is 0 after standardising. This also covers
        # constant columns, where std == 0 turns every value into NaN.
        corr_df[col] = standardized.fillna(0.0)
    else:
        # Missing values receive the code -1, i.e. they form their own "unknown" category
        corr_df[col] = column.astype('category').cat.codes

print(corr_df.head())


In [None]:
# Summary statistics of the encoded frame, used as a sanity check on the value ranges
corr_df.describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise Pearson correlation between all encoded columns
correlation_matrix = corr_df.corr(method='pearson')

# Render the matrix as an annotated heatmap on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix Heatmap')
plt.show()


CryoSleep seems to be decently correlated with Transported, so we should expect it to be a major feature in our model.
RoomService seems to be negatively correlated with Transported, so we should expect it to be a major feature in our model.

Anyway, we're not hand-engineering features here. The goal is to try out some basic classic ML approaches for classification and see how well they work. Let's go.


In [None]:
# Random 80:20 train / validation split
df = corr_df.copy()

# Seed NumPy's global generator so the shuffle below is reproducible
np.random.seed(42)

# Shuffle the row positions in place
indices = np.arange(len(df))
np.random.shuffle(indices)

# The first 80% of the shuffled positions go to training, the rest to validation
split_index = int(0.8 * len(df))
train_positions = indices[:split_index]
val_positions = indices[split_index:]

# Select the rows and renumber them from 0 so positional and label indexing agree
train_df_split = df.iloc[train_positions].reset_index(drop=True)
val_df = df.iloc[val_positions].reset_index(drop=True)

# Separate the target column from the feature columns
train_labels = train_df_split['Transported']
val_labels = val_df['Transported']
train_df_split = train_df_split.drop(columns=['Transported'])
val_df = val_df.drop(columns=['Transported'])

# Report the resulting shapes
print("Train set shape:", train_df_split.shape)
print("Validation set shape:", val_df.shape)
print("train_labels", train_labels.shape)
print("val_labels", val_labels.shape)

# Good Ol' Machine Learning.

# 1: K Nearest Neighbours

K Nearest Neighbours classifies a given point in an N-dimensional space based on its k nearest neighbours under some distance metric, usually the Euclidean metric, i.e. the L2 norm. This might be problematic with categorical features, because how exactly does distance play a part there? E.g. the planet category takes values such as Earth, TRAPPIST, etc., and placing these linearly on a numerical axis makes little sense. But let's see how our approaches handle them.

In [None]:
# k nearest neighbours
class KNNClassifier:
    """Brute-force k-nearest-neighbours classifier.

    Each test point is given the most common label among its ``k`` closest
    training points. On a tie the smallest label wins (``np.argmax`` picks the
    first maximum of the ``np.bincount`` histogram).

    Args:
        X_train: Training features, shape (n_samples, n_features). Anything
            ``np.asarray`` accepts (ndarray, DataFrame, nested list).
        y_train: Non-negative integer class labels, one per training row.
        k: Number of neighbours that vote; must be >= 1.
        distance_metric: 'euclidean' (L2) or 'manhattan' (L1). It is validated
            when a distance is first computed, not at construction time.

    Raises:
        ValueError: If ``k < 1`` or the number of rows and labels differ.
    """

    def __init__(self, X_train: np.ndarray, y_train: np.ndarray, k=3, distance_metric='euclidean'):
        if k < 1:
            raise ValueError("k must be a positive integer")
        self.k = k
        self.distance_metric = distance_metric
        # Cast to float64. Integer feature arrays (e.g. int16 category codes) would
        # otherwise overflow silently when differences are squared, which corrupts
        # the Euclidean distances. np.asarray also unwraps DataFrames, which would
        # otherwise be iterated column-label by column-label.
        self.X_train = np.asarray(X_train, dtype=float)
        # Plain array so labels are looked up by position, not by pandas index label
        self.y_train = np.asarray(y_train)
        if len(self.X_train) != len(self.y_train):
            raise ValueError("X_train and y_train must contain the same number of rows")

    def _calculate_distance(self, point1: np.ndarray, point2: np.ndarray):
        """Distance between ``point1`` and ``point2`` along the last axis.

        The inputs broadcast against each other, so passing a 2-D array and a
        single point returns one distance per row; two 1-D points give a scalar.

        Raises:
            ValueError: If ``distance_metric`` is not 'euclidean' or 'manhattan'.
        """
        difference = np.asarray(point1, dtype=float) - np.asarray(point2, dtype=float)
        # The square root of the sum of squared per-axis differences.
        # Sensitive to the largest difference, since it gets squared.
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum(difference ** 2, axis=-1))
        # The sum of absolute per-axis differences
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(difference), axis=-1)
        else:
            raise ValueError("Unsupported distance metric")

    def predict(self, X_test: np.ndarray):
        """Predict a label for every row of ``X_test``.

        Args:
            X_test: Points to classify, of the form [point1, point2, ... pointN].

        Returns:
            1-D array holding one predicted label per test point.
        """
        X_test = np.asarray(X_test, dtype=float)
        predictions = []
        for test_point in tqdm.tqdm(X_test, desc='Calculating distances', unit='point'):
            # Distances to all training points in one vectorised call
            distances = self._calculate_distance(self.X_train, test_point)

            # Indices of the k nearest neighbours; the stable sort makes equidistant
            # points resolve deterministically in favour of the lower row index
            k_nearest_indices = np.argsort(distances, kind='stable')[:self.k]

            # Labels of those neighbours
            k_nearest_labels = self.y_train[k_nearest_indices]

            # Majority vote among the k nearest neighbours
            predicted_label = np.argmax(np.bincount(k_nearest_labels))

            predictions.append(predicted_label)

        return np.array(predictions)

# KNN has no training phase: constructing the classifier only stores the data.
# Pull out the raw arrays once and reuse them for both distance metrics.
knn_train_X = train_df_split.values
knn_train_y = train_labels.values
knn_val_X = val_df.values

# Manhattan (L1) distance
knn_classifier_manhattan = KNNClassifier(knn_train_X, knn_train_y, k=3, distance_metric='manhattan')
predictions_manhattan = knn_classifier_manhattan.predict(knn_val_X)

# Euclidean (L2) distance
knn_classifier_euclidean = KNNClassifier(knn_train_X, knn_train_y, k=3, distance_metric='euclidean')
predictions_euclidean = knn_classifier_euclidean.predict(knn_val_X)

### Testing Accuracy:
We just subtract predictions from the labels, take the absolute value and calculate the mean. Subtracting this mean from 1 should give us a 0-1 accuracy metric. i.e 1 implies 100%, 0 implies 0%.

In [None]:
# For 0/1 predictions and labels, the mean absolute difference is the error rate
manhattan_error = abs(predictions_manhattan - val_labels).mean()
euclidean_error = abs(predictions_euclidean - val_labels).mean()

# Accuracy is the complement of the error rate
manhattan_accuracy = 1 - manhattan_error
euclidean_accuracy = 1 - euclidean_error

print("manhattan accuracy", manhattan_accuracy)
print("euclidean accuracy", euclidean_accuracy)

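The Euclidean metric seems to be almost as good as random. This is weird; I would have expected somewhat better performance. A likely culprit is integer overflow: every column was encoded as small-integer category codes, so the feature array passed to KNN has a narrow integer dtype (probably `int16`), and squaring differences in the hundreds or thousands (e.g. between PassengerId codes) wraps around before the sum is taken. The Manhattan distance never squares anything, so it is unaffected. Casting the features to float before computing distances should confirm or rule this out.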

### Confusion Matrix

The confusion matrix is a visualisation of how many true positives, false positives, true negatives and false negatives our model predicts.
It is useful for getting a sense of how wrong our model is, and where it might be going wrong.

In [None]:
# confusion matrix

# Calculate the confusion matrix
def plot_confusion_matrix(pred, labels, title='Confusion Matrix'):
    """Plot the 2x2 confusion matrix of a binary classifier as a heatmap.

    Rows are the true class, columns the predicted class, so cell [i, j] counts
    the samples of actual class i that were predicted as class j.

    Args:
        pred: Predicted classes (0 or 1), array-like. Float values such as 0.0 / 1.0
            are accepted and converted to integers.
        labels: True classes (0 or 1), array-like of the same length. A pandas
            Series is read positionally, regardless of its index.
        title: Figure title. The default is generic because the function is reused
            for several models (it used to be hard-coded to the Manhattan KNN run).

    Raises:
        ValueError: If the inputs differ in length or contain anything but 0 and 1.
    """
    pred_classes = np.asarray(pred).astype(int).ravel()
    true_classes = np.asarray(labels).astype(int).ravel()
    if pred_classes.shape != true_classes.shape:
        raise ValueError("pred and labels must have the same length")
    # Reject out-of-range classes explicitly: a -1 would otherwise wrap around via
    # negative indexing and be silently counted as class 1.
    all_classes = np.concatenate([true_classes, pred_classes])
    if all_classes.size and ((all_classes < 0) | (all_classes > 1)).any():
        raise ValueError("pred and labels must only contain the classes 0 and 1")

    conf_matrix = np.zeros((2, 2))
    # Unbuffered accumulation: repeated (true, predicted) pairs are each counted
    np.add.at(conf_matrix, (true_classes, pred_classes), 1)

    # Plot the confusion matrix using seaborn
    sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues', xticklabels=['Predicted 0', 'Predicted 1'], yticklabels=['Actual 0', 'Actual 1'])
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(title)
    plt.show()

In [None]:
# Confusion matrix of the Manhattan-distance KNN predictions on the validation set
plot_confusion_matrix(predictions_manhattan, val_labels)

In [None]:
# Confusion matrix of the Euclidean-distance KNN predictions on the validation set
plot_confusion_matrix(predictions_euclidean, val_labels)

# 2: Support Vector Machines

Basic operating principle: draw a hyperplane between the two classes that maximises the margin, i.e. the distance from the hyperplane to the closest points on either side (the support vectors). Applying radial basis functions or polynomials can increase the complexity of the separation. Picture it as embedding the data points in a more complicated manifold and cutting across it with a hyperplane to separate the points. SVMs offer a neat way to do this using the kernel trick. https://medium.com/@zxr.nju/what-is-the-kernel-trick-why-is-it-important-98a98db0961d

## Extra work: compare with scikit learn's SVM, and improve our implementation
https://scikit-learn.org/stable/modules/svm.html

In [None]:
# Re-encode the 0/1 labels as -1/+1 for the hinge loss:
# every 0 is replaced by -1, every other value is left unchanged
train_labels_hinge = train_labels.mask(train_labels == 0, -1)

val_labels_hinge = val_labels.mask(val_labels == 0, -1)

In [None]:
class SVM:
    """Soft-margin SVM trained in the primal with full-batch sub-gradient descent.

    Minimises ``mean(max(0, 1 - y * (X @ w + b))) + (lambda_param / 2) * ||w||^2``
    over the (optionally transformed) features. Labels must be encoded as -1 / +1.

    ``kernel`` selects an explicit feature transform applied before the linear model:
      * 'linear'     - features are used unchanged.
      * 'polynomial' - every feature is raised element-wise to ``degree``.
      * 'rbf'        - each sample is described by its similarity
                       ``exp(-gamma * ||x - x_i||^2)`` to every training sample x_i.

    Args:
        learning_rate: Step size of each gradient update.
        lambda_param: L2 regularisation strength on the weights.
        num_epochs: Number of full-batch updates performed by ``fit``.
        kernel: 'linear', 'polynomial' or 'rbf'.
        degree: Exponent used by the 'polynomial' transform.
        gamma: Width parameter of the 'rbf' transform.
    """

    def __init__(self, learning_rate=0.01, lambda_param=0.001, num_epochs=1000, kernel='linear', degree=2, gamma=1.0):
        self.learning_rate = learning_rate
        self.lambda_param = lambda_param
        self.num_epochs = num_epochs
        self.weights = None
        self.bias = None
        self.kernel = kernel
        self.degree = degree
        self.gamma = gamma
        # Training samples remembered by fit(); the 'rbf' transform measures
        # similarity against them at prediction time.
        self._X_fit = None

    def _apply_kernel(self, X):
        """Transform ``X`` according to ``self.kernel``.

        Raises:
            ValueError: If ``self.kernel`` is not a supported name.
        """
        # Work in float64 so integer inputs cannot overflow in ``**``
        X = np.asarray(X, dtype=float)
        if self.kernel == 'linear':
            return X
        elif self.kernel == 'polynomial':
            return X ** self.degree
        elif self.kernel == 'rbf':
            # Compare against the training samples so the number of output columns
            # always matches the fitted weights. Before fit() has run, fall back to
            # comparing X with itself.
            reference = getattr(self, '_X_fit', None)
            if reference is None:
                reference = X
            # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, which needs only an (n, m)
            # array instead of the (n, m, d) array of all pairwise differences
            squared_distances = (
                np.sum(X ** 2, axis=1)[:, np.newaxis]
                + np.sum(reference ** 2, axis=1)[np.newaxis, :]
                - 2.0 * np.dot(X, reference.T)
            )
            # Rounding can push values that should be 0 slightly below 0
            np.maximum(squared_distances, 0.0, out=squared_distances)
            return np.exp(-self.gamma * squared_distances)
        else:
            raise ValueError("Invalid kernel. Supported kernels are 'linear', 'polynomial', and 'rbf'.")

    def _hinge_loss(self, X, y):
        """Per-sample hinge loss ``max(0, 1 - y * (X @ w + b))``."""
        scores = 1 - y * (np.dot(X, self.weights) + self.bias)
        return np.maximum(0, scores)

    def _gradient_descent_step(self, X, y):
        """Apply one full-batch sub-gradient update to the weights and bias.

        Returns:
            Float array holding 1.0 for samples whose hinge loss is positive
            (margin violated) and 0.0 for all others.
        """
        # Only samples with a positive hinge loss contribute to the sub-gradient
        active = (self._hinge_loss(X, y) > 0).astype(float)
        n_samples = len(y)
        weight_gradient = -np.dot(X.T, y * active) / n_samples + self.lambda_param * self.weights
        # d(loss)/d(bias) = -mean(y * active). It carries the same leading minus sign
        # as the weight gradient; without it the bias moved uphill on the loss.
        bias_gradient = -np.sum(y * active) / n_samples
        self.weights -= self.learning_rate * weight_gradient
        self.bias -= self.learning_rate * bias_gradient
        return active

    def fit(self, X, y):
        """Train on features ``X`` (n_samples, n_features) and labels ``y`` in {-1, +1}.

        Raises:
            ValueError: If ``X`` is not a non-empty 2-D array or the number of
                rows and labels differ.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if X.ndim != 2 or len(X) == 0:
            raise ValueError("X must be a non-empty 2-D array")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of rows")

        # Must be stored before the transform: 'rbf' compares against these samples
        self._X_fit = X

        # Apply the selected kernel
        X_transformed = self._apply_kernel(X)

        # Initialize weights and bias
        self.weights = np.zeros(X_transformed.shape[1])
        self.bias = 0.0

        # Training loop
        for _ in tqdm.tqdm(range(self.num_epochs), desc='Training', unit='epoch'):
            self._gradient_descent_step(X_transformed, y)

    def predict(self, X):
        """Return the sign of the decision function for every row of ``X``.

        The result holds -1.0 or +1.0, and 0.0 for a sample lying exactly on the
        decision boundary.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.weights is None:
            raise RuntimeError("SVM.predict called before fit")
        # Apply the selected kernel to the input for prediction
        X_transformed = self._apply_kernel(X)
        return np.sign(np.dot(X_transformed, self.weights) + self.bias)

In [None]:
# Train a default (linear) SVM on the -1/+1 labels
svm = SVM()
svm.fit(train_df_split.values, train_labels_hinge.values)

# predict() returns signs; map -1 back to class 0 so the result is comparable to val_labels
svm_signs = svm.predict(val_df.values)
predictions_svm = np.where(svm_signs == -1, 0, svm_signs)

In [None]:
# For 0/1 predictions and labels, the mean absolute difference is the error rate
svm_error = abs(predictions_svm - val_labels).mean()
svm_accuracy = 1 - svm_error
print("svm accuracy", svm_accuracy)

In [None]:
# Confusion matrix of the SVM predictions; they are cast to an integer dtype first
# so they can serve as indices into the confusion matrix
plot_confusion_matrix(np.array(predictions_svm, dtype='int8'), val_labels)


# 3: Decision Trees

We're going to breeze through this and just use sklearn for now. We can come back and implement it in numpy later.
Basically it is going column by column, checking how well each column predicts the label, then selecting the most predictive column as a rule, using some relevant if-condition and bifurcating the rows based on that into yes / no -> then repeating the process with other columns for each bifurcation.

Random forests train N trees, each on a random sample of the rows and with random subsets of the features, and then ensemble (vote or average) their outputs into a better prediction.

Gradient boosting builds trees one after another, with each new tree fit to the "error" (residual) left by the ensemble built so far.

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with default hyper-parameters
tree_classifier = DecisionTreeClassifier()

# Learn the split rules from the training rows
tree_classifier.fit(train_df_split, train_labels)

# Class predictions for the held-out validation rows
predictions_tree = tree_classifier.predict(val_df)

# Accuracy: fraction of validation rows whose prediction matches the label
tree_hits = predictions_tree == val_labels
tree_accuracy = tree_hits.mean()
print("Decision Tree accuracy:", tree_accuracy)


# 3a: Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters
rf_classifier = RandomForestClassifier()

# Train the ensemble on the training rows
rf_classifier.fit(train_df_split, train_labels)

# Class predictions for the held-out validation rows
predictions_rf = rf_classifier.predict(val_df)

# Accuracy: fraction of validation rows whose prediction matches the label
rf_hits = predictions_rf == val_labels
rf_accuracy = rf_hits.mean()
print("Random Forest accuracy:", rf_accuracy)


# 3b: Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees with default hyper-parameters
gb_classifier = GradientBoostingClassifier()

# Train the boosted ensemble on the training rows
gb_classifier.fit(train_df_split, train_labels)

# Class predictions for the held-out validation rows
predictions_gb = gb_classifier.predict(val_df)

# Accuracy: fraction of validation rows whose prediction matches the label
gb_hits = predictions_gb == val_labels
gb_accuracy = gb_hits.mean()
print("Gradient Boosting accuracy:", gb_accuracy)


In [None]:
import xgboost as xgb

# XGBoost classifier with default hyper-parameters
xgb_classifier = xgb.XGBClassifier()

# Train on the training rows
xgb_classifier.fit(train_df_split, train_labels)

# Class predictions for the held-out validation rows
predictions_xgb = xgb_classifier.predict(val_df)

# Accuracy: fraction of validation rows whose prediction matches the label
xgb_hits = predictions_xgb == val_labels
xgb_accuracy = xgb_hits.mean()
print("XGBoost accuracy:", xgb_accuracy)


We're getting close to 0.82 with very little effort. How can we improve it?
- Explore feature importance
- 