In [None]:
%%capture
%config InlineBackend.figure_format = 'retina'
from sklearn import datasets
from sklearn import impute
from sklearn import neighbors
from sklearn import preprocessing
from sklearn import svm
from sklearn import compose
from sklearn import tree
from sklearn import linear_model
from sklearn import neural_network
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.io as sio
import seaborn as sns
import warnings
from joblib import Parallel, delayed
warnings.filterwarnings('ignore')
# pip install gspread and df2gspread
import sys
!{sys.executable} -m pip install gspread
!{sys.executable} -m pip install numpy df2gspread
import gspread
from df2gspread import df2gspread as d2g
from oauth2client.service_account import ServiceAccountCredentials

# Preprocessing data sets

In [None]:
# California housing data set, recast as a binary classification problem:
# the label says whether a house is priced at/above the median (+1) or
# below it (-1).
X0, Y0 = datasets.fetch_california_housing(return_X_y=True)

# Standardize every feature; the scaler is fit on the complete data set.
X0 = preprocessing.StandardScaler().fit(X0).transform(X0)
# Threshold the continuous target at its median and encode it as float32 +/-1.
Y0 = np.where(Y0 >= np.median(Y0), 1, -1).astype(np.float32)

X0.shape, Y0.shape

In [None]:
# 'Adult' data set from UCI repository
path = 'data/adult/adult'
adult_data = pd.read_csv(f'{path}.data', header=None)
# skiprows=1 drops the first line of the .test file (presumably a non-data
# line -- confirm against the file).
adult_test = pd.read_csv(f'{path}.test', skiprows=1, header=None)
# The two files are stacked into a single frame before preprocessing.
adult = pd.concat([adult_data, adult_test])
# These are the columns containing categorical values
categorical = [13, 9, 8, 7, 6, 5, 3, 1]

# Column 14 holds the label, so it is dropped before building the features.
X1 = Pipeline([
    # Entries equal to ' ?' are treated as missing and replaced by the most
    # frequent value of their column.
    ('impute_missing', impute.SimpleImputer(missing_values=' ?', strategy='most_frequent')),
    # One-hot encode the categorical columns and standardize all the others;
    # sparse_threshold=0 forces a dense result.
    ('norm_and_onehot',
        compose.ColumnTransformer(
            sparse_threshold=0, 
            transformers=[
                ('onehot', preprocessing.OneHotEncoder(), categorical)
            ], 
            remainder=preprocessing.StandardScaler()
        )
    ),
    # Keep as many principal components as needed to explain 80% of the variance.
    ('pca', PCA(n_components=0.8, svd_solver='full')),
]).fit_transform(adult.drop(14, axis=1).to_numpy())

Y1 = adult[14].to_numpy()
# A boolean array using +1 for >50K and -1 for <=50K. Both spellings of the
# positive label (' >50K' and ' >50K.') are accepted.
# `np.float` was removed in NumPy 1.24; `np.float64` is the dtype it aliased.
Y1 = 2 * np.logical_or(Y1 == ' >50K.', Y1 == ' >50K').astype(np.float64) - 1

X1.shape, Y1.shape

In [None]:
# Optical digits: the training and test files are concatenated into one frame.
path = 'data/optdigits/optdigits.tra'
orig_train = pd.read_csv(path, header=None)
orig_test = pd.read_csv('data/optdigits/optdigits.tes', header=None)
optdigits = pd.concat((orig_train, orig_test))

# The last column is used as the label; every other column is a feature and
# gets standardized.
X2 = preprocessing.StandardScaler().fit_transform(optdigits.to_numpy()[..., :-1])
Y2 = optdigits.to_numpy()[..., -1]

X2.shape, Y2.shape

In [None]:
# Abalone data set, read without a header row and converted to a plain array.
path = 'data/abalone/abalone.data'
abalone = pd.read_csv(path, header=None).to_numpy()

# Features are every column except the last: column 0 is one-hot encoded,
# the remaining columns are standardized. sparse_threshold=0 forces a dense
# result.
X3 = Pipeline([
    ('preprocess', 
         compose.ColumnTransformer(
            sparse_threshold=0,
            transformers=[
                ('onehot', preprocessing.OneHotEncoder(), [0])
            ],
            remainder=preprocessing.StandardScaler())
    ),
]).fit_transform(abalone[:, :-1])

# Binary label from the last column: +1 when the value is <= 9, otherwise -1.
# `np.float` was removed in NumPy 1.24; `np.float64` is the dtype it aliased.
Y3 = 2 * (abalone[:, -1] <= 9).astype(np.float64) - 1

X3.shape, Y3.shape

In [None]:
# Every data set as a (features, labels, name) triple; the name is what ends
# up in the 'dataset' column of the collected results.
DATASETS = list(zip(
    (X0, X1, X2, X3),
    (Y0, Y1, Y2, Y3),
    ('california_housing', 'adult', 'optdigits', 'abalone'),
))

# Setting up classifiers

In [None]:
# Each entry is (short name, unfitted estimator, hyper-parameter grid); the
# grid is what gets handed to the grid search for that estimator.
CLASSIFIERS = [
    # Random forest.
    ('RF', RandomForestClassifier(), dict(
        max_depth=[2, 4, 8, 16],
        n_estimators=[50, 100, 150],
    )),
    # k-NN: neighbourhood sizes are the powers of two from 1 to 128.
    ('KNN', neighbors.KNeighborsClassifier(), dict(
        n_neighbors=[2**k for k in range(8)],
    )),
    # Linear SVM: C sweeps the powers of ten from 1e-8 to 1e4.
    ('SVM', svm.LinearSVC(penalty='l2', loss='hinge', max_iter=10000), dict(
        C=[10**exponent for exponent in range(-8, 5)],
    )),
    # Decision tree.
    ('DT', tree.DecisionTreeClassifier(min_samples_split=0.05), dict(
        min_samples_split=[0.025, 0.05, 0.1, 0.2],
        criterion=['gini', 'entropy'],
    )),
    # Logistic regression: same C sweep as the SVM.
    ('LR', linear_model.LogisticRegression(), dict(
        C=[10**exponent for exponent in range(-8, 5)],
    )),
    # Neural network (multi-layer perceptron).
    ('ANN', neural_network.MLPClassifier(), dict(
        alpha=[10**exponent for exponent in range(-5, 0)],
        activation=['relu', 'logistic', 'tanh'],
        hidden_layer_sizes=[(64,), (72, 24), (16, 13, 7)],
    )),
]

# Just while testing: this rebinds CLASSIFIERS, so every later cell only sees
# this single, reduced decision-tree search.
CLASSIFIERS = [
    ('DT', tree.DecisionTreeClassifier(min_samples_split=0.05), dict(
        min_samples_split=[0.025, 0.05],
        criterion=['gini', 'entropy'],
    )),
]

# Functions for training

- For each of the four data sets:
    - For each of three train/test partitions (20/80, 50/50, 80/20):
        - For each of three trials:
            - For each classifier:
                - cross-validate
                - find the optimal hyper-parameters
                - train using the hyper-parameters above
                - obtain the training and validation accuracy/error
                - test
                - obtain the testing accuracy
        - compute the averaged accuracy (training, validation, and testing) for each classifier over the three trials
        - rank order the classifiers


In [None]:
# Enumerate the full experiment: every data set x train size x trial x classifier.
def params():
    """Lazily generate one work item per (data set, train size, trial, classifier).

    A fresh random train/test split is drawn once per (data set, train size,
    trial) and shared by every classifier in ``CLASSIFIERS`` for that trial.

    Yields:
        dict with two keys:
          * ``'meta'``: bookkeeping values (``dataset``, ``train_size``,
            ``trial``, ``classifier``) used when collecting results.
          * ``'data'``: what is needed to train (``clf``, ``params``,
            ``x_train``, ``y_train``, ``x_test``, ``y_test``).
    """
    train_fractions = [0.2, 0.5, 0.8]
    trial_numbers = range(3)
    experiment_grid = itertools.product(DATASETS, train_fractions, trial_numbers)

    # This is gonna take some time.
    for (features, labels, dataset_name), fraction, trial_no in experiment_grid:
        # One split per trial, reused for all classifiers below.
        x_tr, x_te, y_tr, y_te = train_test_split(features, labels, train_size=fraction)

        for clf_name, estimator, param_grid in CLASSIFIERS:
            # Meta data to be used for data collection purposes.
            meta = {
                'dataset': dataset_name,
                'train_size': fraction,
                'trial': trial_no,
                'classifier': clf_name,
            }
            # Everything required in order to train this model.
            data = {
                'clf': estimator,
                'params': param_grid,
                'x_train': x_tr,
                'y_train': y_tr,
                'x_test': x_te,
                'y_test': y_te,
            }
            yield {'meta': meta, 'data': data}

In [None]:
def _train_instance(x_train, y_train, x_test, y_test, clf, params):
    """Grid-search ``clf`` on the training split and evaluate the best model.

    Args:
        x_train, y_train: training features and labels given to the grid search.
        x_test, y_test: held-out features and labels used for the final scores.
        clf: the estimator to tune.
        params: hyper-parameter grid passed to ``GridSearchCV``.

    Returns:
        dict with the keys ``f_score`` (macro-averaged F1 on the test split),
        ``training_accuracy`` and ``validation_accuracy`` (mean train/test
        scores of the best grid point over the 5 CV folds), ``test_accuracy``,
        ``params`` (the selected hyper-parameters) and ``cv_results`` (the full
        ``cv_results_`` of the search).
    """
    # Fit the classifier to the transformed input.
    grid_search = GridSearchCV(clf, params, cv=5, return_train_score=True, n_jobs=-1)
    grid_search.fit(x_train, y_train)
    # After determining the best hyper parameters, we can attain the
    # estimator that performed the best.
    estimator = grid_search.best_estimator_
    best_index = grid_search.best_index_
    cv_results = grid_search.cv_results_

    # Predict the test split once and reuse the predictions for both the
    # accuracy and the F-score. For a classifier, `estimator.score` is the
    # accuracy of `estimator.predict`, so calling both would predict twice.
    y_pred = estimator.predict(x_test)

    return {
        'f_score': f1_score(y_test, y_pred, average='macro'),
        'training_accuracy': cv_results['mean_train_score'][best_index],
        'validation_accuracy': cv_results['mean_test_score'][best_index],
        'test_accuracy': accuracy_score(y_test, y_pred),
        # The actual hyper-parameters that were selected.
        'params': grid_search.best_params_,
        'cv_results': cv_results,
    }
    
def train_instance(param):
    """Train a single work item and return one flat result record.

    Args:
        param: dict with a ``'meta'`` mapping (copied into the record as-is)
            and a ``'data'`` mapping (passed to ``_train_instance`` as keyword
            arguments).

    Returns:
        dict containing the meta entries followed by the training results;
        on a key clash the training result wins.
    """
    record = dict(param['meta'])
    record.update(_train_instance(**param['data']))
    return record

In [None]:
def train(seed = 1, parameters = None):
    """Train every work item in parallel and collect the results.

    Args:
        seed: value used to seed NumPy's global random number generator.
        parameters: iterable of work items as produced by ``params()``.
            ``None`` (the default) means "a fresh ``params()``". It is resolved
            on every call because a generator used as a default value would be
            created once at definition time and be exhausted after the first
            call.

    Returns:
        pandas.DataFrame with one row per work item.
    """
    if parameters is None:
        parameters = params()
    # For reproducible results
    np.random.seed(seed)
    # Parallel training
    return pd.DataFrame(
        Parallel(n_jobs=-1, verbose=50)(
            delayed(train_instance)(param) for param in parameters
        )
    )

# Distributed training
Since training is boring and time-consuming, we want to parallelize it as much as possible. The `train` function above already runs its work items in parallel on a single machine. However, we can do better: by distributing the computation among several workers (machines), we can get a substantial speedup. Therefore, I am proud to present the world's most ad hoc distributed machine learning training algorithm, ever.

In [None]:
import os

# When distributing work across several machines, we'll use a very simple (and inefficient) method to divide
# the work between them. We'll simply assign each of N machines an id in range(0, N), and give each of them an
# equal chunk of the total work. The id is here represented by the environment variable `WORKER_NUM`, while the
# number of workers is denoted by `TOTAL_WORKERS`. Default: *single machine doing all the work*.
user = int(os.environ.get('WORKER_NUM', 0))
workers = int(os.environ.get('TOTAL_WORKERS', 1))
if workers < 1 or not 0 <= user < workers:
    # Fail loudly on a misconfigured worker instead of silently training on
    # an empty (or wrong) slice of the work.
    raise ValueError(
        f'Expected TOTAL_WORKERS >= 1 and 0 <= WORKER_NUM < TOTAL_WORKERS, '
        f'got WORKER_NUM={user}, TOTAL_WORKERS={workers}'
    )

# The train/test splits are drawn while the work list is materialized, so the
# random number generator has to be seeded *before* that happens. Using the
# same seed on every worker makes all of them build the identical list and
# merely pick different slices of it.
np.random.seed(0)

# Retrieve the work to be done by this instance.
work = list(params())
# Ceiling division: the smallest chunk size that still covers all the work, so
# an evenly divisible work list is spread over every worker.
items_per_worker = -(-len(work) // workers)
work = work[user * items_per_worker: (user + 1) * items_per_worker]

# Train on this worker's subset.
results = train(seed = user, parameters=work)

In [None]:
# Display the results when running in Jupyter with a GUI.
results

In [None]:
# Deliberately raise here so that running all cells does not continue into the
# Google Sheets upload cells below.
raise ValueError("This is just here to stop the upload to google sheets");

# Upload to Google Sheets
Now that the results have been computed, we want to upload them to Google Sheets. This lets us easily crowdsource the computation among "gutta" (the guys).

> NOTE: The keyfile is not included in this submission, so this portion of the project will crash horribly. But, it's only really needed when doing distributed training anyway.

## Authorization for using Google Cloud API.

In [None]:
# Service-account key file; it is expected next to the notebook.
keyfile = 'gserviceaccount-client-secret.json'
# The API scopes the credentials are requested for.
scope = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive',
]

# Load the service-account credentials and authorize a gspread client with them.
credentials = ServiceAccountCredentials.from_json_keyfile_name(keyfile, scope)
gc = gspread.authorize(credentials)

## Uploading the computed results.

In [None]:
spreadsheet_key = '1QUMP6tlBqR3CqYlh7uC18e_XA4FgMT9SqMSc5agoGUA'

# Worker 0 starts at row 1 and also writes the header row. Every other worker
# starts one row further down, which accounts for that header on top of the
# shift from our 0-based indexing to the sheet's 1-based rows.
offset = 2 if user != 0 else 1
d2g.upload(
    results,
    spreadsheet_key,
    wks_name='Results',
    start_cell='A' + str(user * items_per_worker + offset),
    # Only the first worker writes the column names.
    col_names=(user == 0),
    row_names=False,
    clean=False,
    credentials=credentials,
)