In [1]:
import sys, os
import numpy as np
from load_data import load_data
from utils import save_submission
from data_preprocessing import flatten_data

# Make sure the download folder and the submission folder exist before any
# cell tries to write into them (no error if they are already there).
for directory in ('data', 'submission'):
    os.makedirs(directory, exist_ok=True)

In [57]:
# Notebook housekeeping before any data is fetched.
# Autosave every second (the recorded output confirms the unit is seconds).
# NOTE(review): presumably this keeps the on-disk notebook.ipynb current for
# the copy made in the final cell - confirm.
%autosave 1
# Load the AIcrowd IPython extension, which provides the %aicrowd magic.
%load_ext aicrowd.magic
# Authenticate; the recorded output shows an interactive login link followed
# by a confirmation that the credentials were saved.
%aicrowd login

Autosaving every 1 seconds
Please login here: https://api.aicrowd.com/auth/mttGkNm24VrEoYa78dn-wAdUjCuNrmt_a1vJKPALnHA
API Key valid
Gitlab access token valid
Saved details successfully!


In [58]:
# Download the challenge files; the recorded output shows data.npz and
# sample_submission.csv being fetched.
# NOTE(review): presumably -c selects the challenge and -o the output folder - confirm in the CLI help.
%aicrowd ds dl -c obstacle-prediction -o data

data.npz:   0%|          | 0.00/6.52M [00:00<?, ?B/s]

sample_submission.csv:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

In [5]:
# Load both splits; the second value returned for the test split is discarded.
train_data, train_labels = load_data('train')
test_data, _ = load_data('test')

# Flatten each split with the project helper.
# NOTE(review): assumes the result is a 2-D float array (samples x features) - confirm in data_preprocessing.
train_data, test_data = flatten_data(train_data), flatten_data(test_data)

# -1 is used as the missing-value marker: overwrite it with NaN, in place, in both splits.
for split in (train_data, test_data):
    split[split == -1] = np.nan

## Remove columns containing only -1
#to_remove = np.where(np.all(np.isnan(train_data), axis=0))[0]
#train_data = np.delete(train_data, to_remove, axis=1)
#test_data = np.delete(test_data, to_remove, axis=1)

# Remove training samples whose features are all missing, together with their labels.
all_missing_rows = np.isnan(train_data).all(axis=1).nonzero()[0]
train_data = np.delete(train_data, all_missing_rows, axis=0)
train_labels = np.delete(train_labels, all_missing_rows, axis=0)

In [70]:
from sklearn.impute import SimpleImputer, KNNImputer

# Replace the remaining NaNs with per-feature medians. The imputer is fitted on
# the training split only and then applied unchanged to both splits.
imputer = SimpleImputer(strategy='median')
#imputer = KNNImputer(add_indicator=True)
imputer.fit(train_data)
train_data, test_data = imputer.transform(train_data), imputer.transform(test_data)

In [71]:
from sklearn.decomposition import PCA

# Reduce dimensionality with PCA fitted on the training split only; the test
# split is projected with the same fitted components.
# n_components=0.99 is a fraction, not a count: per the scikit-learn PCA docs a
# value in (0, 1) keeps enough components to explain that share of the
# variance - confirm against the installed version.
pca = PCA(n_components=0.99)
train_data = pca.fit_transform(train_data)
test_data = pca.transform(test_data)

In [3]:
# Standardise the features, then drop low-variance ones (VarianceThreshold filters on variance, not on correlation)
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Two preprocessing steps, each fitted on the training split and then applied
# to both splits, in this order:
#   1. scaler   - standardise every feature
#   2. selector - drop features whose training variance is below the threshold
# NOTE(review): after standardisation every non-constant feature should have
# unit variance, so a 0.1 threshold probably only removes constant features.
# Confirm whether the selector was meant to run before the scaler.
scaler = StandardScaler()
selector = VarianceThreshold(threshold=0.1)
for step in (scaler, selector):
    train_data = step.fit_transform(train_data)
    test_data = step.transform(test_data)
#selector = SelectFromModel(estimator=RandomForestClassifier(n_estimators=100))
#train_data = selector.fit_transform(train_data, train_labels)
#test_data = selector.transform(test_data)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  self.variances_ = np.nanvar(X, axis=0)


In [6]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression, RidgeClassifier

from scipy.stats import loguniform

# Candidate estimators for the randomized hyper-parameter search.
# Each entry is a dict of keyword arguments that is unpacked straight into
# RandomizedSearchCV:
#   estimator           - the unfitted model
#   param_distributions - lists are sampled uniformly, loguniform(...) objects
#                         are sampled on a log scale; {} means "defaults only"
#   n_iter              - number of parameter settings to try
models = [
    {
        'estimator': LogisticRegression(max_iter=1000),
        'param_distributions': {
            'C': loguniform(10**-5, 10**-3),
        },
        'n_iter': 20,
    },
    {
        'estimator': RidgeClassifier(),
        'param_distributions': {
            'alpha': loguniform(10**-5, 10**5),
        },
        'n_iter': 20,
    },
    {
        'estimator': KNeighborsClassifier(),
        'param_distributions': {
            'n_neighbors': np.arange(1, 10, 1),
            'weights': ['uniform', 'distance'],
        },
        # 9 neighbour counts x 2 weightings = 18 combinations, i.e. the full grid.
        'n_iter': 18,
    },
    {
        'estimator': SVC(),
        'param_distributions': {
            'C': loguniform(10**-5, 10**5),
            'kernel': ['linear', 'poly', 'rbf'],
        },
        'n_iter': 20,
    },
    {
        'estimator': GaussianProcessClassifier(),
        'param_distributions': {},
        'n_iter': 1,
    },
    {
        'estimator': DecisionTreeClassifier(),
        'param_distributions': {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
            'max_depth': np.arange(1, 30, 1),
        },
        'n_iter': 20,
    },
    {
        'estimator': RandomForestClassifier(),
        'param_distributions': {
            'n_estimators': np.arange(1, 100, 1),
            'criterion': ['gini', 'entropy'],
            'max_depth': np.arange(1, 30, 1),
        },
        'n_iter': 20,
    },
    {
        'estimator': AdaBoostClassifier(),
        'param_distributions': {
            'n_estimators': np.arange(1, 100, 1),
            'learning_rate': loguniform(10**-5, 10**5),
        },
        'n_iter': 20,
    },
    {
        'estimator': MLPClassifier(),
        'param_distributions': {
            # Every hidden layer needs at least one unit: scikit-learn rejects a
            # size of 0, so an "empty" (0,) architecture is not a valid candidate.
            'hidden_layer_sizes': [(x,) for x in [10, 50, 100]] + [(x, x) for x in [10, 50]],
            'activation': ['relu', 'logistic', 'tanh'],
            'alpha': loguniform(10**-5, 10**5),
            'learning_rate': ['constant', 'invscaling', 'adaptive'],
            'learning_rate_init': loguniform(10**-5, 10**-2),
        },
        'n_iter': 20,
    },
    {
        'estimator': GaussianNB(),
        'param_distributions': {},
        'n_iter': 1,
    },
    {
        'estimator': QuadraticDiscriminantAnalysis(),
        'param_distributions': {},
        'n_iter': 1,
    }
]

# Override: replace the candidate list with a single fixed random forest
# (max_depth=15, nothing to sample), so the search evaluates one candidate only.
models = [
    dict(
        estimator=RandomForestClassifier(max_depth=15),
        param_distributions={},
        n_iter=1,
    ),
]

# One display name per candidate (the estimator's class name); it is the key
# used in `scores` and `params`.
names = [model['estimator'].__class__.__name__ for model in models]

# Results per candidate; an entry stays None until that candidate's search has finished.
scores = {name: None for name in names}
params = {name: None for name in names}

for model, name in zip(models, names):

    print("Searching for", name, "...")

    # 5-fold randomized search tracking F1 and accuracy; F1 decides which
    # setting is refitted and therefore what best_score_ reports.
    search = RandomizedSearchCV(cv=5, scoring=['f1', 'accuracy'], verbose=3, refit='f1', **model)
    search.fit(train_data, train_labels)
    scores[name] = search.best_score_
    # Full parameter set of the refitted estimator, so it can be re-created later.
    params[name] = search.best_estimator_.get_params()

    print("Best score:", scores[name])
    print("Best params:", params[name])

    print("\n=====================================\n")

    # Rewrite the results file after every candidate so partial results survive
    # an interrupted run. The inner loop uses its own variable so it does not
    # rebind the outer loop's `name`.
    with open('results.txt', 'w') as f:
        for reported_name in names:
            f.write(f'{reported_name}\n')
            f.write(f'  score: {scores[reported_name]}\n')
            f.write(f'  params: {params[reported_name]}\n')


Searching for RandomForestClassifier ...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END ........ accuracy: (test=0.983) f1: (test=0.971) total time=   3.8s
[CV 2/5] END ........ accuracy: (test=0.978) f1: (test=0.963) total time=   3.1s
[CV 3/5] END ........ accuracy: (test=0.986) f1: (test=0.976) total time=   3.4s
[CV 4/5] END ........ accuracy: (test=0.989) f1: (test=0.982) total time=   3.5s
[CV 5/5] END ........ accuracy: (test=0.989) f1: (test=0.981) total time=   3.7s
Best score: 0.9746999684517151
Best params: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 15, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}




In [7]:
# Pick the candidate with the highest cross-validated score, retrain it on the
# whole training set and write the test-set predictions to the submission file.

# Ignore candidates that were never searched (score still None) or whose score
# is NaN, instead of comparing them against a numeric sentinel.
candidate_scores = {
    name: scores[name]
    for name in names
    if scores[name] is not None and not np.isnan(scores[name])
}
if not candidate_scores:
    raise RuntimeError(
        "No candidate has a valid cross-validation score; run the search cell first."
    )

# max() keeps the first maximum, so ties go to the earliest entry in `names`.
# Scores of zero or below are still eligible.
best_method = max(candidate_scores, key=candidate_scores.get)
best_score = candidate_scores[best_method]

print("Best method:", best_method)
print("Best score:", best_score)

# Look the estimator class up by name in the notebook's namespace and rebuild
# it with the parameters recorded for the best search result.
final_model = getattr(sys.modules[__name__], best_method)(**params[best_method])
predictions = final_model.fit(train_data, train_labels).predict(test_data)
save_submission(predictions, 'submission/submission.csv')

Best method: RandomForestClassifier
Best score: 0.9746999684517151


# Create the submission

Copy the notebook into the `submission` folder and zip that folder.

In [8]:
import shutil, os

# Package the submission: make sure the folder exists, copy this notebook into
# it, then zip the folder. The last call's return value (the archive path) is
# the cell's displayed output.
os.makedirs(name='submission', exist_ok=True)

shutil.copyfile(src='notebook.ipynb', dst='submission/original_notebook.ipynb')
shutil.make_archive(base_name='submission', format='zip', root_dir='submission')

'c:\\Users\\Axeld\\Desktop\\AIcrowd\\Blitz\\obstacle_prediction\\submission.zip'