In [1]:
import os
import time
from datetime import datetime

import numpy as np
from sklearn.ensemble import RandomForestClassifier

from src import FeaturesExtractor

In [None]:
# Paths of the training and test files.
training_path = "./data/training"
test_path = "./data/test"

# For this simple algorithm, we only compute the features for a subset of the
# training points: N points are chosen per class in each training file. This
# has two advantages: it balances the classes for our classifier and it saves
# a lot of computation time.

print("Collect Training Features")
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() follows the wall clock and can jump if the
# system time is adjusted during this (multi-minute) step.
t0 = time.perf_counter()
f_extractor = FeaturesExtractor()
training_features, training_labels = f_extractor.extract_training(training_path)
t1 = time.perf_counter()
print("Done in %.3fs\n" % (t1 - t0))

# Seed for the classifier's internal randomness (bootstrap samples and feature
# sub-sampling). Without it, two runs on identical features yield different
# forests and therefore different submissions.
# NOTE(review): the sampled training subset itself may still vary between runs
# if FeaturesExtractor picks points randomly — not visible from this cell.
RANDOM_SEED = 42

print("Training Random Forest")
# Monotonic clock for interval timing (unaffected by system clock changes).
t0 = time.perf_counter()
clf = RandomForestClassifier(random_state=RANDOM_SEED)
clf.fit(training_features, training_labels)
t1 = time.perf_counter()
print("Done in %.3fs\n" % (t1 - t0))

# Number of predictions the submission must contain, one per test point.
# NOTE(review): presumably the total number of points in the test files —
# confirm against the dataset / submission format.
EXPECTED_N_PREDICTIONS = 3079187

print("Compute testing features")
# Monotonic clock for interval timing (unaffected by system clock changes).
t0 = time.perf_counter()
test_features = f_extractor.extract_test(test_path)
t1 = time.perf_counter()
print("Done in %.3fs\n" % (t1 - t0))

print("Test")
t0 = time.perf_counter()
predictions = clf.predict(test_features)
t1 = time.perf_counter()
print("Done in %.3fs\n" % (t1 - t0))

# Sanity check before writing the submission: a wrong row count means the
# test set was not read completely (or was read more than once).
n_predictions = predictions.shape[0]
assert n_predictions == EXPECTED_N_PREDICTIONS, (
    f"Incorrect number of predictions: expected {EXPECTED_N_PREDICTIONS}, "
    f"got {n_predictions}"
)

print("Save predictions")
# Monotonic clock for interval timing (unaffected by system clock changes).
t0 = time.perf_counter()
# np.savetxt does not create missing parent directories; make sure the output
# folder exists so that minutes of feature extraction and prediction are not
# lost to a FileNotFoundError at the very last step.
os.makedirs("submissions", exist_ok=True)
# One integer label per line; the file name carries a minute-resolution
# timestamp, so two saves within the same minute write to the same file.
np.savetxt(
    f"submissions/feat-{datetime.now().strftime('%Y_%m_%d-%H_%M')}.txt",
    predictions,
    fmt="%d",
)
t1 = time.perf_counter()
print("Done in %.3fs\n" % (t1 - t0))

Collect Training Features
(3000, 21)
(3000, 0)
(3000, 21)
(3000, 21)
(3000, 21)
(3000, 42)
(3000, 21)
(3000, 63)
(3000, 21)
(3000, 84)
(3000, 21)
(3000, 105)
(3000, 21)
(3000, 126)
(3000, 21)
(3000, 147)
(2500, 21)
(2500, 0)
(2500, 21)
(2500, 21)
(2500, 21)
(2500, 42)
(2500, 21)
(2500, 63)
(2500, 21)
(2500, 84)
(2500, 21)
(2500, 105)
(2500, 21)
(2500, 126)
(2500, 21)
(2500, 147)
(2500, 21)
(2500, 0)
(2500, 21)
(2500, 21)
(2500, 21)
(2500, 42)
(2500, 21)
(2500, 63)
(2500, 21)
(2500, 84)
(2500, 21)
(2500, 105)
(2500, 21)
(2500, 126)
(2500, 21)
(2500, 147)
Done in 153.887s

Training Random Forest
Done in 4.590s

Compute testing features
(3079187, 21)
(3079187, 0)
(3079187, 21)
(3079187, 21)
(3079187, 21)
(3079187, 42)
(3079187, 21)
(3079187, 63)
(3079187, 21)
(3079187, 84)
(3079187, 21)
(3079187, 105)
(3079187, 21)
(3079187, 126)
