In [None]:
# REQUIRED IMPORTS FROM STANDARD PACKAGES

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import csv
import random

from os.path import join as pjoin
from glob import glob

import sklearn as sk
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# IMPORTS FROM THE UTIL LIBRARY PROVIDED BY US

import util.vis as V
import util.helpers as H

%load_ext autoreload
%autoreload 1

%aimport util.helpers, util.vis
rng = np.random.RandomState(42)



In [None]:
# PATHS
# Locations are relative to the notebook's working directory.

DATA_DIR = '../data'            # root folder of the dataset files
POSE_DIR = DATA_DIR + '/pose'   # pose arrays live in a sub-folder of DATA_DIR

In [None]:
## 0. To obtain reproducible results, we set the random seeds
random.seed(2019)
np.random.seed(2019)

## 1. Load training set
# Every data row of labels.csv is unpacked into four fields
# (name, gloss, label, person); only the sample name and its integer label
# are used below. The pose array of a sample is read from
# POSE_DIR/train/<name>.npy.

dataset_file = pjoin(DATA_DIR, 'labels.csv')

train_samples = []
train_labels = []

# newline='' is what the csv module requires for files handed to csv.reader,
# so that line endings (and newlines inside quoted fields) are handled by the
# reader itself rather than by the file object.
with open(dataset_file, newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader) # Skips the first row, which is the header
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line; skip it
            # instead of failing on the 4-way unpacking below.
            continue
        name, _gloss, label, _person = row
        sample = np.load(pjoin(POSE_DIR, 'train', name + '.npy'))
        train_samples.append(sample)
        train_labels.append(int(label))

## 2. Load test set.
# Important: load according to the order in files_test.txt to ensure correct submissions on Kaggle!
# The code below does this for you.
test_samples_file = pjoin(DATA_DIR, 'files_test.txt')

with open(test_samples_file) as test_file:
    # One sample name per line. Blank lines cannot name a sample, so they are
    # dropped; the relative order of the remaining names is left untouched.
    test_file_names = [line.strip() for line in test_file if line.strip()]

# The name list is complete at this point, so the arrays are loaded after the
# text file has been closed. Order follows test_file_names exactly.
test_samples = [
    np.load(pjoin(POSE_DIR, 'test', name + '.npy'))
    for name in test_file_names
]
    
## 3. Extract features you will use in your model
#     (just a very basic dummy here!!!)
#     Transform the training set and test set to a numpy array

# As very basic features, we will use the average values of x, y and c 
# for every keypoint array over time,
# resulting in 3*137=411 features per sample
def extract_features(samples_list):
    """Build a 2-D feature matrix from a list of per-sample arrays.

    Each sample is averaged along its first axis, and whatever dimensions
    remain are flattened into a single feature vector.

    Parameters
    ----------
    samples_list : list of array-like
        One array per sample. After averaging over axis 0 all samples must
        have the same shape, otherwise ``np.stack`` raises ``ValueError``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(samples_list), n_features)``.
    """
    averaged = []
    for sample in samples_list:
        # Collapse the first axis of this sample into its mean
        averaged.append(np.mean(sample, axis=0))
    # One row per sample, remaining dimensions flattened into features
    stacked = np.stack(averaged, axis=0)
    return stacked.reshape((len(samples_list), -1))

# Run the same feature extractor over both sets so that the columns of
# X_train and X_test line up; labels become a numpy array.
X_train = extract_features(train_samples)
X_test = extract_features(test_samples)
y_train = np.asarray(train_labels)

## 4. Create a classifier and fit the training set
## Note that this is JUST AN EXAMPLE:
#  a logistic regression whose only explicitly set option is a very large
#  iteration cap (max_iter); every other setting is left at its default.
MAX_ITER = 500000
clf = LogisticRegression(max_iter=MAX_ITER)
clf.fit(X_train, y_train)

In [None]:
# Report how well the fitted model does on the data it was trained on.
# These numbers are measured on the training set itself, not on held-out data.
train_accuracy = clf.score(X_train, y_train)
print('Training set accuracy:', train_accuracy)

# Class probabilities are computed once and shared by both helper metrics.
train_probas = clf.predict_proba(X_train)
train_map3 = H.mapk(train_probas, y_train)
print('Training set score (map@3):', train_map3)
train_top3 = H.top3_accuracy(train_probas, y_train)
print('Training set top-3 accuracy:', train_top3)

# 5. Create a submission: predict class probabilities for the test set and
#    hand them to the provided helper, which writes the submission file.
test_probas = clf.predict_proba(X_test)
H.create_submission(test_probas, 'baseline_submission.csv')

This submission should give you a leaderboard map@3 score of 0.5822. Compare this with the map@3 printed above for the training set: the gap between the two scores shows that the model is clearly overfitting!