In [22]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from preprocessing import *
import seaborn as sns
import pandas as pd
import numpy as np
import pickle
import json

In [68]:
def data_sampling(dataframe):
    over_bench_mult = 2.2;
    under_bench_mult = 1.5;

    smp2 = dataframe[dataframe.Grade == 2]
    smp3 = dataframe[dataframe.Grade == 3]
    smp4 = dataframe[dataframe.Grade == 4]
    smp5 = dataframe[dataframe.Grade == 5]
    smp6 = dataframe[dataframe.Grade == 6]
    smp7 = dataframe[dataframe.Grade == 7]
    smp8 = dataframe[dataframe.Grade == 8]
    smp9 = dataframe[dataframe.Grade == 9]
    smp10 = dataframe[dataframe.Grade == 10]

    under_3 = pd.concat([smp3[smp3.IsBenchmark != True].sample(frac=.73), 
                     smp3[smp3.IsBenchmark == True].sample(frac=under_bench_mult, replace=True)])
    under_5 = pd.concat([smp5[smp5.IsBenchmark != True].sample(frac=.77), 
                     smp5[smp5.IsBenchmark == True].sample(frac=under_bench_mult, replace=True)])

    over_2 = smp2.sample(frac=2, replace=True)
    over_4 = pd.concat([smp4.sample(frac=1.4, replace=True), 
                    smp4[smp4.IsBenchmark == True].sample(frac=over_bench_mult, replace=True)])
    over_6 = pd.concat([smp6.sample(frac=1.2, replace=True), 
                    smp6[smp6.IsBenchmark == True].sample(frac=over_bench_mult, replace=True)])
    over_7 = pd.concat([smp7.sample(frac=1.5, replace=True), 
                    smp7[smp7.IsBenchmark == True].sample(frac=over_bench_mult, replace=True)])
    over_8 = pd.concat([smp8.sample(frac=1.3, replace=True), 
                    smp8[smp8.IsBenchmark == True].sample(frac=over_bench_mult, replace=True)])
    over_9 = pd.concat([smp9.sample(frac=2.1, replace=True), 
                    smp9[smp9.IsBenchmark == True].sample(frac=over_bench_mult, replace=True)])
    over_10 = smp10.sample(frac=3.5, replace=True)

    return pd.concat([over_2, under_3, over_4, under_5, over_6, over_7, over_8, over_9, over_10])

def sample_train_test_val_df(X_train, X_test, X_val):
    return data_sampling(X_train), data_sampling(X_test), data_sampling(X_val)

def showcase_sampled_train_split(dataframe=routeImport.copy()):
    sns.countplot(x='Grade', data=data_sampling(dataframe.sample(frac=0.8)))

In [82]:
def split_train_test_val(df, train_size, test_size, val_size):
    X = df
    y = df.Grade.to_numpy()-df.Grade.min()

    # train is now 80% of the entire data set, test is now 20% of the dataset
    # X_train and X_test are DataFrames of routes, y_train and y_test are numpy arrays of grades
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, stratify=y)
    
    # test is now 10% of the initial data set
    # validation is now 10% of the initial data set
    # X_val and X_test are DataFrames of routes, y_val and y_test are numpy arrays of grades
    X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=(test_size / (test_size + val_size)), stratify=y_test) 
    
    print("X train, test, val (before sampling): ", len(X_train), len(X_val), len(X_test))
    print("y train, test, val (before sampling): ", len(y_train), len(y_val), len(y_test))
    
    # over and undersample the datasets without bleed between train, test, and validation sets
    X_train, X_test, X_val = sample_train_test_val_df(X_train, X_test, X_val)
    # NOTE: the oversamping method appears to cause the datasets to be slightly different sizes each time
    # this is assumed to be due to the stratification
    
    print("X train, test, val (after sampling): ", len(X_train), len(X_val), len(X_test))
    print("y train, test, val (after sampling): ", len(y_train), len(y_val), len(y_test))
    
    tr_y = X_train.Grade.to_numpy()-2
    tst_y = X_test.Grade.to_numpy()-2
    vl_y = X_val.Grade.to_numpy()-2
    
    X_train, t, y_train, t = train_test_split(X_train, tr_y, test_size=0.0005, stratify=tr_y)
    X_test, t, y_test, t = train_test_split(X_test, tst_y, test_size=0.005, stratify=tst_y) 
    X_val, t, y_val, t = train_test_split(X_val, vl_y, test_size=0.005, stratify=vl_y) 
    
    print("X train, test, val (after fixing): ", len(X_train), len(X_val), len(X_test))
    print("y train, test, val (after fixing): ", len(y_train), len(y_val), len(y_test))
    
    return (np.array(X_train.TokenizedSequence.to_list()), 
            np.array(X_test.TokenizedSequence.to_list()), 
            np.array(X_val.TokenizedSequence.to_list()), 
            y_train, y_test, y_val)

In [70]:
df = pd.read_json('./data/route_data.json')

In [71]:
for i in range(11, 15):
    df = df[df.Grade != i]
df = df[df.MoonBoardHoldSetup == 'MoonBoard Masters 2017']
df = df[df.RepeatText != 'Be the first to repeat this problem']

In [72]:
route_sequences = df.apply(generate_route_sequence, axis=1)
df['TokenizedSequence'] = route_sequences.map(tokenize_sequence)

In [39]:
train_size = 0.8
test_size = 0.1
val_size = 0.1
assert train_size + test_size + val_size == 1

In [83]:
X_train, X_test, X_val, y_train, y_test, y_val = split_train_test_val(df, train_size, test_size, val_size)

X train, test, val (before sampling):  17356 2169 2170
y train, test, val (before sampling):  17356 2169 2170
X train, test, val (after sampling):  19052 2383 2393
y train, test, val (after sampling):  17356 2169 2170
X train, test, val (after fixing):  19042 2371 2381
y train, test, val (after fixing):  19042 2371 2381


In [88]:
train_data = {'X': X_train, 'y': y_train}
test_data = {'X': X_test, 'y': y_test}
val_data = {'X': X_val, 'y': y_val}

In [89]:
for data, split in zip([train_data, test_data, val_data], ['train', 'test', 'val']):
    with open(f'./data/{split}_preprocessed_routes', 'wb') as f:
        pickle.dump(data, f)