In [1]:
import itertools
import os
import shlex
from time import time

import numpy as np
from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [None]:
# Print the first two rows of the raw TSV to see what the input looks like.
!head -2 ../../data/stackoverflow.10kk.tsv

In [None]:
%%time
# Count the rows of the raw TSV (timed, since the file is scanned end to end).
!wc -l ../../data/stackoverflow.10kk.tsv

### Preprocessing data

In [None]:
%%time
# Run the external preprocess.py script: reads the raw TSV, writes stackoverflow.vw.
!python3 preprocess.py ../../data/stackoverflow.10kk.tsv stackoverflow.vw

In [None]:
%%time
# Count the rows of the preprocessed file; compare with the raw count above.
!wc -l stackoverflow.vw

### Splitting data into train, validation and test sets

In [None]:
%%time
# Cut the preprocessed file into consecutive chunks of 1463018 lines each;
# the chunks are the stackoverflow_aa / _ab / _ac files renamed below.
# NOTE(review): 1463018 is hard-coded -- presumably one third of the line count
# printed above; confirm, otherwise a fourth chunk is produced.
! split -l 1463018 stackoverflow.vw stackoverflow_

In [None]:
# First chunk -> training set.
!mv stackoverflow_aa stackoverflow_train.vw

In [None]:
# Second chunk -> validation set.
!mv stackoverflow_ab stackoverflow_valid.vw

In [None]:
# Third chunk -> test set.
!mv stackoverflow_ac stackoverflow_test.vw

In [None]:
# Sanity check: line counts of every stackoverflow_*.vw file.
!wc -l stackoverflow_*.vw

In [None]:
# Keep only the first space-delimited field of each test row; it is loaded as the label later.
!cut -d ' ' -f 1 stackoverflow_test.vw > test_labels.txt

In [None]:
%%time
# Same extraction for the validation rows.
!cut -d ' ' -f 1 stackoverflow_valid.vw > valid_labels.txt

#### Loading labels for the validation and test sets

In [None]:
%%time
y_valid = np.loadtxt('valid_labels.txt')
y_test = np.loadtxt('test_labels.txt')

### Training and validation

In [None]:
def training_vw(file_output='stackoverflow.vw',
                input_dataset='stackoverflow_train.vw',
                ngram=None, passes=None, loss_function='hinge', random_seed=123, bit_precision=28,
                num_classes=10):
    """Train a one-against-all Vowpal Wabbit model by running the ``vw`` CLI.

    The command is printed, executed through the shell, and followed by a
    one-word status line ('Allright' on a zero exit status, 'Failure' otherwise).

    Parameters
    ----------
    file_output : path of the model file to write (``-f``).
        NOTE(review): the default is the same path the preprocessing step
        writes and the split step reads, so a call with defaults overwrites
        the preprocessed data with the model.
    input_dataset : path of the training data (``-d``).
    ngram : value for ``--ngram``; ``None`` or ``1`` omits the flag.
    passes : value for ``--passes``; ``None`` or ``1`` omits the flag.
        When given, ``--cache`` is added as well.
    loss_function : value for ``--loss_function``.
    random_seed : value for ``--random_seed``.
    bit_precision : value for ``-b``.
    num_classes : number of classes passed to ``--oaa``.

    Returns
    -------
    None
    """
    def quoted(value):
        # Shell-escape every interpolated value so paths containing spaces or
        # shell metacharacters reach vw as a single, literal argument.
        return shlex.quote(str(value))

    # Optional flags stay empty strings when unused, which keeps the command
    # layout (including its spacing) the same as before.
    ngram_flag = '' if ngram is None or ngram == 1 else f'--ngram={quoted(ngram)}'
    passes_flag = '' if passes is None or passes == 1 else f'--passes={quoted(passes)} --cache'

    vw_string = (f'vw --random_seed {quoted(random_seed)} --oaa {quoted(num_classes)} --threads'
                 f' -d {quoted(input_dataset)}'
                 f' --loss_function {quoted(loss_function)} -b {quoted(bit_precision)} {ngram_flag}'
                 f' -f {quoted(file_output)} {passes_flag}')

    print(vw_string)
    res = os.system(vw_string)
    # os.system returns 0 only when the command succeeded.
    print('Allright' if not res else 'Failure')

In [None]:
def testing_vw(model='stackoverflow.vw', test_file='stackoverflow_valid.vw',
               predictions_file='valid_predictions.txt'):
    """Run a saved Vowpal Wabbit model over a dataset and write its predictions.

    Builds ``vw -i <model> -t -d <test_file> -p <predictions_file>``, prints it,
    executes it through the shell, and prints 'Allright' on a zero exit status
    or 'Failure' otherwise.

    Parameters
    ----------
    model : path of the model file to load (``-i``).
    test_file : path of the dataset to predict on (``-d``).
    predictions_file : path the predictions are written to (``-p``).

    Returns
    -------
    None
    """
    # Shell-escape the paths so names with spaces or shell metacharacters are
    # passed to vw as single, literal arguments.
    vw_text = (f'vw -i {shlex.quote(str(model))} -t -d {shlex.quote(str(test_file))}'
               f' -p {shlex.quote(str(predictions_file))}')
    print(vw_text)
    res = os.system(vw_text)
    # os.system returns 0 only when the command succeeded.
    print('Allright' if not res else 'Failure')

#### Checking the baseline configuration (default parameters, two passes)

In [None]:
%%time
# Baseline run: every argument left at its default except passes, which is 2.
training_vw(passes=2)

In [None]:
%%time
# Predict with the default model file on the default (validation) file.
testing_vw()

In [None]:
# Read back the predictions file that testing_vw() wrote by default.
pred_labels = np.loadtxt('valid_predictions.txt')

In [None]:
# Validation accuracy of the baseline model.
accuracy_score(y_valid, pred_labels)

#### Training a model for every (ngram, passes) combination

In [None]:
# Every (ngram, passes) combination to evaluate on the validation set.
param_grid = list(itertools.product([1, 2, 3], [1, 3, 5]))

# tqdm wraps the materialised list, not enumerate(...): an enumerate/product
# iterator has no length, so the bar would show neither a total nor an ETA.
for i, (ngram, passes) in enumerate(tqdm(param_grid)):
    # One model file and one predictions file per combination, keyed by index.
    model_file = f'stackoverflow_model{i}.vw'
    predictions_file = f'valid_predictions_{i}.txt'

    training_vw(file_output=model_file, ngram=ngram, passes=passes)
    testing_vw(model=model_file, predictions_file=predictions_file)

    pred_labels = np.loadtxt(predictions_file)
    print(ngram, passes, f'accuracy is {accuracy_score(y_valid, pred_labels)}')