# Postprocessing

In [102]:
import re
import os
import pandas as pd

## Read validation data

In [103]:
def read_validation_directory():
    """Return the validation result directories found in the working directory.

    An entry is selected when its name starts with ``validation-`` followed
    by at least three lowercase ASCII letters (e.g. ``validation-knn``) and
    it is a directory. Plain files that merely match the naming pattern are
    skipped, because callers list the content of every returned entry.

    Returns:
        list of str: the matching directory names, sorted alphabetically so
        the result does not depend on the arbitrary order of ``os.listdir``.
    """
    select_pattern = re.compile(r'^validation-[a-z]{3}')
    return sorted(
        entry for entry in os.listdir('.')
        if select_pattern.match(entry) and os.path.isdir(entry)
    )

In [104]:
def read_validation_directory_content(validation_directories):
    """Load every file of the given validation directories into one frame.

    Each file found in a directory is parsed with ``pandas.read_csv`` and
    tagged with a ``model`` column holding the directory name without its
    leading ``validation-`` prefix (``validation-knn`` -> ``knn``).

    Args:
        validation_directories: iterable of directory names, expected to
            start with ``validation-``.

    Returns:
        pandas.DataFrame: the rows of all files stacked in the order of the
        given directories (files sorted by name within a directory), with a
        fresh ``0..n-1`` index.

    Raises:
        ValueError: if no file was found in any of the directories.
    """
    # the model name is whatever follows the 'validation-' prefix
    prefix_length = len('validation-')

    frames = []
    for directory in validation_directories:
        model_name = directory[prefix_length:]
        # sort so the row order does not depend on the file system
        for file_name in sorted(os.listdir(directory)):
            frame = pd.read_csv(os.path.join(directory, file_name))
            frame['model'] = model_name
            frames.append(frame)

    if not frames:
        raise ValueError('no validation result files found in {!r}'.format(
            list(validation_directories)))

    # a single concat avoids re-copying the accumulated rows for every file
    return pd.concat(frames, ignore_index=True)

In [105]:
def text_split(x):
    """Split *x* at every hyphen and return the pieces as a list."""
    return x.split('-')

In [106]:
def extract_feature_information(df):
    """Expand the hyphen-separated ``data`` column into separate columns.

    Every ``data`` entry is split at its hyphens. When the second token is
    ``'idf'`` the entry is labelled ``'tf-idf'`` and the document size,
    feature size and n-gram size are taken from tokens 2, 3 and 4; otherwise
    the entry is labelled ``'tf'`` and the values come from tokens 1, 2
    and 3.

    The frame is modified in place: the columns ``method``, ``doc``,
    ``feature`` and ``ngram`` are added, ``data`` is removed and the index
    becomes ``model-method-doc-feature-ngram``.

    Args:
        df: frame providing at least the columns ``data`` and ``model``.

    Returns:
        The same frame object that was passed in.
    """
    methods, doc_sizes, feature_sizes, ngram_sizes = [], [], [], []

    for tokens in df['data'].apply(text_split):
        # 'tf-idf' itself contains a hyphen, which shifts the fields by one
        if tokens[1] == 'idf':
            label, first = 'tf-idf', 2
        else:
            label, first = 'tf', 1
        methods.append(label)
        doc_sizes.append(tokens[first])
        feature_sizes.append(tokens[first + 1])
        ngram_sizes.append(tokens[first + 2])

    df['method'] = methods
    df['doc'] = doc_sizes
    df['feature'] = feature_sizes
    df['ngram'] = ngram_sizes
    df.drop(['data'], axis=1, inplace=True)

    # build the row label model-method-doc-feature-ngram
    label_series = df['model']
    for column in ('method', 'doc', 'feature', 'ngram'):
        label_series = label_series + '-' + df[column]
    df.index = label_series

    return df

In [107]:
def to_percent(x):
    """Scale *x* by 100 and round the result to two decimal places."""
    percentage = x * 100
    return round(percentage, 2)

In [108]:
def read_results():
    """Load all validation results and return them ranked by accuracy.

    The content of every validation directory is read, the feature settings
    are extracted, the rows are sorted by ``validation_accuracy`` in
    descending order, and both accuracy columns are converted to percent.

    Returns:
        The frame produced by ``extract_feature_information`` with
        ``validation_accuracy`` and ``test_accuracy`` passed through
        ``to_percent``.
    """
    results = extract_feature_information(
        read_validation_directory_content(read_validation_directory())
    )
    # rank first, while the accuracies are still unrounded
    results.sort_values(by=['validation_accuracy'], ascending=False, inplace=True)
    for column in ('validation_accuracy', 'test_accuracy'):
        results[column] = to_percent(results[column])
    return results

In [109]:
# collect the accuracy and the model settings from every validation directory
results = read_results()
results.head(len(results))  # display all

Unnamed: 0,validation_accuracy,test_accuracy,model,method,doc,feature,ngram
knn-tf-idf-5000-1000-1,92.07,92.25,knn,tf-idf,5000,1000,1
knn-tf-5000-1000-1,91.56,92.66,knn,tf,5000,1000,1
knn-tf-5000-500-1,91.15,92.35,knn,tf,5000,500,1
knn-tf-idf-5000-500-1,90.74,91.95,knn,tf-idf,5000,500,1
knn-tf-5000-5000-1,89.83,91.13,knn,tf,5000,5000,1
knn-tf-idf-5000-5000-1,79.25,79.41,knn,tf-idf,5000,5000,1


## Read tuned models

In [110]:
def read_tuned_model_directory_content(prefix):
    """Load every file of the directory ``tuned-model-<prefix>`` into one frame.

    Each file in the directory is parsed with ``pandas.read_csv`` and tagged
    with a ``model`` column set to *prefix*.

    Args:
        prefix: model identifier appended to ``tuned-model-`` to form the
            directory name (e.g. ``'knn'``).

    Returns:
        pandas.DataFrame: the rows of all files stacked in file-name order,
        with a fresh ``0..n-1`` index.

    Raises:
        ValueError: if the directory contains no files.
    """
    path = 'tuned-model-{}'.format(prefix)

    frames = []
    # sort so the row order does not depend on the file system
    for file_name in sorted(os.listdir(path)):
        frame = pd.read_csv(os.path.join(path, file_name))
        frame['model'] = prefix
        frames.append(frame)

    if not frames:
        raise ValueError('no tuned model files found in {!r}'.format(path))

    # a single concat avoids re-copying the accumulated rows for every file
    return pd.concat(frames, ignore_index=True)

In [111]:
def read_tuned_model_results(prefix):
    """Read the tuned-model files for *prefix* and extract their feature settings.

    Args:
        prefix: model identifier handed to
            ``read_tuned_model_directory_content``.

    Returns:
        The frame returned by ``extract_feature_information`` for the
        loaded content.
    """
    return extract_feature_information(read_tuned_model_directory_content(prefix))

In [112]:
def join_with_accuracy(tuned_model_results, accuracy_results):
    """Attach the accuracy columns to the tuned model settings.

    Both frames are matched on their index with an inner join. The columns
    ``model``, ``method``, ``doc``, ``feature`` and ``ngram`` exist in both
    frames, so they are left out of the accuracy side to avoid overlapping
    column names in the join.

    Neither argument is modified, so the same ``accuracy_results`` frame can
    be joined with the tuned results of several models.

    Args:
        tuned_model_results: frame with the tuned parameters.
        accuracy_results: frame with a ``validation_accuracy`` column plus
            the five shared setting columns.

    Returns:
        pandas.DataFrame: the joined rows sorted by ``validation_accuracy``
        in descending order, with a fresh ``0..n-1`` index.
    """
    shared_columns = ['model', 'method', 'doc', 'feature', 'ngram']
    # drop on a copy: an in-place drop would strip the caller's frame and
    # make every later call fail on the missing columns
    accuracy_only = accuracy_results.drop(shared_columns, axis=1)

    joined = tuned_model_results.join(accuracy_only, how='inner')
    joined = joined.sort_values(by=['validation_accuracy'], ascending=False)
    return joined.reset_index(drop=True)

## k-nearest neighbors model

In [113]:
# tuned knn settings combined with the accuracy of the matching validation run
results_knn = join_with_accuracy(read_tuned_model_results('knn'), results)
results_knn.head(len(results_knn))  # display all

Unnamed: 0,p,weights,n_neighbors,model,method,doc,feature,ngram,validation_accuracy,test_accuracy
0,1,uniform,13,knn,tf-idf,5000,1000,1,92.07,92.25
1,1,uniform,13,knn,tf,5000,1000,1,91.56,92.66
2,1,distance,8,knn,tf,5000,500,1,91.15,92.35
3,1,distance,12,knn,tf-idf,5000,500,1,90.74,91.95
4,1,uniform,15,knn,tf,5000,5000,1,89.83,91.13
5,1,distance,6,knn,tf-idf,5000,5000,1,79.25,79.41
