# Instancias

In [None]:
from subprocess import check_output
import string
import random
import os

def random_word(length):
    """Return a random word made of ``length`` lowercase ASCII letters.

    Each letter is drawn independently with ``random.choice``; a ``length``
    of zero yields the empty string.
    """
    alphabet = string.ascii_lowercase
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

def wc(filename):
    """Return the number of lines in ``filename``, counted like ``wc -l``.

    The count is the number of newline bytes in the file, so a final line
    without a trailing newline is not counted (same as ``wc -l``). Counting
    in-process avoids spawning one external ``wc`` process per file and does
    not require the ``wc`` binary to be installed.

    Args:
        filename: path of the file to inspect.

    Returns:
        The number of ``\\n`` bytes in the file, as an int.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    newlines = 0
    with open(filename, 'rb') as handle:
        # Read fixed-size chunks so large instance files are never held
        # in memory all at once.
        for chunk in iter(lambda: handle.read(1 << 16), b''):
            newlines += chunk.count(b'\n')
    return newlines

def load_dir(directory):
    """List the files of ``directory`` together with their line counts.

    Entries are visited in sorted name order. Returns a pair of parallel
    lists: the joined paths and, for each path, the value returned by
    ``wc`` for it.
    """
    file_paths = []
    word_count = []
    for entry in sorted(os.listdir(directory)):
        full_path = os.path.join(directory, entry)
        file_paths.append(full_path)
        word_count.append(wc(full_path))
    return file_paths, word_count

In [None]:
# Random
# Incremental instances: file number i holds 2**i random three-letter words.
for i in range(0, 20):
    n = 2**i
    # `with` closes (and therefore flushes) each file as soon as it is
    # written, instead of leaving the handle open.
    with open('./data/random_incremental/random_inc_{:03d}.txt'.format(i), 'w+') as f:
        for j in range(n):
            f.write(random_word(3) + '\n')

# Fixed-size instances: 100 files with 100 random three-letter words each.
for i in range(1,101):
    with open('./data/random_fixed/random_fix_{:03d}.txt'.format(i), 'w+') as f:
        for j in range(0,100):
            f.write(random_word(3) + '\n')

# Colisiones
def words(length):
    """Return every lowercase ASCII word of exactly ``length`` letters.

    Words come out in lexicographic order ('aa', 'ab', ..., 'zz').

    Args:
        length: number of letters per word; must be at least 1.

    Returns:
        For ``length == 1`` the string ``string.ascii_lowercase`` (kept for
        backward compatibility; iterating it yields the one-letter words).
        For larger lengths, a list with the ``26 ** length`` words.

    Raises:
        ValueError: if ``length`` is less than 1 (this used to recurse until
            a RecursionError).
    """
    if length < 1:
        raise ValueError('length must be at least 1')
    if length == 1:
        return string.ascii_lowercase
    result = list(string.ascii_lowercase)
    for _ in range(length - 1):
        # Build each level with a single comprehension; repeatedly doing
        # `result = result + [...]` copies the whole list on every step.
        result = [prefix + letter
                  for prefix in result
                  for letter in string.ascii_lowercase]
    return result


# Write every three-letter word, one per line. `with` closes (and therefore
# flushes) the file so that later readers see the complete contents.
with open('./data/collision/collision.txt', 'w+') as f:
    for word in words(3):
        f.write(word + '\n')

# Experimentación

In [None]:
!cd .. && make clean && make benchmark

In [None]:
import subprocess

def run(files, load_threads, max_threads, times):
    """Run the ContarPalabras binary ``times`` times and average its timings.

    The binary is invoked as
    ``../build/ContarPalabras <load_threads> <max_threads> <files...>``.
    The first whitespace-separated token of its first output line is parsed
    as ``load_time,max_time``.

    Args:
        files: list of input file paths appended to the command line.
        load_threads: value passed as the first argument of the binary.
        max_threads: value passed as the second argument of the binary.
        times: number of repetitions to average over; must be at least 1.

    Returns:
        Tuple ``(mean_load_time, mean_max_time)`` of floats.

    Raises:
        ValueError: if ``times`` is less than 1.
        RuntimeError: if the first output line of the binary is empty.
    """
    if times < 1:
        raise ValueError('times must be at least 1')
    args = ['../build/ContarPalabras', '{}'.format(load_threads), '{}'.format(max_threads)] + list(files)
    times_load = 0.0
    times_max = 0.0
    for _ in range(times):
        # subprocess.run waits for the child to exit and closes the pipe, so
        # a previous repetition can never still be running (and competing
        # for CPU) while the next one is being timed.
        completed = subprocess.run(args, stdout=subprocess.PIPE)
        first_line = completed.stdout.split(b'\n', 1)[0].split()
        if not first_line:
            raise RuntimeError(
                'no timing output from {} (exit code {})'.format(args[0], completed.returncode))
        out = first_line[0].decode('UTF-8').split(',')
        times_load += float(out[0])
        times_max += float(out[1])
    return times_load/times,times_max/times

In [None]:
import numpy as np
# CSV header line (including the trailing newline) written as the first
# line of each results file.
headers = 'file_count,word_count,threads_load,threads_max,time_load,time_max\n'

## Random

In [None]:
file_paths, word_count = load_dir('./data/random_incremental/')
# Sweep the number of max-threads over a single file: the last one in
# sorted order.
threads_load = 1
# `with` guarantees the CSV is flushed and closed even if a run fails.
with open('./results/random_inc.csv', 'w+') as random_inc_results:
    random_inc_results.write(headers)
    for threads_max in range(1,16):
        load_time, max_time = run([file_paths[-1]], threads_load, threads_max, 3)
        # word_count[-1] is the count of the file actually benchmarked
        # (file_paths[-1]); index -4 belonged to a different file.
        random_inc_results.write('{},{},{},{},{},{}\n'.format(1,word_count[-1],threads_load,threads_max,load_time,max_time))

In [None]:
file_paths, word_count = load_dir('./data/random_fixed/')
# Fixed number of max-threads; sweep load-threads (1, 2, 4, 8, 16) and the
# number of input files (the first i files of the directory).
threads_max = 4
# `with` guarantees the CSV is flushed and closed even if a run fails.
with open('./results/random_fix_load.csv', 'w+') as random_fix_load_results:
    random_fix_load_results.write(headers)
    for j in range(0,5):
        threads_to_load = 2**j
        for i in range(1,len(file_paths)+1):
            load_time, max_time = run(file_paths[0:i], threads_to_load, threads_max, 10)
            random_fix_load_results.write('{},{},{},{},{},{}\n'.format(i,np.sum(word_count[0:i]),threads_to_load,threads_max,load_time,max_time))

In [None]:
# Load the instances here so this cell does not depend on whichever cell
# last assigned file_paths / word_count.
file_paths, word_count = load_dir('./data/random_fixed/')
# Fixed number of load-threads; sweep max-threads (1, 2, 4, 8, 16) and the
# number of input files (the first i files of the directory).
threads_to_load = 16
# `with` guarantees the CSV is flushed and closed even if a run fails.
with open('./results/random_fix_max.csv', 'w+') as random_fix_max_results:
    random_fix_max_results.write(headers)
    for j in range(0,5):
        threads_to_max = 2**j
        for i in range(1,len(file_paths)+1):
            load_time, max_time = run(file_paths[0:i], threads_to_load, threads_to_max, 3)
            # Record the thread count actually used (threads_to_max), not the
            # `threads_max` variable left over from another cell.
            random_fix_max_results.write('{},{},{},{},{},{}\n'.format(i,np.sum(word_count[0:i]),threads_to_load,threads_to_max,load_time,max_time))

## Colisiones

In [None]:
file_paths, word_count = load_dir('./data/collision/')

# Single input file, one load-thread; sweep the number of max-threads.
threads_to_load = 1
file_count = 1
# `with` guarantees the CSV is flushed and closed even if a run fails.
with open('./results/colisiones_max.csv', 'w+') as colisiones_max_results:
    colisiones_max_results.write(headers)
    for threads_to_max in range(1, 8):
        # Use this cell's own threads_to_load (not `threads_load` from
        # another cell) and record the real file count instead of a stale
        # loop variable.
        load_time, max_time = run(file_paths[0:file_count], threads_to_load, threads_to_max, 4)
        colisiones_max_results.write('{},{},{},{},{},{}\n'.format(file_count,np.sum(word_count[0:file_count]),threads_to_load,threads_to_max,load_time,max_time))

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Scatter plot of time_max against threads_max for the collision experiment.
df = pd.read_csv('./results/colisiones_max.csv')
# Pass the frame by keyword: older seaborn releases do not accept the
# DataFrame as the first positional argument.
sns.scatterplot(data=df, x='threads_max', y='time_max')

plt.xlabel('Cantidad de threads máximo')
plt.ylabel('Tiempo para calcular máximo (ms)')

plt.tight_layout()
plt.savefig('./graphs/colisiones_vs_threads')

In [None]:
# Scatter plot of time_max against threads_max for the incremental random
# experiment.
df = pd.read_csv('./results/random_inc.csv')
# Pass the frame by keyword: older seaborn releases do not accept the
# DataFrame as the first positional argument.
sns.scatterplot(data=df, x='threads_max', y='time_max')

plt.xlabel('Cantidad de threads máximo')
plt.ylabel('Tiempo para calcular máximo (ms)')

plt.tight_layout()
plt.savefig('./graphs/ttm_vs_threads')

In [None]:
# Scatter plot of time_load against file_count, coloured by threads_load.
df = pd.read_csv('./results/random_fix_load.csv')
# Pass the frame by keyword: older seaborn releases do not accept the
# DataFrame as the first positional argument. No `label=` here: combined
# with `hue` it adds an extra entry to the legend.
sns.scatterplot(data=df, x='file_count', y='time_load', hue='threads_load')
# The text is meant as the legend title for the hue levels.
plt.legend(title='#Threads lectura')
plt.xlabel('Cantidad de archivos')
plt.ylabel('Time to load(ms)')

plt.tight_layout()
plt.savefig('./graphs/ttl_vs_file_count')