# Instructions for Reproduction

1. Run Figures_1,2,3.ipynb;
2. Run this notebook;
3. Run the generated scripts (`reproduce_PI_wv.sh`, `reproduce_PI_1hot.sh`, `reproduce_PD_wv.sh`, `reproduce_PD_1hot.sh`).

## Preparation

#### Directories

In [None]:
%%bash

# Create the output directories used by the rest of the notebook.
# -p makes this cell safe to re-run: an already existing directory is not an error.
mkdir -p ../results ../unpacked ../images

#### Images

In [1]:
import os
import cv2
from misc import *
import pandas as pd
import os.path as op
from tqdm import tqdm

In [2]:
# Load both pickled studies and compute each one's word set; the next cell
# iterates over the two results (Russian first, then English).
ru, en = (
    Study.load_from_file(pickle_path).compute_word_set()
    for pickle_path in ("ru_study.pkl", "en_study.pkl")
)

In [3]:
# Write one PNG per word of both studies into ../images/.
for study in (ru, en):
    for word in tqdm(study):
        encoded = Study.onehot(word, color=True)
        # Fold the encoding into rows of 13 entries with 3 channels each.
        n_rows = int(encoded.shape[0] / 13)
        image = encoded.reshape(n_rows, 13, 3)
        target = op.join("../images/", word + ".png")
        cv2.imwrite(target, image)

100%|██████████| 241/241 [00:00<00:00, 1419.25it/s]
100%|██████████| 241/241 [00:00<00:00, 3452.87it/s]


## Dataset construction

In [4]:
# Directory holding the aligned fastText vectors. The original absolute path is
# kept as the default; set the FASTTEXT_ALIGNED_DIR environment variable to use
# another location without editing the notebook.
# NOTE: the path is later substituted into a command template, so it should not
# contain spaces.
FASTTEXT_ALIGNED_DIR = os.environ.get(
    "FASTTEXT_ALIGNED_DIR", "/home/bakirillov/HDD/weights/fasttext/aligned"
)
EN_VEC = op.join(FASTTEXT_ALIGNED_DIR, "wiki.en.align.vec")
RU_VEC = op.join(FASTTEXT_ALIGNED_DIR, "wiki.ru.align.vec")
# Command template for vectors.py; VEC, STUDY, WHAT and OUTPUT are placeholders
# that are filled in when unpack_data_PI.sh is generated.
VC = "python vectors.py -v VEC -s STUDY -w WHAT -o OUTPUT"

In [5]:
# Generate unpack_data_PI.sh: one vectors.py call per (representation, language).
# The context manager closes the script even if building a command raises.
with open("unpack_data_PI.sh", "w") as script:
    script.write("#!/bin/sh\n\n\n")
    for a in ["wv", "1hot"]:
        for b, c in zip([EN_VEC, RU_VEC], ["en_study.pkl", "ru_study.pkl"]):
            values = {
                "VEC": b,
                "STUDY": c,
                "WHAT": a,
                "OUTPUT": "../unpacked/" + c + "_" + a + ".csv",
            }
            # Substitute every template token in a single pass, so a value that
            # happens to contain another placeholder word (e.g. a vector path
            # with "OUTPUT" in it) is never rewritten a second time.
            command = " ".join(values.get(token, token) for token in VC.split(" "))
            script.write(command + "\n")

In [6]:
# Command template for the per-participant unpacking; compared with the
# participant-independent template it adds the -p PARTICIPANT option.
VC = "python vectors.py -v VEC -s STUDY -w WHAT -p PARTICIPANT -o OUTPUT"
# Generate unpack_data_PD.sh: one vectors.py call per
# (representation, study, participant index).
# The context manager closes the script even if building a command raises.
with open("unpack_data_PD.sh", "w") as script:
    script.write("#!/bin/sh\n\n\n")
    for a in ["wv", "1hot"]:
        # Per study: the CSV passed as -v, the study pickle, and how many
        # participant indices (0 .. d-1) to emit commands for.
        # NOTE(review): 34 and 103 are hard-coded; presumably the participant
        # counts of the English and Russian studies - confirm against the data.
        for b, c, d in zip(
            ["../unpacked/en_study.pkl_wv.csv", "../unpacked/ru_study.pkl_wv.csv"],
            ["en_study.pkl", "ru_study.pkl"],
            [34, 103],
        ):
            for e in range(d):
                values = {
                    "VEC": b,
                    "STUDY": c,
                    "WHAT": a,
                    "PARTICIPANT": str(e),
                    "OUTPUT": "../unpacked/" + c + "_" + a + "_" + str(e) + ".csv",
                }
                # Single-pass substitution: already substituted values are
                # never re-scanned for other placeholder words.
                command = " ".join(values.get(token, token) for token in VC.split(" "))
                script.write(command + "\n")

In [7]:
%%bash

# Stop at the first failing command: unpack_data_PD.sh reads the
# ../unpacked/*_wv.csv files that unpack_data_PI.sh writes, so running it after
# a failed first step would only produce follow-up errors.
set -e
sh -e unpack_data_PI.sh
sh -e unpack_data_PD.sh

0it [00:00, ?it/s]4165it [00:00, 41648.95it/s]8272it [00:00, 41473.27it/s]12503it [00:00, 41719.69it/s]16741it [00:00, 41914.05it/s]21032it [00:00, 42205.32it/s]25345it [00:00, 42475.55it/s]29712it [00:00, 42824.82it/s]34058it [00:00, 43011.08it/s]38384it [00:00, 43083.81it/s]42733it [00:01, 43202.85it/s]47078it [00:01, 43275.45it/s]51427it [00:01, 43339.02it/s]55762it [00:01, 43341.08it/s]60070it [00:01, 43261.66it/s]64395it [00:01, 43255.95it/s]68728it [00:01, 43277.35it/s]73057it [00:01, 43280.19it/s]77376it [00:01, 43195.47it/s]81722it [00:01, 43274.35it/s]86053it [00:02, 43283.10it/s]90379it [00:02, 43114.87it/s]94689it [00:02, 33650.37it/s]98374it [00:02, 33594.87it/s]102526it [00:02, 35631.49it/s]106624it [00:02, 37082.58it/s]110835it [00:02, 38459.33it/s]115081it [00:02, 39575.87it/s]119359it [00:02, 40485.23it/s]123553it [00:03, 40910.60it/s]127693it [00:03, 40882.62it/s]131918it [00:03, 41281.15it/s]136092it [00:03, 41417.08it/s]140330it [00:

## Model training

In [8]:
# List the files directly inside ../unpacked/ (no recursion into subfolders).
# os.walk yields nothing for a missing directory, so fail with a clear message
# instead of an IndexError.
unpacked_listing = next(os.walk("../unpacked/"), None)
if unpacked_listing is None:
    raise FileNotFoundError(
        "../unpacked/ does not exist; run the dataset construction cells first"
    )
# Sorted so the generated scripts and the joint dataset do not depend on the
# order in which the operating system happens to list the directory.
files = sorted(op.join("../unpacked/", name) for name in unpacked_listing[2])

### Participant-Independent word vector

In [10]:
# Command template for PI.py; DATASET and OUTPUT are placeholders replaced per
# dataset when reproduce_PI_wv.sh is written below.
# NOTE(review): the meaning of "-a 3" and "-s 10" is defined by PI.py, which is
# not shown in this notebook.
VC = "python PI.py -d DATASET -a 3 -s 10 -o OUTPUT"

In [11]:
joint_path = "../unpacked/joint_wv.csv"
# Participant-independent word-vector datasets. A joint file left over from a
# previous run also matches "_wv.csv", so it is excluded explicitly; otherwise
# re-running the notebook would concatenate the joint data into itself and
# list it twice in the generated script.
current = [
    path for path in files
    if "_wv.csv" in path and op.basename(path) != "joint_wv.csv"
]
# Stack the per-language datasets into one joint dataset and schedule it too.
pd.concat([pd.read_csv(path, index_col=0) for path in current]).to_csv(joint_path)
current.append(joint_path)

In [12]:
# Write reproduce_PI_wv.sh: one PI.py invocation per dataset in `current`.
# The context manager closes the script even if a write fails part-way.
with open("reproduce_PI_wv.sh", "w") as script:
    script.write("#!/bin/sh\n\n\n")
    for a in current:
        # Output prefix mirrors the dataset path: ../unpacked/x.csv -> ../results/x
        output = a.replace("unpacked", "results").replace(".csv", "")
        command = VC.replace("DATASET", a).replace("OUTPUT", output)
        script.write(command + "\n")

### Participant-Independent one-hot

In [13]:
# Command template for PI.py on the one-hot datasets; DATASET and OUTPUT are
# placeholders replaced when reproduce_PI_1hot.sh is written below. It differs
# from the word-vector template only in "-a 0" instead of "-a 3".
# NOTE(review): the meaning of "-a" and "-s" is defined by PI.py, not shown here.
VC = "python PI.py -d DATASET -a 0 -s 10 -o OUTPUT"

In [14]:
# Keep only the participant-independent one-hot datasets.
current = [path for path in files if "_1hot.csv" in path]

In [15]:
# Write reproduce_PI_1hot.sh: one PI.py invocation per dataset in `current`.
# The context manager closes the script even if a write fails part-way.
with open("reproduce_PI_1hot.sh", "w") as script:
    script.write("#!/bin/sh\n\n\n")
    for a in current:
        # Output prefix mirrors the dataset path: ../unpacked/x.csv -> ../results/x
        output = a.replace("unpacked", "results").replace(".csv", "")
        command = VC.replace("DATASET", a).replace("OUTPUT", output)
        script.write(command + "\n")

### Participant-Dependent word vector

In [16]:
# Keep only the per-participant word-vector datasets (names containing "_wv_").
current = [path for path in files if "_wv_" in path]

In [17]:
# Command template for PD.py; DATASET and OUTPUT are placeholders replaced per
# dataset when reproduce_PD_wv.sh is written below.
# NOTE(review): the meaning of "-a 3" and "-s 10" is defined by PD.py, which is
# not shown in this notebook.
VC = "python PD.py -d DATASET -a 3 -s 10 -o OUTPUT"

In [18]:
# Write reproduce_PD_wv.sh: one PD.py invocation per dataset in `current`.
# The context manager closes the script even if a write fails part-way.
with open("reproduce_PD_wv.sh", "w") as script:
    script.write("#!/bin/sh\n\n\n")
    for a in current:
        # Output prefix mirrors the dataset path: ../unpacked/x.csv -> ../results/x
        output = a.replace("unpacked", "results").replace(".csv", "")
        command = VC.replace("DATASET", a).replace("OUTPUT", output)
        script.write(command + "\n")

### Participant-Dependent one-hot

In [19]:
# Keep only the per-participant one-hot datasets (names containing "_1hot_").
current = [path for path in files if "_1hot_" in path]

In [20]:
# Command template for PD.py on the one-hot datasets; DATASET and OUTPUT are
# placeholders replaced when reproduce_PD_1hot.sh is written below. It differs
# from the word-vector template only in "-a 0" instead of "-a 3".
# NOTE(review): the meaning of "-a" and "-s" is defined by PD.py, not shown here.
VC = "python PD.py -d DATASET -a 0 -s 10 -o OUTPUT"

In [21]:
# Write reproduce_PD_1hot.sh: one PD.py invocation per dataset in `current`.
# The context manager closes the script even if a write fails part-way.
with open("reproduce_PD_1hot.sh", "w") as script:
    script.write("#!/bin/sh\n\n\n")
    for a in current:
        # Output prefix mirrors the dataset path: ../unpacked/x.csv -> ../results/x
        output = a.replace("unpacked", "results").replace(".csv", "")
        command = VC.replace("DATASET", a).replace("OUTPUT", output)
        script.write(command + "\n")