# Instructions for Reproduction

1. Run Figures_1,2,3.ipynb;
2. Run this notebook;
3. Run the generated `reproduce_*.sh` scripts (the `unpack_data_*.sh` scripts are already executed inside this notebook).

## Preparation

#### Directories

In [None]:
%%bash

# -p makes this cell idempotent: directories that already exist are not an
# error, so the notebook survives Restart & Run All.
mkdir -p ../results ../unpacked ../images

#### Images

In [1]:
import os
import cv2
from misc import *
import pandas as pd
import os.path as op
from tqdm import tqdm

In [2]:
# Load the pickled Russian and English studies (in that order) and keep the
# result of compute_word_set() for each; the image cell iterates over these.
ru, en = (
    Study.load_from_file(study_file).compute_word_set()
    for study_file in ("ru_study.pkl", "en_study.pkl")
)

In [3]:
# Render every word of both studies as a colour one-hot PNG in ../images/.
# Creating the directory here keeps the cell runnable even when the
# directory-creation cell was skipped.
os.makedirs("../images/", exist_ok=True)
for k in [ru, en]:
    for a in tqdm(k):
        u = Study.onehot(a, color=True)
        # Fold into rows of width 13 with 3 channels; integer division avoids
        # the float round-trip of int(n / 13).
        u = u.reshape(u.shape[0] // 13, 13, 3)
        target = op.join("../images/", a+".png")
        # cv2.imwrite reports failure through its return value rather than an
        # exception, so an unchecked call can silently produce no image.
        if not cv2.imwrite(target, u):
            raise IOError("could not write image " + target)

100%|██████████| 241/241 [00:00<00:00, 2456.53it/s]
100%|██████████| 241/241 [00:00<00:00, 3682.42it/s]


## Dataset construction

In [5]:
# Directory holding the aligned fastText vectors. Set the FASTTEXT_DIR
# environment variable to point elsewhere; the default is the location used
# on the original author's machine.
FASTTEXT_DIR = os.environ.get(
    "FASTTEXT_DIR", "/home/bakirillov/HDD/weights/fasttext/aligned"
)
EN_VEC = op.join(FASTTEXT_DIR, "wiki.en.align.vec")
RU_VEC = op.join(FASTTEXT_DIR, "wiki.ru.align.vec")
# Command template for vectors.py; VEC, STUDY, WHAT and OUTPUT are placeholders
# filled in by the cell that writes unpack_data_PI.sh.
VC = "python vectors.py -v VEC -s STUDY -w WHAT -o OUTPUT"

In [6]:
# Write unpack_data_PI.sh: one vectors.py call per (representation, language)
# pair. The context manager closes the file even if a write fails part-way.
with open("unpack_data_PI.sh", "w") as script:
    script.write("#!/bin/sh\n\n\n")
    for a in ["wv", "1hot"]:
        for b,c in zip([EN_VEC, RU_VEC], ["en_study.pkl", "ru_study.pkl"]):
            # Output name is <study file>_<representation>.csv in ../unpacked/
            command = VC.replace("WHAT", a).replace("VEC", b).replace(
                "STUDY", c
            ).replace("OUTPUT", "../unpacked/"+c+"_"+a+".csv")+"\n"
            script.write(command)

In [7]:
# Command template for the per-participant unpacking; every upper-case word
# after a flag is a placeholder substituted below.
VC = "python vectors.py -v VEC -s STUDY -w WHAT -p PARTICIPANT -o OUTPUT"
# Write unpack_data_PD.sh: one vectors.py call per (representation, study,
# participant index). The context manager closes the file even if a write
# fails part-way.
with open("unpack_data_PD.sh", "w") as script:
    script.write("#!/bin/sh\n\n\n")
    for a in ["wv", "1hot"]:
        # -v receives the participant-independent word-vector CSV of the same
        # study; the third list gives how many participant indices to emit
        # for each study (34 for en, 103 for ru).
        for b,c,d in zip(
            ["../unpacked/en_study.pkl_wv.csv", "../unpacked/ru_study.pkl_wv.csv"],
            ["en_study.pkl", "ru_study.pkl"], [34, 103]
        ):
            for e in range(d):
                command = VC.replace("WHAT", a).replace("VEC", b).replace(
                    "STUDY", c
                ).replace(
                    "OUTPUT", "../unpacked/"+c+"_"+a+"_"+str(e)+".csv"
                ).replace("PARTICIPANT", str(e))+"\n"
                script.write(command)

In [8]:
%%bash

# Stop at the first failing script: the participant-dependent commands read
# the ../unpacked/*_wv.csv files written by the participant-independent ones.
set -e
sh unpack_data_PI.sh
sh unpack_data_PD.sh

0it [00:00, ?it/s]4079it [00:00, 40787.51it/s]8228it [00:00, 40994.34it/s]12448it [00:00, 41347.30it/s]16699it [00:00, 41687.17it/s]20867it [00:00, 41684.31it/s]25166it [00:00, 42065.92it/s]29449it [00:00, 42291.65it/s]33742it [00:00, 42480.02it/s]38014it [00:00, 42551.29it/s]42252it [00:01, 42497.21it/s]46541it [00:01, 42613.30it/s]50866it [00:01, 42801.26it/s]55218it [00:01, 43013.20it/s]59491it [00:01, 42922.89it/s]63757it [00:01, 42754.91it/s]68014it [00:01, 42480.21it/s]72250it [00:01, 42149.87it/s]76457it [00:01, 42065.48it/s]80658it [00:01, 41875.82it/s]84884it [00:02, 41988.53it/s]89120it [00:02, 42096.69it/s]93365it [00:02, 42199.56it/s]97602it [00:02, 42250.53it/s]101827it [00:02, 42239.42it/s]106078it [00:02, 42319.07it/s]110351it [00:02, 42440.05it/s]114662it [00:02, 42636.04it/s]118935it [00:02, 42661.49it/s]123235it [00:02, 42761.18it/s]127537it [00:03, 42837.14it/s]131821it [00:03, 42542.08it/s]136099it [00:03, 42613.10it/s]140361it [00:

## Model training

In [19]:
# Paths of the non-directory entries directly inside ../unpacked/ (no
# recursion). Sorted so the generated scripts and the joint CSV do not depend
# on filesystem order; a missing directory raises FileNotFoundError.
with os.scandir("../unpacked/") as entries:
    files = sorted(
        op.join("../unpacked/", entry.name)
        for entry in entries
        if not entry.is_dir()
    )

### Participant-Independent word vector

In [5]:
# Shell command template for PI.py in the word-vector setting. DATASET and
# OUTPUT are placeholders substituted by the cell that writes
# reproduce_PI_wv.sh; the meaning of -a 3 and -s 10 is defined by PI.py,
# which is not shown in this notebook.
VC = "python PI.py -d DATASET -a 3 -s 10 -o OUTPUT"

In [20]:
# Merge the participant-independent word-vector tables into one joint dataset
# and train on it as well. The joint file's own name matches "_wv.csv", so it
# is excluded from the inputs; otherwise re-running after it exists would
# concatenate it into itself and list it twice.
joint_path = "../unpacked/joint_wv.csv"
current = [
    x for x in files
    if "_wv.csv" in x and op.basename(x) != op.basename(joint_path)
]
pd.concat([pd.read_csv(a, index_col=0) for a in current]).to_csv(joint_path)
current.append(joint_path)

In [12]:
# Emit reproduce_PI_wv.sh with one command per dataset in `current`.
# The context manager closes the file even if a write fails part-way.
with open("reproduce_PI_wv.sh", "w") as script:
    script.write("#!/bin/sh\n\n\n")
    for a in current:
        # Results mirror the dataset path: ../unpacked/x.csv -> ../results/x
        output = a.replace("unpacked", "results").replace(".csv", "")
        command = VC.replace("DATASET", a).replace("OUTPUT", output)
        script.write(command+"\n")

### Participant-Independent one-hot

In [13]:
# Shell command template for PI.py in the one-hot setting. DATASET and OUTPUT
# are placeholders substituted by the cell that writes reproduce_PI_1hot.sh;
# it differs from the word-vector template only in -a 0 (semantics defined by
# PI.py, not shown in this notebook).
VC = "python PI.py -d DATASET -a 0 -s 10 -o OUTPUT"

In [14]:
# Participant-independent one-hot datasets: every listed path whose name
# contains "_1hot.csv".
current = [path for path in files if "_1hot.csv" in path]

In [15]:
# Emit reproduce_PI_1hot.sh with one command per dataset in `current`.
# The context manager closes the file even if a write fails part-way.
with open("reproduce_PI_1hot.sh", "w") as script:
    script.write("#!/bin/sh\n\n\n")
    for a in current:
        # Results mirror the dataset path: ../unpacked/x.csv -> ../results/x
        output = a.replace("unpacked", "results").replace(".csv", "")
        command = VC.replace("DATASET", a).replace("OUTPUT", output)
        script.write(command+"\n")

### Participant-Dependent word vector

In [16]:
# Participant-dependent word-vector datasets: every listed path whose name
# contains "_wv_" (the per-participant files carry an index after it).
current = [path for path in files if "_wv_" in path]

In [17]:
# Shell command template for PD.py in the word-vector setting. DATASET and
# OUTPUT are placeholders substituted by the cell that writes
# reproduce_PD_wv.sh; the meaning of -a 3 and -s 10 is defined by PD.py,
# which is not shown in this notebook.
VC = "python PD.py -d DATASET -a 3 -s 10 -o OUTPUT"

In [18]:
# Emit reproduce_PD_wv.sh with one command per dataset in `current`.
# The context manager closes the file even if a write fails part-way.
with open("reproduce_PD_wv.sh", "w") as script:
    script.write("#!/bin/sh\n\n\n")
    for a in current:
        # Results mirror the dataset path: ../unpacked/x.csv -> ../results/x
        output = a.replace("unpacked", "results").replace(".csv", "")
        command = VC.replace("DATASET", a).replace("OUTPUT", output)
        script.write(command+"\n")

### Participant-Dependent one-hot

In [19]:
# Participant-dependent one-hot datasets: every listed path whose name
# contains "_1hot_" (the per-participant files carry an index after it).
current = [path for path in files if "_1hot_" in path]

In [20]:
# Shell command template for PD.py in the one-hot setting. DATASET and OUTPUT
# are placeholders substituted by the cell that writes reproduce_PD_1hot.sh;
# it differs from the word-vector template only in -a 0 (semantics defined by
# PD.py, not shown in this notebook).
VC = "python PD.py -d DATASET -a 0 -s 10 -o OUTPUT"

In [21]:
# Emit reproduce_PD_1hot.sh with one command per dataset in `current`.
# The context manager closes the file even if a write fails part-way.
with open("reproduce_PD_1hot.sh", "w") as script:
    script.write("#!/bin/sh\n\n\n")
    for a in current:
        # Results mirror the dataset path: ../unpacked/x.csv -> ../results/x
        output = a.replace("unpacked", "results").replace(".csv", "")
        command = VC.replace("DATASET", a).replace("OUTPUT", output)
        script.write(command+"\n")