# Instructions for Reproduction

1. Run `Figures_1,2,3.ipynb`.
2. Run this notebook.
3. Run the `reproduce_all.sh` script.

## Preparation

#### Directories

In [None]:
%%bash

# Create the directories the rest of this notebook writes into.
# -p makes the cell re-runnable: plain mkdir errors out when a directory
# already exists.
mkdir -p ../results
mkdir -p ../unpacked
mkdir -p ../images

#### Images

In [20]:
import os
import cv2
from misc import *
import pandas as pd
import os.path as op
from tqdm import tqdm

In [21]:
# Load the Russian and English studies (in that order) and keep the result of
# compute_word_set() for each.
# NOTE(review): the .pkl extension suggests pickle is used underneath -- only
# load study files from a trusted source.
ru, en = (
    Study.load_from_file(study_file).compute_word_set()
    for study_file in ("ru_study.pkl", "en_study.pkl")
)

In [23]:
# Write one PNG per word of both studies into ../images/.
# cv2.imwrite does not create missing directories and only signals failure
# through its boolean return value, so make sure the target exists and check
# the result instead of silently producing no files.
os.makedirs("../images/", exist_ok=True)
for word_set in [ru, en]:
    for word in tqdm(word_set):
        onehot = Study.onehot(word, color=True)
        # Integer floor division keeps the dimension an exact int (the float
        # division + int() it replaces can lose precision on large sizes).
        # NOTE(review): 13 is presumably the per-row width of the encoding --
        # confirm against Study.onehot.
        image = onehot.reshape(onehot.shape[0] // 13, 13, 3)
        target = op.join("../images/", word + ".png")
        if not cv2.imwrite(target, image):
            raise IOError("cv2.imwrite failed to write " + target)

100%|██████████| 241/241 [00:00<00:00, 2496.40it/s]
100%|██████████| 241/241 [00:00<00:00, 2743.41it/s]


## Dataset construction

In [1]:
# Locations of the aligned fastText vector files passed to vectors.py via -v.
# The defaults are the original author's local paths; set FASTTEXT_EN_VEC /
# FASTTEXT_RU_VEC in the environment to point at your own copies without
# editing the notebook.
EN_VEC = os.environ.get(
    "FASTTEXT_EN_VEC",
    "/home/bakirillov/HDD/weights/fasttext/aligned/wiki.en.align.vec",
)
RU_VEC = os.environ.get(
    "FASTTEXT_RU_VEC",
    "/home/bakirillov/HDD/weights/fasttext/aligned/wiki.ru.align.vec",
)
# Command template; VEC, STUDY, WHAT and OUTPUT are placeholders substituted
# with str.replace when the unpacking script is generated.
VC = "python vectors.py -v VEC -s STUDY -w WHAT -o OUTPUT"

In [2]:
# Generate unpack_data_PI.sh: one vectors.py call per (representation, study)
# pair, each writing ../unpacked/<study>_<representation>.csv.
# The context manager guarantees the script is flushed and closed even if
# building a command raises.
with open("unpack_data_PI.sh", "w") as script:
    script.write("#!/bin/sh\n\n\n")
    for what in ["wv", "1hot"]:
        for vec_path, study in zip([EN_VEC, RU_VEC], ["en_study.pkl", "ru_study.pkl"]):
            output = "../unpacked/" + study + "_" + what + ".csv"
            # Substitution order matters: the placeholders are plain words, so
            # each one is replaced before later values are inserted.
            command = (
                VC.replace("WHAT", what)
                .replace("VEC", vec_path)
                .replace("STUDY", study)
                .replace("OUTPUT", output)
                + "\n"
            )
            script.write(command)

In [3]:
# Generate unpack_data_PD.sh: one vectors.py call per (representation, study,
# participant index), each writing
# ../unpacked/<study>_<representation>_<participant>.csv.
# The -v argument here is the word-vector CSV that unpack_data_PI.sh writes
# for the same study.
VC = "python vectors.py -v VEC -s STUDY -w WHAT -p PARTICIPANT -o OUTPUT"
# The context manager guarantees the script is flushed and closed even if
# building a command raises.
with open("unpack_data_PD.sh", "w") as script:
    script.write("#!/bin/sh\n\n\n")
    for what in ["wv", "1hot"]:
        # NOTE(review): 34 and 103 are presumably the participant counts of the
        # English and Russian studies -- confirm against the study files.
        for vec_csv, study, n_participants in zip(
            ["../unpacked/en_study.pkl_wv.csv", "../unpacked/ru_study.pkl_wv.csv"],
            ["en_study.pkl", "ru_study.pkl"],
            [34, 103],
        ):
            for participant in range(n_participants):
                output = (
                    "../unpacked/" + study + "_" + what + "_" + str(participant) + ".csv"
                )
                command = (
                    VC.replace("WHAT", what)
                    .replace("VEC", vec_csv)
                    .replace("STUDY", study)
                    .replace("OUTPUT", output)
                    .replace("PARTICIPANT", str(participant))
                    + "\n"
                )
                script.write(command)

In [4]:
%%bash

# Abort the cell as soon as one script exits with a non-zero status:
# unpack_data_PD.sh reads the ../unpacked/*_wv.csv files that
# unpack_data_PI.sh produces, so it must not run after a failed first step.
set -e

sh unpack_data_PI.sh
sh unpack_data_PD.sh

0it [00:00, ?it/s]4360it [00:00, 43598.27it/s]8781it [00:00, 43779.46it/s]13297it [00:00, 44184.13it/s]17764it [00:00, 44328.42it/s]22282it [00:00, 44579.95it/s]26812it [00:00, 44793.24it/s]31334it [00:00, 44920.34it/s]35813it [00:00, 44880.08it/s]40340it [00:00, 44995.59it/s]44826it [00:01, 44954.65it/s]49384it [00:01, 45138.60it/s]53931it [00:01, 45234.85it/s]58446it [00:01, 45207.86it/s]62978it [00:01, 45241.26it/s]67502it [00:01, 45240.77it/s]72006it [00:01, 45132.53it/s]76568it [00:01, 45275.21it/s]81098it [00:01, 45281.24it/s]85638it [00:01, 45314.35it/s]90196it [00:02, 45391.21it/s]94766it [00:02, 45481.06it/s]99330it [00:02, 45526.33it/s]103882it [00:02, 45474.84it/s]108429it [00:02, 45396.83it/s]112982it [00:02, 45433.95it/s]117525it [00:02, 45370.70it/s]122062it [00:02, 45281.01it/s]126590it [00:02, 45254.47it/s]131129it [00:02, 45293.80it/s]135659it [00:03, 45231.78it/s]140212it [00:03, 45320.75it/s]144745it [00:03, 45212.57it/s]149282it [00

## Model training

In [6]:
# Paths of every non-directory entry directly inside ../unpacked/.
# os.scandir looks at the top level only (no need to walk the whole tree),
# raises a clear FileNotFoundError when the directory is missing, and the
# result is sorted so the generated scripts do not depend on the
# filesystem's arbitrary listing order.
with os.scandir("../unpacked/") as entries:
    files = sorted(
        op.join("../unpacked/", entry.name)
        for entry in entries
        if not entry.is_dir()
    )

### Participant-Independent word vector

In [7]:
# Command template for the participant-independent word-vector runs.
# DATASET and OUTPUT are placeholders filled in when the script is generated;
# the meaning of -a and -s is defined by PI.py and is not visible here.
VC = (
    "python PI.py"
    " -d DATASET"
    " -a 3"
    " -s 10"
    " -o OUTPUT"
)

In [8]:
# Select the participant-independent word-vector CSVs and build their
# concatenation as ../unpacked/joint_wv.csv, which is then trained on as well.
# A joint file left over from an earlier run also matches "_wv.csv"; it is
# excluded so that re-running the notebook does not concatenate the old joint
# data into the new one or list the joint dataset twice.
current = [
    path
    for path in files
    if "_wv.csv" in path and op.basename(path) != "joint_wv.csv"
]
pd.concat([pd.read_csv(path) for path in current]).to_csv("../unpacked/joint_wv.csv")
current.append("../unpacked/joint_wv.csv")

In [9]:
# Write reproduce_PI_wv.sh with one training command per dataset in `current`.
# This cell only generates the script. The context manager guarantees the
# file is flushed and closed even if building a command raises.
with open("reproduce_PI_wv.sh", "w") as script:
    script.write("#!/bin/sh\n\n\n")
    for dataset in current:
        # ../unpacked/<name>.csv -> ../results/<name>
        output = dataset.replace("unpacked", "results").replace(".csv", "")
        command = VC.replace("DATASET", dataset).replace("OUTPUT", output)
        script.write(command + "\n")

### Participant-Independent one-hot

In [16]:
# Command template for the participant-independent one-hot runs.
# DATASET and OUTPUT are placeholders filled in when the script is generated;
# the meaning of -a and -s is defined by PI.py and is not visible here.
VC = (
    "python PI.py"
    " -d DATASET"
    " -a 0"
    " -s 10"
    " -o OUTPUT"
)

In [17]:
# Participant-independent one-hot datasets: paths containing "_1hot.csv".
current = [path for path in files if "_1hot.csv" in path]

In [18]:
# Write reproduce_PI_1hot.sh with one training command per dataset in
# `current`. This cell only generates the script. The context manager
# guarantees the file is flushed and closed even if building a command raises.
with open("reproduce_PI_1hot.sh", "w") as script:
    script.write("#!/bin/sh\n\n\n")
    for dataset in current:
        # ../unpacked/<name>.csv -> ../results/<name>
        output = dataset.replace("unpacked", "results").replace(".csv", "")
        command = VC.replace("DATASET", dataset).replace("OUTPUT", output)
        script.write(command + "\n")

### Participant-Dependent word vector

In [25]:
# Participant-dependent word-vector datasets: paths containing "_wv_"
# (the per-participant files are named <study>_wv_<participant>.csv).
current = [path for path in files if "_wv_" in path]

In [26]:
# Command template for the participant-dependent word-vector runs.
# DATASET and OUTPUT are placeholders filled in when the script is generated;
# the meaning of -a and -s is defined by PD.py and is not visible here.
VC = (
    "python PD.py"
    " -d DATASET"
    " -a 3"
    " -s 10"
    " -o OUTPUT"
)

In [27]:
# Write reproduce_PD_wv.sh with one training command per dataset in `current`.
# This cell only generates the script. The context manager guarantees the
# file is flushed and closed even if building a command raises.
with open("reproduce_PD_wv.sh", "w") as script:
    script.write("#!/bin/sh\n\n\n")
    for dataset in current:
        # ../unpacked/<name>.csv -> ../results/<name>
        output = dataset.replace("unpacked", "results").replace(".csv", "")
        command = VC.replace("DATASET", dataset).replace("OUTPUT", output)
        script.write(command + "\n")

### Participant-Dependent one-hot

In [28]:
# Participant-dependent one-hot datasets: paths containing "_1hot_"
# (the per-participant files are named <study>_1hot_<participant>.csv).
current = [path for path in files if "_1hot_" in path]

In [29]:
# Command template for the participant-dependent one-hot runs.
# DATASET and OUTPUT are placeholders filled in when the script is generated;
# the meaning of -a and -s is defined by PD.py and is not visible here.
VC = (
    "python PD.py"
    " -d DATASET"
    " -a 0"
    " -s 10"
    " -o OUTPUT"
)

In [30]:
# Write reproduce_PD_1hot.sh with one training command per dataset in
# `current`. This cell only generates the script. The context manager
# guarantees the file is flushed and closed even if building a command raises.
with open("reproduce_PD_1hot.sh", "w") as script:
    script.write("#!/bin/sh\n\n\n")
    for dataset in current:
        # ../unpacked/<name>.csv -> ../results/<name>
        output = dataset.replace("unpacked", "results").replace(".csv", "")
        command = VC.replace("DATASET", dataset).replace("OUTPUT", output)
        script.write(command + "\n")