# Introduction to Python and Natural Language Technologies

__Lecture 8-2, Data preparation__

__March 30, 2021__

__Judit Ács__

This is a helper notebook that downloads and prepares the dataset used in Lecture 8.

In [None]:
import pandas as pd
import numpy as np
import os
import subprocess

In [None]:
# Local checkout location for the UniMorph Hungarian repository.
# expanduser("~") resolves the home directory even when the HOME environment
# variable is not set (e.g. on Windows), where os.environ["HOME"] raises KeyError.
unimorph_path = os.path.expanduser("~") + "/repo/external/unimorph_hun/"

In [None]:
# Fetch the UniMorph Hungarian data, but only when the data file is not on disk yet,
# so that re-running the notebook does not fail on an already existing checkout.
if os.path.isfile(os.path.join(unimorph_path, "hun")):
    stdout, stderr = b"", b""
else:
    # The command is passed as an argument list (no shell), so a checkout path
    # containing spaces or shell metacharacters reaches git unchanged.
    # NOTE(review): this is an SSH remote and needs a GitHub SSH key; the HTTPS
    # remote of the same repository would allow anonymous cloning.
    pipe = subprocess.Popen(
        ["git", "clone", "git@github.com:unimorph/hun.git", unimorph_path],
        stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    stdout, stderr = pipe.communicate()
    # Fail here with git's own message instead of a later "file not found".
    if pipe.returncode != 0:
        raise RuntimeError(
            "git clone failed with exit code "
            f"{pipe.returncode}: {stderr.decode(errors='replace')}")

In [None]:
# Show what the git clone command wrote to its standard output (raw bytes).
stdout

In [None]:
# Show what the git clone command wrote to its standard error (raw bytes).
stderr

In [None]:
# Load the tab-separated UniMorph file; it has no header row, so the three
# column names are supplied here.
hun = pd.read_csv(
    f"{unimorph_path}/hun",
    sep="\t",
    names=['lemma', 'infl', 'tags'],
    skip_blank_lines=True,
)

In [None]:
# The first character of the tag string is stored as the part-of-speech marker.
hun['pos'] = hun['tags'].str.get(0)

In [None]:
# Inspect the rows whose tag string starts with 'V'.
hun.loc[hun['tags'].str.get(0).eq('V')]

In [None]:
def get_case(tags):
    """Return the second ';'-separated feature of a noun tag string.

    This notebook uses that feature as the grammatical case label.

    Parameters
    ----------
    tags : str
        Tag string such as ``"N;NOM;SG"``.

    Returns
    -------
    str or None
        The second feature when ``tags`` is a string starting with ``'N'``
        that has at least two features; ``None`` otherwise (non-noun tags,
        empty strings, missing values, or a noun tag with a single feature).
    """
    # Missing values read by pandas arrive as float NaN, not as strings.
    if not isinstance(tags, str) or not tags.startswith('N'):
        return None
    features = tags.split(';')
    # A noun tag without a second feature has no case to report.
    return features[1] if len(features) > 1 else None

# Attach the case label to every row, then keep only the noun rows.
hun = hun.assign(case=hun['tags'].apply(get_case))
hun = hun.loc[hun['pos'].eq('N')]

In [None]:
# Number of distinct case labels, followed by the frequency of each label
# (missing values are counted as well).
case_labels = hun['case']
print(case_labels.nunique())
case_labels.value_counts(dropna=False)

In [None]:
# Distinct lemmas of the remaining rows, in order of first appearance.
lemmas = hun['lemma'].unique()
(len(lemmas), type(lemmas))

In [None]:
# Reseed so the split is reproducible, then shuffle a *copy* of the lemma array.
# np.random.permutation leaves `lemmas` untouched, so re-running this cell gives
# the same split instead of reshuffling an array that was already shuffled.
np.random.seed(12)
shuffled_lemmas = np.random.permutation(lemmas)

# 80% of the lemmas go to train, 10% to dev, and the remainder to test.
# Splitting by lemma keeps all inflected forms of one lemma in a single split.
train_size = int(0.8 * len(shuffled_lemmas))
dev_size = int(0.1 * len(shuffled_lemmas))

# Sets make the later membership tests (isin) and overlap checks cheap.
train_lemmas = set(shuffled_lemmas[:train_size])
dev_lemmas = set(shuffled_lemmas[train_size:train_size + dev_size])
test_lemmas = set(shuffled_lemmas[train_size + dev_size:])

In [None]:
# Pairwise overlap sizes between the lemma splits; each is expected to be 0.
(
    len(train_lemmas.intersection(dev_lemmas)),
    len(train_lemmas.intersection(test_lemmas)),
    len(test_lemmas.intersection(dev_lemmas)),
)

In [None]:
# Select the rows belonging to each lemma split.
hun_train = hun.loc[hun['lemma'].isin(train_lemmas)]
hun_dev = hun.loc[hun['lemma'].isin(dev_lemmas)]
hun_test = hun.loc[hun['lemma'].isin(test_lemmas)]

# Row counts of the three splits.
(hun_train.shape[0], hun_dev.shape[0], hun_test.shape[0])

In [None]:
# makedirs creates missing parent directories too, so one call yields both
# "data" and "data/unimorph"; exist_ok makes the cell safe to re-run.
os.makedirs("data/unimorph", exist_ok=True)

In [None]:
# Write each split as a tab-separated file without the index column.
for split_name, split_frame in (("train", hun_train), ("dev", hun_dev), ("test", hun_test)):
    split_frame.to_csv(f"data/unimorph/hun_{split_name}.tsv", sep="\t", index=False)

In [None]:
# Preview the first five rows of the training split.
hun_train.head(n=5)