## Setup

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!git clone -b legacy https://github.com/OpenNMT/OpenNMT-py
!pip install OpenNMT-py
import os
outdir = 'drive/MyDrive/EnglishToleranceBaseline/output'
if not os.path.exists(outdir):
    os.makedirs(outdir)
    
!pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html

Mounted at /content/drive
Cloning into 'OpenNMT-py'...
remote: Enumerating objects: 134, done.[K
remote: Counting objects: 100% (134/134), done.[K
remote: Compressing objects: 100% (115/115), done.[K
remote: Total 17083 (delta 76), reused 46 (delta 19), pack-reused 16949[K
Receiving objects: 100% (17083/17083), 273.24 MiB | 18.98 MiB/s, done.
Resolving deltas: 100% (12297/12297), done.
Collecting OpenNMT-py
[?25l  Downloading https://files.pythonhosted.org/packages/e5/62/2c50d622c24cdce54523ec64051511793661ec14d396e05875597befa00d/OpenNMT_py-2.0.1-py3-none-any.whl (207kB)
[K     |████████████████████████████████| 215kB 4.1MB/s 
[?25hCollecting waitress==1.4.4
[?25l  Downloading https://files.pythonhosted.org/packages/26/d1/5209fb8c764497a592363c47054436a515b47b8c3e4970ddd7184f088857/waitress-1.4.4-py2.py3-none-any.whl (58kB)
[K     |████████████████████████████████| 61kB 8.3MB/s 
Collecting torch==1.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/38/53/914885a9

## Generate Data

In [None]:
# One identifier per (training-set size, split index) pair, written as
# '<size>_<split>': six sizes times ten splits, grouped by size.
datasizes = [
    '{}_{}'.format(size_label, split_index)
    for size_label in ('100', '200', '400', '600', '800', '1000')
    for split_index in range(10)
]

In [None]:
import numpy as np
import random
import operator
# Seed Python's `random` module before any shuffling; the data-generation loop
# in this cell reorders every file's lines with random.shuffle.
random.seed(1)

def preprocess_line(line):
  """Convert one raw data line into spaced-out source and target strings.

  The line is split on whitespace. The second field is used as the source
  form and the fourth as the target form; each is lower-cased and its
  characters are joined with single spaces.

  Returns:
    A (source, target) tuple of strings.

  Raises:
    IndexError: if the line has fewer than four whitespace-separated fields.
  """
  fields = line.split()
  # Field positions 1 and 3 hold the source and target forms respectively.
  spaced = [' '.join(fields[position].lower()) for position in (1, 3)]
  return spaced[0], spaced[1]

def _write_split(raw_path, src_path, tgt_path):
  """Shuffle the lines of one raw file and write parallel source/target files.

  Each non-blank line of `raw_path` is passed through `preprocess_line`; the
  resulting source string goes to `src_path` and the target string to
  `tgt_path`, one per line and in the same (shuffled) order in both files.
  """
  # UTF-8 is explicit because the forms are IPA text (non-ASCII characters).
  # NOTE(review): assumes the raw files are UTF-8 encoded - confirm.
  with open(raw_path, 'r', encoding='utf-8') as raw_file:
    lines = raw_file.readlines()
  random.shuffle(lines)

  with open(src_path, 'w', encoding='utf-8') as src_file, \
       open(tgt_path, 'w', encoding='utf-8') as tgt_file:
    for line in lines:
      # A blank line (e.g. a trailing newline at end of file) has no fields
      # for preprocess_line to index, so skip it instead of crashing.
      if not line.strip():
        continue
      src, tgt = preprocess_line(line)
      print(src, file=src_file)
      print(tgt, file=tgt_file)


for datasize in datasizes:
  new_data_path = f'drive/MyDrive/EnglishToleranceBaseline/processed_data/{datasize}/'
  os.makedirs(new_data_path, exist_ok=True)

  raw_dir = 'drive/MyDrive/EnglishToleranceBaseline/raw_data'
  # datasize is '<size>_<split>'; dev and test files are named by split only.
  split_id = datasize.split("_")[1]

  # (raw file, output name part). The train file name has no underscore
  # between "train" and the datasize. The order train -> val -> test is kept
  # so the seeded shuffles are consumed in the same sequence as before.
  split_files = [
    (f'{raw_dir}/unimorph_celex0_train{datasize}.txt', 'train'),
    (f'{raw_dir}/unimorph_celex0_dev_{split_id}.txt', 'val'),
    (f'{raw_dir}/unimorph_celex0_test_{split_id}.txt', 'test'),
  ]
  for raw_path, part in split_files:
    _write_split(raw_path,
                 f'{new_data_path}english-src-{part}.txt',
                 f'{new_data_path}english-tgt-{part}.txt')

## Preprocess Data

In [None]:
for datasize in datasizes:
  datadir = f'drive/MyDrive/EnglishToleranceBaseline/processed_data/{datasize}'
  !python OpenNMT-py/preprocess.py -train_src $datadir/english-src-train.txt -train_tgt $datadir/english-tgt-train.txt -valid_src $datadir/english-src-val.txt -valid_tgt $datadir/english-tgt-val.txt -save_data $datadir/processed

## Train

In [None]:
for datasize in datasizes:
  epochs, n_examples, batchsize,  = 100, int(datasize.split('_')[0]), 20
  steps = str(int(epochs * n_examples / batchsize))

  datadir = f'drive/MyDrive/EnglishToleranceBaseline/processed_data/{datasize}'
  rnn_modelpath = f'{outdir}/english_rnn_model_{datasize}'
  rnn_train_args = ' '.join([
    f'-data {datadir}/processed',
    '-save_model '+rnn_modelpath,
    '-enc_layers 2',
    '-dec_layers 2',
    '-rnn_size 100',
    '-batch_size 20',
    '-word_vec_size 300',
    '-gpu_ranks 0',
    '-train_steps '+steps,
    '-save_checkpoint_steps '+steps
    ])

  !python OpenNMT-py/train.py $rnn_train_args

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[2021-01-30 02:27:28,413 INFO] number of examples: 800
[2021-01-30 02:27:28,611 INFO] Step 850/ 4000; acc:  86.68; ppl:  1.63; xent: 0.49; lr: 1.00000; 5092/7274 tok/s;     17 sec
[2021-01-30 02:27:29,197 INFO] Loading dataset from drive/MyDrive/EnglishToleranceBaseline/processed_data/800_5/processed.train.0.pt
[2021-01-30 02:27:29,204 INFO] number of examples: 800
[2021-01-30 02:27:29,600 INFO] Step 900/ 4000; acc:  89.32; ppl:  1.53; xent: 0.42; lr: 1.00000; 4771/6865 tok/s;     18 sec
[2021-01-30 02:27:29,993 INFO] Loading dataset from drive/MyDrive/EnglishToleranceBaseline/processed_data/800_5/processed.train.0.pt
[2021-01-30 02:27:30,001 INFO] number of examples: 800
[2021-01-30 02:27:30,577 INFO] Step 950/ 4000; acc:  91.06; ppl:  1.41; xent: 0.34; lr: 1.00000; 5140/7292 tok/s;     19 sec
[2021-01-30 02:27:30,751 INFO] Loading dataset from drive/MyDrive/EnglishToleranceBaseline/processed_data/800_5/processed.train.0

## Predict on the Test Set

In [None]:
for datasize in datasizes:
  epochs, n_examples, batchsize,  = 100, int(datasize.split('_')[0]), 20
  steps = str(int(epochs * n_examples / batchsize))
  datadir = f'drive/MyDrive/EnglishToleranceBaseline/processed_data/{datasize}'
  rnn_modelpath = f'{outdir}/english_rnn_model_{datasize}'
  rnn_trans_args = ' '.join([
    '-model '+rnn_modelpath+'_step_'+steps+'.pt',
    f'-src {datadir}/english-src-test.txt',
    f'-output {outdir}/english-rnn-{datasize}-pred.txt',
    '-replace_unk -verbose',
    '-beam_size 12'
    ])
  !python OpenNMT-py/translate.py $rnn_trans_args


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

[2021-01-30 02:53:28,092 INFO] 
SENT 3: ['p', 'i', 'k']
PRED 3: p i k t
PRED SCORE: -0.0009

[2021-01-30 02:53:28,092 INFO] 
SENT 4: ['ʧ', 'æ', 't']
PRED 4: ʧ æ t ɪ d
PRED SCORE: -0.0006

[2021-01-30 02:53:28,092 INFO] 
SENT 5: ['s', 'k', 'ɪ', 'd']
PRED 5: s k ɪ d ɪ d
PRED SCORE: -0.4805

[2021-01-30 02:53:28,092 INFO] 
SENT 6: ['s', 'ə', 'b', 't', 'ɹ', 'æ', 'k', 't']
PRED 6: s ə b t ɹ æ k t ɪ d
PRED SCORE: -0.0014

[2021-01-30 02:53:28,093 INFO] 
SENT 7: ['n', 'o', 'ʊ']
PRED 7: n o ʊ d
PRED SCORE: -0.0000

[2021-01-30 02:53:28,093 INFO] 
SENT 8: ['ɪ', 'n', 'f', 'ɔ', 'ɹ', 'm']
PRED 8: ɪ n f ɔ ɹ m d
PRED SCORE: -0.0002

[2021-01-30 02:53:28,093 INFO] 
SENT 9: ['s', 't', 'r', 'ə', 'g', 'ə', 'l']
PRED 9: s t r ə g ə l d
PRED SCORE: -0.0000

[2021-01-30 02:53:28,093 INFO] 
SENT 10: ['ʃ', 'ʌ', 'v', 'ə', 'l']
PRED 10: ʃ ʌ v ə l d
PRED SCORE: -0.0005

[2021-01-30 02:53:28,093 INFO] 
SENT 11: ['k', 'r', 'i', 'e', 'ɪ', 't']
PRED 

## Evaluate English

In [None]:
from collections import Counter

def _read_lines(path):
  """Return the lines of the text file at `path`, without line terminators."""
  # `with` closes the handle as soon as the file is read (six files are read
  # per data split). UTF-8 is explicit because the forms are IPA text.
  with open(path, 'r', encoding='utf-8') as handle:
    return handle.read().splitlines()


results = []
for datasize in datasizes:
  print('\nData type', datasize)
  datadir = f'drive/MyDrive/EnglishToleranceBaseline/processed_data/{datasize}'

  train_tgt_lines = _read_lines(f'{datadir}/english-tgt-train.txt')
  train_src_lines = _read_lines(f'{datadir}/english-src-train.txt')

  # Training (source, target) pairs with the per-character spacing removed.
  train_pairs = list(zip([t.replace(' ','') for t in train_src_lines],
                        [t.replace(' ','') for t in train_tgt_lines]))

  predlines = _read_lines(f'{outdir}/english-rnn-{datasize}-pred.txt')
  test_src_lines = _read_lines(f'{datadir}/english-src-test.txt')
  test_tgt_lines = _read_lines(f'{datadir}/english-tgt-test.txt')

  # One (gold target, predicted target, source) triple per test item.
  scored = [(tgt.strip(), pred.strip(), tst.strip())
            for tst, tgt, pred in zip(test_src_lines, test_tgt_lines, predlines)]

  # Exact-match accuracy of the prediction against the gold target.
  test_accuracy = sum(gold == learned for gold, learned, source in scored)/len(scored)
  print('Test accuracy:', test_accuracy)
  # Share of test items whose (non-empty) prediction ends in 'd' or 't' and
  # is not simply a copy of the source.
  ed_predicted = sum(learned[-1] in ['d','t'] and learned != source
                     for gold, learned, source in scored if learned)/len(scored)
  print('-ed predicted:', ed_predicted)

  # Frequency test
  # For each verb in the test set, take its final `c` characters, look up the
  # most frequent final `c` characters among the targets of training verbs
  # with the same source ending, and check whether the prediction ends the
  # same way.
  c = 1
  # Tally target endings per source ending once, instead of rescanning the
  # whole training set for every test verb. Counts are accumulated in training
  # order, so ties in most_common resolve exactly as a per-verb scan would.
  ending_counts = {}
  for train_source, train_target in train_pairs:
    ending_counts.setdefault(train_source[-c:], Counter())[train_target[-c:]] += 1
  popular_by_ending = {ending: counts.most_common(1)[0][0]
                       for ending, counts in ending_counts.items()}

  verbs_with_inflections_matching_train = []
  for i,verb in enumerate(test_src_lines):
    verb = verb.replace(' ','')
    ending = verb[-c:]
    predicted_inflection = predlines[i].replace(' ','')[-c:]
    # No training verb shares this ending: the item cannot match, but it
    # still counts in the denominator below.
    if ending not in popular_by_ending: continue
    popular_inflection = popular_by_ending[ending]
    if predicted_inflection == popular_inflection:
      verbs_with_inflections_matching_train.append((verb, predicted_inflection, popular_inflection))

  train_match = len(verbs_with_inflections_matching_train)/len(test_src_lines)
  print(f'% of inflections (len {c}) that match most popular in training', train_match)
  size, split = tuple(datasize.split('_'))
  results.append((size,split, test_accuracy, ed_predicted, train_match))

# Emit all rows as CSV after the per-split report.
print()
print('datasize, split, test_accuracy, ed_predicted, train_match')
for row in results:
  print(','.join(str(x) for x in row))


Data type 100_0
Test accuracy: 0.015
-ed predicted: 0.745
% of inflections (len 1) that match most popular in training 0.53

Data type 100_1
Test accuracy: 0.01
-ed predicted: 0.87
% of inflections (len 1) that match most popular in training 0.5

Data type 100_2
Test accuracy: 0.005
-ed predicted: 0.86
% of inflections (len 1) that match most popular in training 0.57

Data type 100_3
Test accuracy: 0.025
-ed predicted: 0.945
% of inflections (len 1) that match most popular in training 0.58

Data type 100_4
Test accuracy: 0.015
-ed predicted: 0.81
% of inflections (len 1) that match most popular in training 0.54

Data type 100_5
Test accuracy: 0.015
-ed predicted: 0.865
% of inflections (len 1) that match most popular in training 0.525

Data type 100_6
Test accuracy: 0.005
-ed predicted: 0.905
% of inflections (len 1) that match most popular in training 0.56

Data type 100_7
Test accuracy: 0.015
-ed predicted: 0.88
% of inflections (len 1) that match most popular in training 0.52

Data