In [2]:
import pandas as pd

# Language Modelling

RNN language models (with LSTM cells) were trained on monolingual word lists in 2 configurations: 

- one-hot embeddings
- phonetic vectors

##  Experimental Settings

- LSTM-based LM implementation: a modification of TensorFlow's PTB LM sample code
- Configuration: 
  - init_scale = 0.1
  - learning_rate = 1.0
  - max_grad_norm = 5
  - num_layers = 2
  - num_steps = 20
  - hidden_size = 200
  - max_epoch = 4
  - max_max_epoch = 13
  - keep_prob = 1.0
  - lr_decay = 0.5
  - batch_size = 20
  - GradientDescent Optimizer

**Note**

- For characters which don't have phonetic embeddings, one-hot embeddings were used
- LMs were trained with varying amounts of training corpora (number of words)

## Results 
**(See tables below)**

_Experiment name: 1-train-size_

- The perplexity of the models trained with the phonetic representation is substantially lower than that of the models trained with the one-hot representation.
- As the training-data size increases, the phonetic representation appears to do progressively better.
- Corpus normalization reduces perplexity. The original NEWS corpus encodes each nukta-adjoined character as a single codepoint. Normalization separates the nukta from the base character, and a phonetic embedding is clearly useful in that case.


In [42]:
# Path of the language-modelling results file that the next cell loads.
lm_results_fname = 'lm_results.csv'

In [43]:
# Load the pipe-delimited LM results. Column names are supplied explicitly,
# so pandas treats every line of the file as data rather than as a header.
lm_results = pd.read_csv(
    lm_results_fname,
    sep='|',
    names=['set', 'size', 'exp', 'lang', 'perplexity'],
)

**Dataset: CoNLL 2016 paper dataset**

In [44]:
# Perplexity for the 'conll16' set: one row per training size, one column per
# (lang, exp) pair. pivot_table averages any duplicate entries (default aggfunc).
(
    lm_results
    .loc[lm_results['set'] == 'conll16']
    .pivot_table(values=['perplexity'], index=['size'], columns=['lang', 'exp'])
)

Unnamed: 0_level_0,perplexity,perplexity,perplexity,perplexity,perplexity,perplexity,perplexity,perplexity
lang,bn,bn,hi,hi,kn,kn,ta,ta
exp,onehot,phonetic,onehot,phonetic,onehot,phonetic,onehot,phonetic
size,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
1000,15.5,15.367,16.774,16.661,14.657,14.648,12.966,13.211
2000,13.923,13.89,15.369,15.217,12.398,12.468,11.042,11.412
5000,12.43,12.463,13.7,13.872,11.783,11.341,9.672,9.657
15000,12.687,11.923,13.7,12.79,10.918,9.916,9.706,8.842
25000,12.031,11.126,13.067,12.258,9.771,9.141,8.78,8.376
35000,11.577,10.693,12.571,11.636,9.22,8.675,8.379,8.106


**Dataset: Old NEWS 2012 dataset (the one used by Gurneet in his experiments)**

In [45]:
# Perplexity for the 'news12_old' set: one row per training size, one column per
# (lang, exp) pair. pivot_table averages any duplicate entries (default aggfunc).
(
    lm_results
    .loc[lm_results['set'] == 'news12_old']
    .pivot_table(values=['perplexity'], index=['size'], columns=['lang', 'exp'])
)

Unnamed: 0_level_0,perplexity,perplexity,perplexity,perplexity
lang,hi,hi,kn,kn
exp,onehot,phonetic,onehot,phonetic
size,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
1000,17.73,17.309,12.436,12.283
2000,15.629,15.206,11.241,11.152
5000,13.095,12.981,9.875,9.876
8000,12.577,12.188,9.498,9.285
10000,12.313,11.999,9.388,9.209
13000,12.223,11.783,9.135,8.942


**Dataset: NEWS 2012 dataset (Old NEWS 2012 corpus normalized)**

In [46]:
# Perplexity for the 'news12' set: one row per training size, one column per
# (lang, exp) pair. pivot_table averages any duplicate entries (default aggfunc).
(
    lm_results
    .loc[lm_results['set'] == 'news12']
    .pivot_table(values=['perplexity'], index=['size'], columns=['lang', 'exp'])
)

Unnamed: 0_level_0,perplexity,perplexity,perplexity,perplexity
lang,hi,hi,kn,kn
exp,onehot,phonetic,onehot,phonetic
size,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
1000,17.524,16.789,12.494,12.315
2000,14.879,15.157,11.159,11.142
5000,12.704,12.735,9.926,9.882
8000,12.285,11.869,9.511,9.488
10000,12.017,11.698,9.368,9.111
13000,12.036,11.209,9.289,8.773


# Supervised Transliteration 

In [60]:
# Load the pipe-delimited supervised-transliteration results. The file is read
# without a header row (header=None) and the column names are given here;
# index_col=False stops pandas from using the first column as the index.
supxlit_rtable = pd.read_csv(
    'sup_transliteration.csv',
    sep='|',
    header=None,
    index_col=False,
    names=[
        'model', 'exp', 'representation', 'slang', 'tlang',
        'a1', 'f1', 'mrr', 'mapref', 'a10',
    ],
)
supxlit_rtable

Unnamed: 0,model,exp,representation,slang,tlang,a1,f1,mrr,mapref,a10
0,2,2_bisup_nomono_again,onehot,hi,kn,0.191734,0.755251,0.191734,0.191734,0.191734
1,4,4_bisup_mono,onehot,ta,kn,0.213756,0.760105,0.213756,0.213756,0.213756
2,4,4_bisup_mono,onehot,hi,kn,0.332656,0.835045,0.332656,0.332656,0.332656
3,4,4_bisup_mono,onehot,bn,hi,0.209505,0.770459,0.209505,0.209505,0.209505
4,4,4_bisup_mono,phonetic,ta,kn,0.20499,0.745852,0.20499,0.20499,0.20499
5,4,4_bisup_mono,phonetic,hi,kn,0.29607,0.832345,0.29607,0.29607,0.29607
6,4,4_bisup_mono,phonetic,bn,hi,0.217537,0.765092,0.217537,0.217537,0.217537
7,4,4_2_sumloss_again4,phonetic,hi,kn,0.29336,0.784806,0.29336,0.29336,0.29336
8,3,3_sup_mono,onehot,ta,kn,0.227916,0.766578,0.227916,0.227916,0.227916
9,3,3_sup_mono,onehot,hi,kn,0.325881,0.84518,0.325881,0.325881,0.325881


In [57]:
# Leave out the experiments named below, then tabulate a1 and f1 for every
# (slang, tlang, exp) combination with one column per representation.
z = supxlit_rtable.loc[
    ~supxlit_rtable['exp'].isin([
        '4_2_sumloss_again2',
        '4_2_sumloss_again3',
        '4_2_sumloss_again4',
        '2_bisup_nomono_again',
        '1_newrep',
    ])
]
z1 = z.pivot_table(
    values=['a1', 'f1'],
    index=['slang', 'tlang', 'exp'],
    columns=['representation'],
)
z1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,a1,a1,f1,f1
Unnamed: 0_level_1,Unnamed: 1_level_1,representation,onehot,phonetic,onehot,phonetic
slang,tlang,exp,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
bn,hi,1_sup_nomono,0.115797,0.129183,0.686696,0.693187
bn,hi,2_bisup_nomono,0.110442,0.057564,0.685726,0.589711
bn,hi,3_sup_mono,0.222892,0.21419,0.771576,0.773675
bn,hi,4_2_sumloss,0.117805,0.147256,0.688398,0.695617
bn,hi,4_3_ll_rep_loss,0.105087,0.137216,0.67801,0.697443
bn,hi,4_bisup_mono,0.209505,0.217537,0.770459,0.765092
hi,kn,1_sup_nomono,0.276423,0.29336,0.780671,0.784648
hi,kn,2_bisup_nomono,0.230352,0.173442,0.774275,0.722633
hi,kn,3_sup_mono,0.325881,0.356369,0.84518,0.850064
hi,kn,4_2_sumloss,0.279133,0.28252,0.793678,0.779694


In [55]:
# Keep only the '3_sup_mono' and '4_bisup_mono' experiments and compare a1/f1
# across representations for each (slang, tlang, exp) combination.
za = supxlit_rtable.loc[
    supxlit_rtable['exp'].isin(['3_sup_mono', '4_bisup_mono'])
]
za1 = za.pivot_table(
    values=['a1', 'f1'],
    index=['slang', 'tlang', 'exp'],
    columns=['representation'],
)
za1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,a1,a1,f1,f1
Unnamed: 0_level_1,Unnamed: 1_level_1,representation,onehot,phonetic,onehot,phonetic
slang,tlang,exp,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
bn,hi,3_sup_mono,0.222892,0.21419,0.771576,0.773675
bn,hi,4_bisup_mono,0.209505,0.217537,0.770459,0.765092
hi,kn,3_sup_mono,0.325881,0.356369,0.84518,0.850064
hi,kn,4_bisup_mono,0.332656,0.29607,0.835045,0.832345
ta,kn,3_sup_mono,0.227916,0.227242,0.766578,0.76992
ta,kn,4_bisup_mono,0.213756,0.20499,0.760105,0.745852


In [61]:
# Keep only the rows whose 'model' column equals 3 and compare a1/f1 across
# representations for each (slang, tlang, exp) combination.
zb = supxlit_rtable.loc[supxlit_rtable['model'] == 3]
zb1 = zb.pivot_table(
    values=['a1', 'f1'],
    index=['slang', 'tlang', 'exp'],
    columns=['representation'],
)
zb1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,a1,a1,f1,f1
Unnamed: 0_level_1,Unnamed: 1_level_1,representation,onehot,phonetic,onehot,phonetic
slang,tlang,exp,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
bn,hi,3_3_use_src,0.121821,0.135877,0.688952,0.695598
bn,hi,3_3_use_tgt,0.124498,0.134538,0.694996,0.696605
bn,hi,3_sup_mono,0.222892,0.21419,0.771576,0.773675
hi,kn,3_2_use_src,0.28794,0.29336,0.788099,0.795534
hi,kn,3_3_use_tgt,0.288618,0.264905,0.788334,0.772454
hi,kn,3_sup_mono,0.325881,0.356369,0.84518,0.850064
ta,kn,3_3_use_src,0.155091,0.188806,0.706159,0.718785
ta,kn,3_3_use_tgt,0.172623,0.188806,0.721672,0.722339
ta,kn,3_sup_mono,0.227916,0.227242,0.766578,0.76992
