In [1]:
import json
import pathlib

from fastai.text import *

import numpy as np
import pandas as pd

# Data Preparation

In [2]:
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag

PATH = pathlib.Path("lm/telugu/data/")

In [7]:
LM_PATH=Path('lm/telugu/telugu_lm/')
LM_PATH.mkdir(exist_ok=True)

In [4]:
LANG_FILENAMES = [str(f) for f in PATH.rglob("*/*")]
print(len(LANG_FILENAMES))
LANG_FILENAMES[0:5]

471


['lm/telugu/data/AD/wiki_98',
 'lm/telugu/data/AD/wiki_83',
 'lm/telugu/data/AD/wiki_52',
 'lm/telugu/data/AD/wiki_82',
 'lm/telugu/data/AD/wiki_21']

In [5]:
LANG_TEXT = []
for i in LANG_FILENAMES:
    for line in open(i):
        LANG_TEXT.append(json.loads(line))
        
LANG_TEXT = pd.DataFrame(LANG_TEXT)

In [8]:
LANG_TEXT.to_csv(f"{LM_PATH}/Wiki_Telugu_Corpus.csv", index=False)

In [10]:
LANG_TEXT = pd.read_csv("Wiki_Telugu_Corpus.csv")

In [11]:
(LANG_TEXT.assign(labels = 0)
    .pipe(lambda x: x[['labels', 'text']])
    .to_csv(f"{LM_PATH}/Wiki_Telugu_Corpus2.csv", header=None, index=False))

# Some statistics of Telugu Wikipedia

In [12]:
# Getting rid of the title name in the text field
def split_title_from_text(text):
    words = text.split("\n\n")
    if len(words) >= 2:
        return ''.join(words[1:])
    else:
        return ''.join(words)
    
LANG_TEXT['text'] = LANG_TEXT['text'].apply(lambda x: split_title_from_text(x))

### Number of documents

In [13]:
LANG_TEXT.shape

(69001, 4)

### Number of words in all the documents

In [14]:
LANG_TEXT['text'].apply(lambda x: len(x.split(" "))).sum()

22174830

### Number of unique tokens across documents

In [15]:
len(set(''.join(LANG_TEXT['text'].values).split(" ")))

2023536

In [16]:
def get_texts(df, n_lbls=1):
    labels = df.iloc[:,range(n_lbls)].values.astype(np.int64)
    texts = f'\n{BOS} {FLD} 1 ' + df[n_lbls].astype(str)
    for i in range(n_lbls+1, len(df.columns)): texts += f' {FLD} {i-n_lbls} ' + df[i].astype(str)
    #texts = texts.apply(fixup).values.astype(str)

    tok = Tokenizer().proc_all_mp(partition_by_cores(texts)) # splits the list into sublists for processing by each core
    # Lower and upper case is inside the tokenizer
    return tok, list(labels)

def get_all(df, n_lbls):
    tok, labels = [], []
    for i, r in enumerate(df):
        print(i)
        #pdb.set_trace()
        tok_, labels_ = get_texts(r, n_lbls)
        tok += tok_;
        labels += labels_
    return tok, labels

In [17]:
LANG_TEXT = pd.read_csv(f"{LM_PATH}/Wiki_Telugu_Corpus2.csv", header=None)#, chunksize=5000)

In [18]:
trn_texts,val_texts = sklearn.model_selection.train_test_split(
    LANG_TEXT, test_size=0.1) # split the data into train and validation sets

In [19]:
np.random.seed(42)
trn_idx = np.random.permutation(len(trn_texts)) # generate a random ordering
val_idx = np.random.permutation(len(val_texts))

df_trn = trn_texts.iloc[trn_idx,:] # sort things randomly
df_val = val_texts.iloc[val_idx,:] # sort things randomly

df_trn.columns = ['labels', 'text']
df_val.columns = ['labels', 'text']

df_trn.to_csv(LM_PATH/'train.csv', header=False, index=False)
df_val.to_csv(LM_PATH/'test.csv', header=False, index=False) # saving the data in our new format to disk

In [20]:
chunksize = 10000
df_trn = pd.read_csv(LM_PATH/'train.csv', header=None, chunksize=chunksize)
df_val = pd.read_csv(LM_PATH/'test.csv', header=None, chunksize=chunksize)

In [21]:
tok_trn, trn_labels = get_all(df_trn, 1)
tok_val, val_labels = get_all(df_val, 1)

0
1
2
3
4
5
6
0


In [22]:
# create a tmp directory to store the upcoming numpy arrays
(LM_PATH/'tmp').mkdir(exist_ok=True)

# save the train and validation tokens in the tmp directories
np.save(LM_PATH/'tmp'/'tok_trn.npy', tok_trn)
np.save(LM_PATH/'tmp'/'tok_val.npy', tok_val)

In [23]:
tok_trn = np.load(LM_PATH/'tmp'/'tok_trn.npy')
tok_val = np.load(LM_PATH/'tmp'/'tok_val.npy')

In [24]:
# Identify the most common tokens and numericalizing the text
freq = Counter(p for o in tok_trn for p in o) 
freq.most_common(25)

[(',', 1407089),
 ('\n', 671359),
 ('\n\n', 434391),
 ('నుండి', 374401),
 ('ఉన్నాయి.', 307043),
 ('దూరంలో', 253309),
 ('గ్రామం', 253138),
 ('ఉంది.', 252431),
 ('10', 220890),
 ('గ్రామంలో', 190353),
 ('"', 180130),
 ('ఈ', 171605),
 ('మరియు', 168879),
 ('(', 155718),
 ('కి.మీ.', 155143),
 (')', 152823),
 ('5', 146692),
 ('కేంద్రం', 137280),
 ('సమీప', 134781),
 ('.', 120096),
 ('ఒక', 104031),
 ('సౌకర్యం', 94277),
 ('ద్వారా', 89490),
 ('కూడా', 88957),
 ('పైబడిన', 85834)]

In [25]:
# Truncating our vocab to ignore the rare words
max_vocab = 60000
min_freq = 5

itos = [o for o,c in freq.most_common(max_vocab) if c>min_freq] # getting rid of the rare words
itos.insert(0, '_pad_') # 
itos.insert(0, '_unk_') # itos is the list of all the strings in the vocab

In [26]:
# creating a index-key dictionary for our vocabulary
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
len(itos)

60002

In [27]:
# creating a index representation for our train and validation dataset
trn_lm = np.array([[stoi[o] for o in p] for p in tok_trn])
val_lm = np.array([[stoi[o] for o in p] for p in tok_val])

In [28]:
# saving our indexed representation of our dataset to disk
# we also save the index-word mapping to retrieve the complete text representation from these numpy arrays
np.save(LM_PATH/'tmp'/'trn_ids.npy', trn_lm)
np.save(LM_PATH/'tmp'/'val_ids.npy', val_lm)
pickle.dump(itos, open(LM_PATH/'tmp'/'itos.pkl', 'wb'))

In [29]:
# Loading the indexed representation of our dataset from disk
# we also load the index-word mapping to to help us convert the indexes to word datasets, if need be.
trn_lm = np.load(LM_PATH/'tmp'/'trn_ids.npy')
val_lm = np.load(LM_PATH/'tmp'/'val_ids.npy')
itos = pickle.load(open(LM_PATH/'tmp'/'itos.pkl', 'rb'))

In [30]:
# checking vocabulary size
vs=len(itos)
vs,len(trn_lm)

(60002, 62100)

# Model Setup

In [32]:
# ! wget -nH -r -np http://files.fast.ai/models/wt103/
# mv models/ {LM_PATH}

In [35]:
em_sz,nh,nl = 400,1150,3

PRE_PATH = LM_PATH/'models'/'wt103'
PRE_LM_PATH = PRE_PATH/'fwd_wt103.h5'

itos2 = pickle.load((PRE_PATH/'itos_wt103.pkl').open('rb')) # mapping the itos from wiki to our own mapping
stoi2 = collections.defaultdict(lambda:-1, {v:k for k,v in enumerate(itos2)})

In [None]:
# we train from scratch so these are unused
# wgts = torch.load(PRE_LM_PATH, map_location=lambda storage, loc: storage)

# enc_wgts = to_np(wgts['0.encoder.weight'])
# row_m = enc_wgts.mean(0)

# wgts['0.encoder.weight'] = T(new_w)
# wgts['0.encoder_with_dropout.embed.weight'] = T(np.copy(new_w))
# wgts['1.decoder.weight'] = T(np.copy(new_w))

# Language Model

In [38]:
wd=1e-7
bptt=70
bs=52
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))

In [39]:
trn_dl = LanguageModelLoader(np.concatenate(trn_lm), bs, bptt)
val_dl = LanguageModelLoader(np.concatenate(val_lm), bs, bptt)
md = LanguageModelData(PATH, 1, vs, trn_dl, val_dl, bs=bs, bptt=bptt)

In [40]:
drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15])*0.7 # if you're overfitting, increase this. Underfitting? decrease this.

In [41]:
learner= md.get_model(opt_fn, em_sz, nh, nl, 
    dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4])

learner.metrics = [accuracy]
learner.unfreeze()

In [42]:
lr=1e-3
lrs = lr

In [None]:
learner.fit(lrs/2, 1, wds=wd, use_clr=(32,2), cycle_len=1) # last layer is the embedding weights

 99%|█████████▊| 6653/6740 [16:09<00:12,  6.87it/s, loss=3.67]

In [46]:
learner.save('lm_telugu_fromscratch')

In [45]:
print(1)

1


In [None]:
learner.load('lm_telugu_fromscratch')

In [None]:
# learner.lr_find(start_lr=lrs/10, end_lr=lrs*10, linear=True)

In [None]:
# learner.sched.plot()

In [47]:
learner.fit(lrs, 1, wds=wd, use_clr=(20,10), cycle_len=1)

 86%|████████▌ | 5773/6740 [13:19<02:13,  7.22it/s, loss=3.39]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [48]:
learner.fit(lrs, 1, wds=wd, use_clr=(20,10), cycle_len=15)

epoch      trn_loss   val_loss   accuracy                     
    0      3.064934   3.230878   0.550889  
    1      2.937351   3.15471    0.551992                     
    2      2.927289   3.08986    0.556666                     
    3      2.867486   3.055522   0.559318                     
    4      2.82416    3.033953   0.560748                     
    5      2.810316   3.015779   0.562172                     
    6      2.765557   2.998677   0.563972                     
    7      2.745653   2.986154   0.565125                     
    8      2.808281   2.972621   0.566626                     
    9      2.733467   2.963909   0.566832                     
    10     2.705382   2.952384   0.56811                      
    11     2.718853   2.939511   0.569547                     
    12     2.675635   2.938468   0.569217                     
    13     2.671267   2.928721   0.570154                     
    14     2.692976   2.91787    0.571475                     



[2.9178704515847462, 0.5714752661445426]

In [49]:
learner.save('lm_telugu_fromscratch_2')