### Language Detection

In [1]:
from pathlib import Path
import os
import pandas as pd
import numpy as np

### Checking Data

In [2]:
DATA = Path("/data/wili")
os.listdir(DATA)

['x_train.txt',
 'labels.csv',
 'y_train.txt',
 'x_test.txt',
 'y_test.txt',
 'README.txt',
 'vocab.json',
 'urls.txt']

In [3]:
pd.read_csv(DATA/"labels.csv")

Unnamed: 0,Label;English;Wiki Code;ISO 369-3;German;Language family;Writing system;Remarks;Synonyms
0,ace;Achinese;ace;ace;Achinesisch;Austronesian;;;
1,afr;Afrikaans;af;afr;Afrikaans;Indo-European;;;
2,als;Alemannic German;als;gsw;Alemannisch;Indo-...
3,amh;Amharic;am;amh;Amharisch;Afro-Asiatic;;;
4,ang;Old English ;ang;ang;Altenglisch;Indo-Euro...
5,ara;Arabic;ar;ara;Arabisch;Afro-Asiatic;;;
6,arg;Aragonese;an;arg;Aragonesisch;Indo-Europea...
7,arz;Egyptian Arabic;arz;arz;Ägyptisch-Arabisch...
8,asm;Assamese;as;asm;Assamesisch;Indo-European;;;
9,ast;Asturian;ast;ast;Asturisch;Indo-European;;;


In [4]:
train_txt = open(DATA/"x_train.txt").read().split("\n")[:-2]
test_txt = open(DATA/"x_test.txt").read().split("\n")[:-2]

In [5]:
train_txt[:20],len(train_txt)

(['Klement Gottwaldi surnukeha palsameeriti ning paigutati mausoleumi. Surnukeha oli aga liiga hilja ja oskamatult palsameeritud ning hakkas ilmutama lagunemise tundemärke. 1962. aastal viidi ta surnukeha mausoleumist ära ja kremeeriti. Zlíni linn kandis aastatel 1949–1989 nime Gottwaldov. Ukrainas Harkivi oblastis kandis Zmiivi linn aastatel 1976–1990 nime Gotvald.',
  'Sebes, Joseph; Pereira Thomas (1961) (på eng). The Jesuits and the Sino-Russian treaty of Nerchinsk (1689): the diary of Thomas Pereira. Bibliotheca Instituti historici S. I., 99-0105377-3 ; 18. Rome. Libris 677492',
  'भारतीय स्वातन्त्र्य आन्दोलन राष्ट्रीय एवम क्षेत्रीय आह्वान, उत्तेजनासभ एवम प्रयत्नसँ प्रेरित, भारतीय राजनैतिक सङ्गठनद्वारा सञ्चालित अहिंसावादी आ सैन्यवादी आन्दोलन छल, जेकर एक समान उद्देश्य, अङ्ग्रेजी शासनक भारतीय उपमहाद्वीपसँ जडीसँ उखाड फेकनाई छल। ई आन्दोलनक शुरुआत १८५७ मे भेल सिपाही विद्रोहक मानल जाइत अछि। स्वाधीनताक लेल हजारो लोग अपन प्राणक बलि देलक। भारतीय राष्ट्रीय कांग्रेस १९३० कांग्रेस अधिवेशन मे 

In [6]:
train_lbl = open(DATA/"y_train.txt").read().split("\n")[:-2]
test_lbl = open(DATA/"y_test.txt").read().split("\n")[:-2]

In [7]:
train_lbl[:30],len(train_lbl)

(['est',
  'swe',
  'mai',
  'oci',
  'tha',
  'orm',
  'lim',
  'guj',
  'pnb',
  'zea',
  'krc',
  'hat',
  'pcd',
  'tam',
  'vie',
  'pan',
  'szl',
  'ckb',
  'fur',
  'wuu',
  'arz',
  'ton',
  'eus',
  'map-bms',
  'glk',
  'map-bms',
  'nld',
  'arz',
  'bod',
  'jpn'],
 117499)

In [8]:
chars = list(set(open(DATA/"x_train.txt").read()))

In [9]:
print(chars[:30], len(chars))

['殮', '侃', '폭', '뭇', '沸', '鷟', '戟', 'ì', '擇', '耶', '୯', '፸', '力', 'ⵣ', 'ᑉ', '🇸', '绝', '豬', '谁', '変', '맬', '隰', 'Ꮰ', 'ず', '腐', '恃', 'ম', 'ਈ', 'ू', '은'] 10807


In [10]:
ord('頸',)

38968

In [11]:
from forgebox.ftorch.prepro import Seq_Dataset,Arr_Dataset,fuse,test_DS
import torch

loading configs from /etc/forgebox.cfg


In [12]:
BS = 64
CUDA = torch.cuda.is_available()
DIM = 50

In [13]:
x_seq = Seq_Dataset("nlp",train_txt,seq_len=100,bs=BS,
                    vocab_path="/data/wili/vocab.json", 
                    vocab_size=10807,sep_tok="",build_vocab=False, fixlen=True)
x_seq_test = Seq_Dataset("nlp_test",test_txt,seq_len=100,bs=BS,
                    vocab_path="/data/wili/vocab.json", 
                    vocab_size=10807,sep_tok="",build_vocab=False, fixlen=True)

nlp sequence type: <class 'list'>
nlp sequence total_length type: 117499
nlp_test sequence type: <class 'list'>
nlp_test sequence total_length type: 117499


In [14]:
dt = test_DS(x_seq)
dt()

tensor([[[112,  11,   4,  ...,   2,  37,   3],
         [ 66,   4,  23,  ...,  10,  15,   2],
         [248,  47,  63,  ...,  84, 124,  97],
         ...,
         [ 40,  60,   2,  ...,  62,   2, 646],
         [400, 205, 753,  ..., 753, 272, 839],
         [204,   2,  13,  ...,  69,   6,  13]]])

In [15]:
langs = list(set(train_lbl))

In [16]:
idx2lan = dict((k,v) for k,v in enumerate(langs))
lan2idx = dict((v,k) for k,v in enumerate(langs))

train_y = np.vectorize(lambda x:lan2idx[x])(train_lbl)
test_y = np.vectorize(lambda x:lan2idx[x])(test_lbl)
train_y

In [18]:
train_y_ds = Arr_Dataset(train_y,bs=BS)
test_y_ds = Arr_Dataset(test_y,bs=BS)

train_ds = fuse(x_seq,train_y_ds)
test_ds_ = fuse(x_seq_test,test_y_ds)

test_DS(train_ds)()

[tensor([[[112,  11,   4,  ...,   2,  37,   3],
          [ 66,   4,  23,  ...,  10,  15,   2],
          [248,  47,  63,  ...,  84, 124,  97],
          ...,
          [ 40,  60,   2,  ...,  62,   2, 646],
          [400, 205, 753,  ..., 753, 272, 839],
          [204,   2,  13,  ...,  69,   6,  13]]]),
 [tensor([[ 31,  49, 136,  51, 134, 122, 225, 187,   8, 210,  12, 186, 127,  23,
            41, 203, 105, 170,  62, 183, 202, 121,  29, 232, 219, 232, 145, 202,
            83,  78,  69,  10, 204, 231,  30,  13,  33, 224, 119, 201, 148,  10,
            12,   0, 160, 188,  78, 106,  83, 124, 168, 139,  80,  25, 147, 122,
           224, 103,  63, 108,  90, 132, 141, 197]])]]

In [19]:
VOCAB_SIZE=x_seq.vocab_size
print(VOCAB_SIZE)

10807


### Model

In [20]:
from torch import nn
from torch.optim import Adam

In [21]:
len(langs)

235

Simple Gated Reccurent Unit (GRU)

In [22]:
class ld(nn.Module):
    def __init__(self, hs):
        super().__init__()
        self.emb = nn.Embedding(10807,hs)
        self.rnn = nn.GRU(input_size=hs, hidden_size=hs,batch_first = True, num_layers = 1)
        self.mlp = nn.Sequential(*[
            nn.Linear(hs,hs,bias=False),
            nn.BatchNorm1d(hs),
            nn.ReLU(),
            nn.Linear(hs,236),
            nn.Softmax(dim = -1),
                                  ])
    def forward(self,x):
        x = self.emb(x)
        x,h_n = self.rnn(x)
        x = x[:,-1,:]
        x = self.mlp(x)
        return x

In [23]:
from forgebox.ftorch.train import Trainer

md = ld(DIM)
if CUDA: md.cuda()
opt = Adam(md.parameters())
lossf = nn.CrossEntropyLoss()

In [24]:
t = Trainer(train_ds,val_dataset=test_ds_,batch_size=1,shuffle=True, print_on=5)

### A training step

Predict y_, metrics: loss, accuracy

In [25]:
@t.step_train
def action(*args, **kwargs):
    opt.zero_grad()
    x,y = args[0]
    x=x[0]
    y = y[0][0]
    if CUDA:
        x = x.cuda()
        y = y.cuda()
    y_ = md(x)   
    loss = lossf(y_,y)
    
    loss.backward()
    opt.step()
    acc = (torch.max(y_, dim=-1)[1]==y).float().mean()
    
    return {"loss":loss.item(),"acc":acc.item()}

@t.step_val
def action(*args, **kwargs):
    x,y = args[0]
    x=x[0]
    y = y[0][0]
    if CUDA:
        x = x.cuda()
        y = y.cuda()
    y_ = md(x)   
    loss = lossf(y_,y)
    
    acc = (torch.max(y_, dim=-1)[1]==y).float().mean()
    
    return {"loss":loss.item(),"acc":acc.item()}

In [None]:
t.train(5)

HBox(children=(IntProgress(value=0, max=1836), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1836), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1836), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1836), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1836), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1836), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1836), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1836), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1836), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1836), HTML(value='')))