In [1]:
import numpy as np
import pandas as pd
import os

#### Load data

In [2]:
data_path = "/workspace/lang-detect/txt/"
dir_list = os.listdir(data_path)
print(dir_list)

['sl', 'es', 'el', 'nl', 'hu', 'it', 'bg', 'sk', 'da', 'sv', 'cs', 'lt', 'de', 'en', 'pl', 'fr', 'fi', 'lv', 'pt', 'et', 'ro']


In [3]:
def read_data(filepath):
    with open(filepath, "r") as f:
        lines = f.readlines()
        if len(lines) > 1:
            return lines[1].strip("\n")
        return None

In [4]:
%%time
data, labels = [], []
for dir_name in dir_list:
    files_list = os.listdir(data_path + dir_name)
    for f in files_list:
        sent = read_data(data_path + dir_name + "/" + f)
        if sent:
            data.append(sent)
            labels.append(dir_name)

CPU times: user 6.53 s, sys: 7.54 s, total: 14.1 s
Wall time: 45.1 s


In [5]:
print("Length of data", len(data))

('Length of data', 186458)


In [6]:
# Check data sample
import random
rand_indices = random.sample(range(len(data)),  20)
for i in rand_indices:
    print(data[i],labels[i])

('3. Burma (afstemning) ', 'da')
('5. Dispositifs de remorquage et de marche arri\xc3\xa8re des tracteurs agricoles ou forestiers \xc3\xa0 roues (Version codifi\xc3\xa9e) (codified version) (vote) ', 'fr')
('4. B\xc4\x9blorusko (hlasov\xc3\xa1n\xc3\xad) ', 'cs')
('5. Elintarvikkeiden hintojen nousu Euroopan unionissa ja kehitysmaissa (\xc3\xa4\xc3\xa4nestys) ', 'fi')
('8. Az energiapiac integrit\xc3\xa1sa \xc3\xa9s \xc3\xa1tl\xc3\xa1that\xc3\xb3s\xc3\xa1ga (', 'hu')
('The impact of cohesion policy on the integration of vulnerable communities and groups (debate) ', 'en')
('15. Posilnenie chemickej, biologickej, r\xc3\xa1diologickej a jadrovej bezpe\xc4\x8dnosti v Eur\xc3\xb3pskej \xc3\xbanii - ak\xc4\x8dn\xc3\xbd pl\xc3\xa1n E\xc3\x9a v oblasti CBRN bezpe\xc4\x8dnosti (', 'sk')
('Egyperces felsz\xc3\xb3lal\xc3\xa1sok jelent\xc5\x91s politikai k\xc3\xa9rd\xc3\xa9sekben', 'hu')
('7. Projecto de or\xc3\xa7amento rectificativo n.\xc2\xba 4/2010: Sec\xc3\xa7\xc3\xa3o III - Comiss\xc3\xa3o (E

#### Preprocess the data

In [7]:
import utils

test_text = "12. Aftale mellem EF\" 'og' Japans !2### ( 100.9 lin 1233 ??? p\xc3\xb6yt\xc3\xa4kirjan hyv\xc3\xa4ksyminen:"
utils.preprocess(test_text)

'aftale mellem EF og japans lin p\xc3\xb6yt\xc3\xa4kirjan hyv\xc3\xa4ksyminen '

In [8]:
%%time
for i in range(len(data)):
    data[i] = utils.preprocess(data[i])

CPU times: user 1.26 s, sys: 94.5 ms, total: 1.36 s
Wall time: 1.28 s


In [9]:
#test
print(data[:5])

['predlog splo\xc5\xa1nega prora\xc4\x8duna za leto oddelek III ', '\xc5\xbdenske in vodenje podjetij ', 'razmere na bli\xc5\xbenjem vzhodugaza glasovanje ', 'predlo\xc5\xbeitev dokumentov glej zapisnik ', 'javna ponudba vrednostnih papirjev in uskladitev zahtev v zvezi s preglednostjo razprava ']


#### Create dictionary for each language

In [10]:
from collections import defaultdict

lang_dict = defaultdict(lambda: defaultdict(int))

for i in range(len(data)):
    for token in data[i].split():
        if (labels[i] in lang_dict) and (token in lang_dict[labels[i]]):
            lang_dict[labels[i]][token] += 1
        else:
            lang_dict[labels[i]][token] = 0

In [11]:
lang_dict['en']['the']

5324

In [12]:
# Vocabulary size in each language
for key in lang_dict.keys():
    print(key, len(lang_dict[key].keys()))

('el', 6713)
('fr', 4738)
('bg', 5449)
('nl', 5100)
('ro', 5111)
('pt', 4241)
('lv', 6186)
('sv', 5645)
('de', 5617)
('it', 4819)
('hu', 6841)
('sk', 6480)
('et', 6733)
('lt', 6431)
('en', 4035)
('pl', 6485)
('sl', 6391)
('cs', 6002)
('fi', 7015)
('da', 5240)
('es', 4273)


#### Save the data

In [14]:
import json

with open('data/lang_dict.json', 'wb') as outfile:
    json.dump(lang_dict, outfile)

#### **Create n-gram train set**

In [15]:
import utils

test_text = "ab abc abcd abcde abcdef"
print(utils.create_n_gram(test_text, 3))
print(utils.create_n_gram(test_text, 4))

ab abc abc bcd abc bcd cde abc bcd cde def
ab abc abcd abcd bcde abcd bcde cdef


In [16]:
new_data = [0]*len(data)
for i in range(len(data)):
    new_data[i] = utils.create_n_gram(data[i], 4)

In [17]:
lang_dict_n_gram = {}

for i in range(len(new_data)):
    for token in new_data[i].split():
        if (labels[i] in lang_dict_n_gram) and (token in lang_dict_n_gram[labels[i]]):
            lang_dict_n_gram[labels[i]][token] += 1
        else:
            if not (labels[i] in lang_dict_n_gram):
                lang_dict_n_gram[labels[i]] = {}
            lang_dict_n_gram[labels[i]][token] = 0

In [18]:
print(len(lang_dict_n_gram['en'].keys()))
print(lang_dict_n_gram['en']['the'])

8192
5324


In [19]:
# Vocabulary size in each language
for key in lang_dict_n_gram.keys():
    print(key, len(lang_dict_n_gram[key].keys()))

('el', 5224)
('fr', 8727)
('bg', 3993)
('nl', 11132)
('ro', 8310)
('pt', 8414)
('lv', 10040)
('sv', 11468)
('de', 11788)
('it', 7959)
('hu', 12743)
('sk', 11041)
('et', 11157)
('lt', 10750)
('en', 8192)
('pl', 10860)
('sl', 9759)
('cs', 11008)
('fi', 11643)
('da', 11128)
('es', 8069)


In [20]:
import pickle
with open('data/lang_dict_4_gram.json', 'w') as f:
    pickle.dump(lang_dict_n_gram, f)

In [21]:
with open('data/lang_dict_4_gram.json', 'r') as f:
    ld = pickle.load(f)

In [22]:
print(len(ld['en'].keys()))
print(ld['en']['the'])

8192
5324


#### **Create combined n-gram and full word train set**

In [23]:
test_text = "ab abc abcd abcde abcdef"
print(utils.create_full_word_and_n_gram(test_text, 3))
print(utils.create_full_word_and_n_gram(test_text, 4))

ab abc abcd abc bcd abcde abc bcd cde abcdef abc bcd cde def
ab abc abcd abcde abcd bcde abcdef abcd bcde cdef


In [24]:
new_data = [0]*len(data)
for i in range(len(data)):
    new_data[i] = utils.create_full_word_and_n_gram(data[i], 4)

lang_dict_n_gram = {}

for i in range(len(new_data)):
    for token in new_data[i].split():
        if (labels[i] in lang_dict_n_gram) and (token in lang_dict_n_gram[labels[i]]):
            lang_dict_n_gram[labels[i]][token] += 1
        else:
            if not (labels[i] in lang_dict_n_gram):
                lang_dict_n_gram[labels[i]] = {}
            lang_dict_n_gram[labels[i]][token] = 0

print(len(lang_dict_n_gram['en'].keys()))
print(lang_dict_n_gram['en']['the'])

# Vocabulary size in each language
for key in lang_dict_n_gram.keys():
    print(key, len(lang_dict_n_gram[key].keys()))
    
import pickle
with open('data/lang_dict_full_word_4_gram.json', 'w') as f:
    pickle.dump(lang_dict_n_gram, f)

11703
5324
('el', 11711)
('fr', 13030)
('bg', 9304)
('nl', 15864)
('ro', 13051)
('pt', 12239)
('lv', 15893)
('sv', 16726)
('de', 17082)
('it', 12348)
('hu', 19245)
('sk', 17138)
('et', 17583)
('lt', 16888)
('en', 11703)
('pl', 16940)
('sl', 15699)
('cs', 16629)
('fi', 18424)
('da', 15995)
('es', 11965)


In [25]:
with open('data/lang_dict_full_word_4_gram.json', 'r') as f:
    ld = pickle.load(f)

print(len(ld['en'].keys()))
print(ld['en']['the'])

11703
5324


#### **Create 3-gram, 4-gram, 5-gram and full word train set.**

In [26]:
test_text = "where is the treasure hidden"
print(utils.create_full_word_and_multiple_n_gram(test_text, [3,4,5]))

where whe her ere wher here is the treasure tre rea eas asu sur ure trea reas easu asur sure treas reasu easur asure hidden hid idd dde den hidd idde dden hidde idden


In [27]:
new_data = [0]*len(data)
for i in range(len(data)):
    new_data[i] = utils.create_full_word_and_multiple_n_gram(data[i], [3,4,5])

lang_dict_n_gram = {}

for i in range(len(new_data)):
    for token in new_data[i].split():
        if (labels[i] in lang_dict_n_gram) and (token in lang_dict_n_gram[labels[i]]):
            lang_dict_n_gram[labels[i]][token] += 1
        else:
            if not (labels[i] in lang_dict_n_gram):
                lang_dict_n_gram[labels[i]] = {}
            lang_dict_n_gram[labels[i]][token] = 0

print(len(lang_dict_n_gram['en'].keys()))
print(lang_dict_n_gram['en']['the'])

# Vocabulary size in each language
for key in lang_dict_n_gram.keys():
    print(key, len(lang_dict_n_gram[key].keys()))
    
import pickle
with open('data/lang_dict_full_word_3_4_5_gram.json', 'w') as f:
    pickle.dump(lang_dict_n_gram, f)

24208
5480
('el', 24062)
('fr', 27117)
('bg', 18528)
('nl', 34970)
('ro', 26743)
('pt', 25531)
('lv', 32789)
('sv', 36119)
('de', 37694)
('it', 25305)
('hu', 41441)
('sk', 35545)
('et', 37241)
('lt', 35389)
('en', 24208)
('pl', 35068)
('sl', 31510)
('cs', 34704)
('fi', 40213)
('da', 34847)
('es', 24752)


In [28]:
with open('data/lang_dict_full_word_3_4_5_gram.json', 'r') as f:
    ld = pickle.load(f)

print(len(ld['en'].keys()))
print(ld['en']['the'])

24208
5480


#### **Create 4-5 gram and full word model**

In [29]:
new_data = [0]*len(data)
for i in range(len(data)):
    new_data[i] = utils.create_full_word_and_multiple_n_gram(data[i], [4,5])

lang_dict_n_gram = {}

for i in range(len(new_data)):
    for token in new_data[i].split():
        if (labels[i] in lang_dict_n_gram) and (token in lang_dict_n_gram[labels[i]]):
            lang_dict_n_gram[labels[i]][token] += 1
        else:
            if not (labels[i] in lang_dict_n_gram):
                lang_dict_n_gram[labels[i]] = {}
            lang_dict_n_gram[labels[i]][token] = 0

print(len(lang_dict_n_gram['en'].keys()))
print(lang_dict_n_gram['en']['the'])

# Vocabulary size in each language
for key in lang_dict_n_gram.keys():
    print(key, len(lang_dict_n_gram[key].keys()))
    
import pickle
with open('data/lang_dict_full_word_4_5_gram.json', 'w') as f:
    pickle.dump(lang_dict_n_gram, f)
    
with open('data/lang_dict_full_word_4_5_gram.json', 'r') as f:
    ld = pickle.load(f)

print(len(ld['en'].keys()))
print(ld['en']['the'])

20338
5324
('el', 20765)
('fr', 23072)
('bg', 15955)
('nl', 30176)
('ro', 23002)
('pt', 21631)
('lv', 28159)
('sv', 31235)
('de', 32683)
('it', 21933)
('hu', 35591)
('sk', 30423)
('et', 32916)
('lt', 30800)
('en', 20338)
('pl', 30223)
('sl', 27296)
('cs', 29412)
('fi', 35776)
('da', 30069)
('es', 21052)
20338
5324


#### **Create multiwords key dictionary**

In [14]:
%%time
new_data = [0]*len(data)
for i in range(len(data)):
    new_data[i] = utils.create_full_word_and_multiple_n_gram(data[i], [3,4,5])

lang_dict_n_gram = {}

for i in range(len(new_data)):
    for token in new_data[i].split():
        if (labels[i] in lang_dict_n_gram) and (token in lang_dict_n_gram[labels[i]]):
            lang_dict_n_gram[labels[i]][token] += 1
        else:
            if not (labels[i] in lang_dict_n_gram):
                lang_dict_n_gram[labels[i]] = {}
            lang_dict_n_gram[labels[i]][token] = 0

for i in range(len(new_data)):
    d = new_data[i].split()
    for j in range(len(d)-1):
        token = d[j] + '_' + d[j+1]
        if (labels[i] in lang_dict_n_gram) and (token in lang_dict_n_gram[labels[i]]):
            lang_dict_n_gram[labels[i]][token] += 1
        else:
            if not (labels[i] in lang_dict_n_gram):
                lang_dict_n_gram[labels[i]] = {}
            lang_dict_n_gram[labels[i]][token] = 0

for i in range(len(new_data)):
    d = new_data[i].split()
    for j in range(len(d)-2):
        token = d[j] + '_' + d[j+1] + '_' + d[j+2]
        if (labels[i] in lang_dict_n_gram) and (token in lang_dict_n_gram[labels[i]]):
            lang_dict_n_gram[labels[i]][token] += 1
        else:
            if not (labels[i] in lang_dict_n_gram):
                lang_dict_n_gram[labels[i]] = {}
            lang_dict_n_gram[labels[i]][token] = 0
            
for i in range(len(new_data)):
    d = new_data[i].split()
    for j in range(len(d)-3):
        token = d[j] + '_' + d[j+1] + '_' + d[j+2] + '_' + d[j+3]
        if (labels[i] in lang_dict_n_gram) and (token in lang_dict_n_gram[labels[i]]):
            lang_dict_n_gram[labels[i]][token] += 1
        else:
            if not (labels[i] in lang_dict_n_gram):
                lang_dict_n_gram[labels[i]] = {}
            lang_dict_n_gram[labels[i]][token] = 0
            
print(len(lang_dict_n_gram['en'].keys()))
print(lang_dict_n_gram['en']['the'])

# Vocabulary size in each language
for key in lang_dict_n_gram.keys():
    print(key, len(lang_dict_n_gram[key].keys()))
    
import pickle
with open('data/lang_dict_multiword_4_gram_key_model.json', 'w') as f:
    pickle.dump(lang_dict_n_gram, f)

217868
5480
('el', 307666)
('fr', 252892)
('bg', 235303)
('nl', 309236)
('ro', 256304)
('pt', 234432)
('lv', 315895)
('sv', 321174)
('de', 334457)
('it', 251587)
('hu', 378673)
('sk', 329805)
('et', 351870)
('lt', 340465)
('en', 217868)
('pl', 333695)
('sl', 303643)
('cs', 316170)
('fi', 384354)
('da', 301523)
('es', 230871)
CPU times: user 1min 23s, sys: 798 ms, total: 1min 24s
Wall time: 1min 25s
