In [2]:
import numpy as np
import pandas as pd
import os

#### Load data

In [3]:
data_path = "/workspace/lang-detect/txt/"
dir_list = os.listdir(data_path)
print(dir_list)

['sl', 'es', 'el', 'nl', 'hu', 'it', 'bg', 'sk', 'da', 'sv', 'cs', 'lt', 'de', 'en', 'pl', 'fr', 'fi', 'lv', 'pt', 'et', 'ro']


In [4]:
def read_data(filepath):
    with open(filepath, "r") as f:
        lines = f.readlines()
        if len(lines) > 1:
            return lines[1].strip("\n")
        return None

In [5]:
%%time
data, labels = [], []
for dir_name in dir_list:
    files_list = os.listdir(data_path + dir_name)
    for f in files_list:
        sent = read_data(data_path + dir_name + "/" + f)
        if sent:
            data.append(sent)
            labels.append(dir_name)

CPU times: user 4.9 s, sys: 5.6 s, total: 10.5 s
Wall time: 41.9 s


In [6]:
print("Length of data", len(data))

('Length of data', 186458)


In [7]:
# Check data sample
import random
rand_indices = random.sample(range(len(data)),  20)
for i in rand_indices:
    print(data[i],labels[i])

('Hodina ot\xc3\xa1zok (ot\xc3\xa1zky pre Radu)', 'sk')
('9. Beschikbaarstelling van middelen uit het Europees Fonds voor aanpassing aan de globalisering: Lear/Spanje (', 'nl')
('EU-Turkey relations (debate) ', 'en')
('2. Struktury zarz\xc4\x85dzania europejskimi programami radionawigacji satelitarnej (', 'pl')
('Onderzoek geloofsbrieven: zie notulen', 'nl')
('22. Aprobaci\xc3\xb3n de la gesti\xc3\xb3n 2006: Centro Europeo para la Prevenci\xc3\xb3n y el Control de las Enfermedades (', 'es')
('Beziehungen EU/Tunesien (Aussprache)', 'de')
('Chiusura della seduta', 'it')
('7. Az \xc3\xa9lelmiszerek \xc3\xa9s a takarm\xc3\xa1nyok radioakt\xc3\xadv szennyezetts\xc3\xa9g\xc3\xa9nek legmagasabb megengedhet\xc5\x91 hat\xc3\xa1r\xc3\xa9rt\xc3\xa9ke (kodifik\xc3\xa1lt v\xc3\xa1ltozat) (szavaz\xc3\xa1s) ', 'hu')
('1. Az Eur\xc3\xb3pai Globaliz\xc3\xa1ci\xc3\xb3s Alkalmazkod\xc3\xa1si Alap ig\xc3\xa9nybev\xc3\xa9tele: \xc3\x8drorsz\xc3\xa1g - SR Technics (', 'hu')
('Tekster til aftaler sendt af R\

#### Preprocess the data

In [8]:
import utils

test_text = "12. Aftale mellem EF\" 'og' Japans !2### ( 100.9 lin 1233 ??? p\xc3\xb6yt\xc3\xa4kirjan hyv\xc3\xa4ksyminen:"
utils.preprocess(test_text)

'aftale mellem EF og japans lin p\xc3\xb6yt\xc3\xa4kirjan hyv\xc3\xa4ksyminen '

In [9]:
%%time
for i in range(len(data)):
    data[i] = utils.preprocess(data[i])

CPU times: user 1.3 s, sys: 79.2 ms, total: 1.38 s
Wall time: 1.32 s


In [10]:
#test
print(data[:5])

['predlog splo\xc5\xa1nega prora\xc4\x8duna za leto oddelek III ', '\xc5\xbdenske in vodenje podjetij ', 'razmere na bli\xc5\xbenjem vzhodugaza glasovanje ', 'predlo\xc5\xbeitev dokumentov glej zapisnik ', 'javna ponudba vrednostnih papirjev in uskladitev zahtev v zvezi s preglednostjo razprava ']


#### Create dictionary for each language

In [11]:
from collections import defaultdict

lang_dict = defaultdict(lambda: defaultdict(int))

for i in range(len(data)):
    for token in data[i].split():
        if (labels[i] in lang_dict) and (token in lang_dict[labels[i]]):
            lang_dict[labels[i]][token] += 1
        else:
            lang_dict[labels[i]][token] = 0

In [12]:
lang_dict['en']['the']

5324

In [13]:
# Vocabulary size in each language
for key in lang_dict.keys():
    print(key, len(lang_dict[key].keys()))

('el', 6713)
('fr', 4738)
('bg', 5449)
('nl', 5100)
('ro', 5111)
('pt', 4241)
('lv', 6186)
('sv', 5645)
('de', 5617)
('it', 4819)
('hu', 6841)
('sk', 6480)
('et', 6733)
('lt', 6431)
('en', 4035)
('pl', 6485)
('sl', 6391)
('cs', 6002)
('fi', 7015)
('da', 5240)
('es', 4273)


#### Save the data

In [30]:
import json

with open('lang_dict.json', 'wb') as outfile:
    json.dump(lang_dict, outfile)

#### **Create n-gram train set**

In [17]:
def create_n_gram(text, n):
    new_string = ""
    for token in text.split():
        if len(token) <= n:
            new_string += (token + " ")
        else:
            for i in range(len(token)-n+1):
                new_string += (token[i:i+n] + " ")
    return new_string.strip()

test_text = "ab abc abcd abcde abcdef"
print(create_n_gram(test_text, 3))
print(create_n_gram(test_text, 4))

ab abc abc bcd abc bcd cde abc bcd cde def
ab abc abcd abcd bcde abcd bcde cdef


In [43]:
new_data = [0]*len(data)
for i in range(len(data)):
    new_data[i] = create_n_gram(data[i], 3)

In [51]:
lang_dict_n_gram = {}

for i in range(len(new_data)):
    for token in new_data[i].split():
        if (labels[i] in lang_dict_n_gram) and (token in lang_dict_n_gram[labels[i]]):
            lang_dict_n_gram[labels[i]][token] += 1
        else:
            if not (labels[i] in lang_dict_n_gram):
                lang_dict_n_gram[labels[i]] = {}
            lang_dict_n_gram[labels[i]][token] = 0

In [53]:
print(len(lang_dict_n_gram['en'].keys()))
print(lang_dict_n_gram['en']['the'])

4100
5480


In [54]:
# Vocabulary size in each language
for key in lang_dict_n_gram.keys():
    print(key, len(lang_dict_n_gram[key].keys()))

('el', 3414)
('fr', 4259)
('bg', 2642)
('nl', 4985)
('ro', 3925)
('pt', 4106)
('lv', 4793)
('sv', 5078)
('de', 5177)
('it', 3582)
('hu', 6017)
('sk', 5293)
('et', 4465)
('lt', 4747)
('en', 4100)
('pl', 5038)
('sl', 4418)
('cs', 5474)
('fi', 4551)
('da', 4965)
('es', 3888)


In [62]:
import pickle
with open('lang_dict_3_gram.json', 'w') as f:
    pickle.dump(lang_dict_n_gram, f)

In [63]:
with open('lang_dict_3_gram.json', 'r') as f:
    ld = pickle.load(f)

In [64]:
print(len(ld['en'].keys()))
print(ld['en']['the'])

4100
5480
