In [1]:
import numpy as np
import pandas as pd
import os

#### Load data

In [2]:
data_path = "/workspace/lang-detect/txt/"
dir_list = os.listdir(data_path)
print(dir_list)

['sl', 'es', 'el', 'nl', 'hu', 'it', 'bg', 'sk', 'da', 'sv', 'cs', 'lt', 'de', 'en', 'pl', 'fr', 'fi', 'lv', 'pt', 'et', 'ro']


In [3]:
def read_data(filepath):
    with open(filepath, "r") as f:
        lines = f.readlines()
        if len(lines) > 1:
            return lines[1].strip("\n")
        return None

In [4]:
%%time
data, labels = [], []
for dir_name in dir_list:
    files_list = os.listdir(data_path + dir_name)
    for f in files_list:
        sent = read_data(data_path + dir_name + "/" + f)
        if sent:
            data.append(sent)
            labels.append(dir_name)

CPU times: user 2.98 s, sys: 1.62 s, total: 4.6 s
Wall time: 4.63 s


In [5]:
print("Length of data", len(data))

('Length of data', 186458)


In [6]:
# Check data sample
import random
rand_indices = random.sample(range(len(data)),  20)
for i in rand_indices:
    print(data[i],labels[i])

('Zahteva za za\xc5\xa1\xc4\x8dito poslanske imunitete: glej zapisnik', 'sl')
('Ordine del giorno della prossima seduta: vedi verbale', 'it')
('EU external strategy on Passenger Name Record (PNR) (debate) ', 'en')
('Contenedores perdidos en el mar y compensaci\xc3\xb3n (debate) ', 'es')
('5. Przyst\xc4\x85pienie Bu\xc5\x82garii i Rumunii do umowy o wsp\xc3\xb3\xc5\x82pracy i unii celnej mi\xc4\x99dzy EWG i Republik\xc4\x85 San Marino (g\xc5\x82osowanie)', 'pl')
('Stemmeforklaringer', 'da')
('20. Accordo di partenariato CE/Repubblica di Kiribati nel settore della pesca (votazione) ', 'it')
('15. Raming van de inkomsten en uitgaven voor het begrotingsjaar 2011 - Afdeling I - Europees Parlement (', 'nl')
('13. Aplicaci\xc3\xb3n del principio de reconocimiento mutuo de resoluciones en materia penal (', 'es')
('6. Fishing opportunities and financial contribution provided for in the EU-S\xc3\xa3o Tom\xc3\xa9 and Pr\xc3\xadncipe Fisheries Partnership Agreement (', 'en')
('10. A m\xc3\xa9lyten

#### Preprocess the data

In [7]:
import re
def preprocess(text):
    # remove punctuations
    text = re.sub("[()/.:?,!@#$%\"']", "", text)
    new_string = ""
    for token in text.split():
        # check if no alphabet character
        if re.search("[a-zA-Z]",token):
            # if not in all-caps, make it lower case
            if not token.isupper():
                token = token.lower()
            new_string += (token + " ")
    return new_string

test_text = "12. Aftale mellem EF\" 'og' Japans !2### ( 100.9 lin 1233 ??? p\xc3\xb6yt\xc3\xa4kirjan hyv\xc3\xa4ksyminen:"
preprocess(test_text)

'aftale mellem EF og japans lin p\xc3\xb6yt\xc3\xa4kirjan hyv\xc3\xa4ksyminen '

In [8]:
%%time
for i in range(len(data)):
    data[i] = preprocess(data[i])

CPU times: user 2.04 s, sys: 19.9 ms, total: 2.06 s
Wall time: 2.04 s


In [9]:
#test
print(data[:5])

['predlog splo\xc5\xa1nega prora\xc4\x8duna za leto oddelek III ', '\xc5\xbdenske in vodenje podjetij ', 'razmere na bli\xc5\xbenjem vzhodugaza glasovanje ', 'predlo\xc5\xbeitev dokumentov glej zapisnik ', 'javna ponudba vrednostnih papirjev in uskladitev zahtev v zvezi s preglednostjo razprava ']


#### Create dictionary for each language

In [10]:
from collections import defaultdict

lang_dict = defaultdict(lambda: defaultdict(int))

for i in range(len(data)):
    for token in data[i].split():
        if (labels[i] in lang_dict) and (token in lang_dict[labels[i]]):
            lang_dict[labels[i]][token] += 1
        else:
            lang_dict[labels[i]][token] = 0

In [11]:
lang_dict['en']['the']

5324

In [12]:
# Vocabulary size in each language
for key in lang_dict.keys():
    print(key, len(lang_dict[key].keys()))

('el', 709)
('fr', 4734)
('bg', 463)
('nl', 5100)
('ro', 5109)
('pt', 4177)
('lv', 6182)
('sv', 5643)
('de', 5617)
('it', 4807)
('hu', 6836)
('sk', 6478)
('et', 6730)
('lt', 6428)
('en', 4035)
('pl', 6481)
('sl', 6390)
('cs', 6000)
('fi', 7015)
('da', 5240)
('es', 4256)


#### Save the data

In [13]:
import json

with open('lang_dict.json', 'w') as outfile:
    json.dump(lang_dict, outfile)