    We are building a language-detection program using neural networks, trained on data from the Europarl corpus: http://www.statmt.org/europarl/

In [3]:
import os #To manage access to directories from python
import string
import unicodedata
#from nltk.corpus import words
from nltk import ngrams
import pickle
import time
import pandas as pd
import nltk
import operator

In [82]:
# Root folder passed to load_separate(), which looks below it for folders named after each language.
# NOTE(review): absolute, machine-specific path; adjust it before running elsewhere.
rootdir="C:\\Users\\YsfDS\\Documents\\AI Project\\Data"

We first read the data from the per-language directories, then build a dictionary whose keys are the languages and whose values are lists of lists: one inner list per file, each element of which is a line of that file.

In [1]:
def aquire_data(files):
    """Read every file in `files` and return their lines.

    Each path is opened as latin-1 text. The result holds one inner list per
    input path, in input order, containing that file's lines exactly as read
    (line endings included).
    """
    contents = []
    for path in files:
        with open(path, encoding="latin-1") as handle:
            contents.append(handle.readlines())
    return contents

In [84]:
def load_separate(directory):
    """Collect the raw lines of every file found in the known language folders.

    Walks `directory` recursively. Whenever a visited folder is named
    'portuguese', 'french', 'spanish' or 'italian', the files directly inside
    it are read with aquire_data() and stored under that name. Folders with
    any other name contribute nothing.

    Returns a dict mapping language name -> list of files, each file being a
    list of its lines.
    """
    known_languages = ('portuguese', 'french', 'spanish', 'italian')
    language_dict = {}
    # os.walk yields (current folder, sub-directory names, file names).
    for folder, _subdirs, filenames in os.walk(directory):
        language = os.path.split(folder)[1]
        if language not in known_languages:
            continue
        paths = [os.path.join(folder, name) for name in filenames]
        language_dict[language] = aquire_data(paths)
    return language_dict
        

Here we clean the sentences extracted from the files. The process consists of:

1- Remove punctuation.

2- Unicode allows multiple character sequences to represent the same string. For example, the string "capital A with two dots" can be represented as a single character "\u00C4", or as the two characters "A\u0308". The Unicode standard defines multiple ways to "normalize" a Unicode string so different ways of representing a given string map to the same "canonical form". Normalizing Unicode strings is necessary in order to consistently compare or sort strings in languages with accented characters. 

3- Tokenize, convert to lower case, and eliminate numeric and alphanumeric tokens.

In [102]:
# Translation table that deletes every ASCII punctuation character. Built once,
# so clean_line does not rebuild a punctuation set for each character it scans.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_line(line):
    """Normalise one raw line of text into lower-case ASCII words.

    Steps, in order:
      1. delete ASCII punctuation characters;
      2. NFD-decompose the text and drop every non-ASCII code point, so that
         accented letters are reduced to their base letter;
      3. split on whitespace, lower-case, and keep only purely alphabetic
         tokens (tokens containing digits are discarded).

    Returns the surviving words joined by single spaces ('' if none survive).
    """
    line = line.translate(_PUNCTUATION_TABLE)
    # encode(..., 'ignore') silently drops whatever is not ASCII after decomposition.
    ascii_bytes = unicodedata.normalize('NFD', line).encode('ascii', 'ignore')
    line = ascii_bytes.decode('ascii')
    lowered = (word.lower() for word in line.split())
    return ' '.join(word for word in lowered if word.isalpha())

In [None]:
# Read every language folder under rootdir, then checkpoint the raw result to
# the file 'initial_dict' (path is relative to the current working directory).
initial_dict=load_separate(rootdir)
with open ('initial_dict','wb') as pickle_out:
    pickle.dump(initial_dict,pickle_out)

Cleaning lines of files by language:

In [106]:
# Clean every line of every French file in place and time the whole pass.
fr_files=initial_dict['french']
start=time.time()
for file_lines in fr_files:
    # Slice assignment keeps the same inner list object, so initial_dict sees the cleaned lines too.
    file_lines[:]=[clean_line(raw_line) for raw_line in file_lines]
end=time.time()
print(end-start)

778.0771102905273


In [114]:
# Clean every line of every Spanish file in place and time the whole pass.
sp_files=initial_dict['spanish']
start=time.time()
for file_lines in sp_files:
    # Slice assignment keeps the same inner list object, so initial_dict sees the cleaned lines too.
    file_lines[:]=[clean_line(raw_line) for raw_line in file_lines]
end=time.time()
print(end-start)

858.3679540157318


In [116]:
# Clean every line of every Italian file in place and time the whole pass.
it_files=initial_dict['italian']
start=time.time()
for file_lines in it_files:
    # Slice assignment keeps the same inner list object, so initial_dict sees the cleaned lines too.
    file_lines[:]=[clean_line(raw_line) for raw_line in file_lines]
end=time.time()
print(end-start)

829.4154944419861


In [118]:
# Clean every line of every Portuguese file in place and time the whole pass.
pt_files=initial_dict['portuguese']
start=time.time()
for file_lines in pt_files:
    # Slice assignment keeps the same inner list object, so initial_dict sees the cleaned lines too.
    file_lines[:]=[clean_line(raw_line) for raw_line in file_lines]
end=time.time()
print(end-start)

823.8137321472168


In [121]:
# Regroup the cleaned per-language file lists under their language names.
final_dict={
    'french': fr_files,
    'spanish': sp_files,
    'italian': it_files,
    'portuguese': pt_files,
}

Saving cleaned dictionary.

In [122]:
# Checkpoint the cleaned corpus to the file 'final_dict' (relative to the working directory).
with open ('final_dict','wb') as pickle_out2:
    pickle.dump(final_dict,pickle_out2)

    Now we have a dictionary whose keys are the languages and whose values are lists of files; each file is itself a list containing its cleaned lines.
    
    Now let's prepare our features: we will use n-grams (trigrams, to be exact).

In [2]:
# Reload the cleaned corpus from the 'final_dict' checkpoint and time the load.
# NOTE(review): pickle.load can execute arbitrary code; only load checkpoints you wrote yourself.
start=time.time()
with open('final_dict','rb') as pickle_in:
    final_dict = pickle.load(pickle_in)
end=time.time()
print(end-start)

17.74961566925049


In [3]:
# Only the French files are bound here; the lines for the other three
# languages are left commented out.
fr_files=final_dict['french']
#sp_files=final_dict['spanish']
#it_files=final_dict['italian']
#pt_files=final_dict['portuguese']

In [None]:
del(final_dict) #Unbind the name; whatever is not referenced elsewhere (fr_files still is) becomes collectable

In [1]:
def generate_ngram(term,n):
    """Return nltk's n-gram iterable over `term` for window size `n`.

    Thin wrapper around nltk.ngrams. When `term` is a string its items are
    single characters, so the result yields tuples of `n` characters.
    """
    return ngrams(term, n)

In [28]:
# Demo: character trigrams of a sample sentence, materialised into a list.
h=generate_ngram('Hello everyone! I am been trigrammed',3)
g=list(h)

In [31]:
# Count how often each trigram of the demo sentence occurs and print every (trigram, count) pair.
for k,v in nltk.FreqDist(g).items():
    print(k,v)

('H', 'e', 'l') 1
('e', 'l', 'l') 1
('l', 'l', 'o') 1
('l', 'o', ' ') 1
('o', ' ', 'e') 1
(' ', 'e', 'v') 1
('e', 'v', 'e') 1
('v', 'e', 'r') 1
('e', 'r', 'y') 1
('r', 'y', 'o') 1
('y', 'o', 'n') 1
('o', 'n', 'e') 1
('n', 'e', '!') 1
('e', '!', ' ') 1
('!', ' ', 'I') 1
(' ', 'I', ' ') 1
('I', ' ', 'a') 1
(' ', 'a', 'm') 1
('a', 'm', ' ') 1
('m', ' ', 'b') 1
(' ', 'b', 'e') 1
('b', 'e', 'e') 1
('e', 'e', 'n') 1
('e', 'n', ' ') 1
('n', ' ', 't') 1
(' ', 't', 'r') 1
('t', 'r', 'i') 1
('r', 'i', 'g') 1
('i', 'g', 'r') 1
('g', 'r', 'a') 1
('r', 'a', 'm') 1
('a', 'm', 'm') 1
('m', 'm', 'e') 1
('m', 'e', 'd') 1


In [6]:
def language_ngrams(files):
    """Build the trigram iterable of every entry in `files`.

    Returns a dict keyed by the entry's position (0, 1, 2, ...) whose values
    are whatever generate_ngram(entry, 3) returns for that entry.
    """
    return {position: generate_ngram(text, 3) for position, text in enumerate(files)}

In [9]:
# Collapse each file's list of cleaned lines into one space-separated string.
nfr_files=[' '.join(f) for f in fr_files] 

Now every file in nfr_files is a single plain, cleaned text, and all of them sit in one list, ready for language_ngrams.
language_ngrams calls generate_ngram, which takes a text and creates a generator object (computationally much cheaper than building everything eagerly); turning that object into a list gives the n-grams as 3-element tuples.

In [10]:
del(fr_files) #nfr_files replaces fr_files from here on, so release the per-line lists

In the following cell I create the trigrams for all the files in the French dataset.

In [12]:
# Build one trigram iterable per French file (dict keyed by file position) and time it.
# Presumably these are lazy generators, as the notebook's notes say, so little work happens here.
start=time.time()
r=language_ngrams(nfr_files)
end=time.time()
print(end-start)

0.10933971405029297


Turning the generator objects we talked about earlier into lists.

In [13]:
# Materialise every per-file trigram iterable into a list, in file order, and time it.
start=time.time()
l=[list(r[position]) for position in range(len(r))]
end=time.time()
print(end-start)

4236.878187656403


In [14]:
del(r) #l now holds the materialised trigrams, so the dict of iterables is no longer needed

If everything has gone well up to here and, like me, you only have 8 GB of RAM :( , you'll notice that almost half of it is taken by l. If you try to pickle it in one go, you'll consistently hit a memory error, or, even worse, wait two hours and then hit it anyway xD. As an alternative, I created a folder in the project directory and pickled l in small portions of 10 entries each. Don't forget to close the files you open for pickling, or use the 'with' statement, which closes them automatically. So now we have 945 pickle files, each holding the trigrams of 10 source files.

In [46]:
# Pickle the trigram lists in batches of 10 files: batch i is written to
# ...\trigrams\fr_trigrams<i+1>.
# NOTE(review): the batch count 945 is hard-coded; slices past the end of l are written as empty lists.
start=time.time()
for i in range (0,945):
    fr_trigrams=l[i*10:(i+1)*10]
    file_name="C:\\Users\\YsfDS\\Documents\\AI Project\\trigrams\\"+'fr_trigrams'+str(i+1)
    with open (file_name,'wb') as pickle_out31:
        # Dump the batch that was just sliced. The previous code dumped `fr_trigrams1`,
        # a name this cell never assigns (NameError, or stale kernel state in every file).
        pickle.dump(fr_trigrams,pickle_out31)
end=time.time()
print(end-start)

2605.944283723831


Now I'm loading those pickled files and applying the frequency_dataframe() function to them directly. That function is defined further below, so run its cell first. I tried many approaches to counting the trigram frequencies, and nltk.FreqDist is by far the fastest I found, so you may want to use it, unless you are okay with waiting 34 hours, which is what the best hand-optimized algorithm I could come up with needed.

In [9]:
# Load each of the 945 trigram batches and convert it straight into frequency
# distributions, so final[i] is the frequency_dataframe() result for batch i+1.
# frequency_dataframe() is defined in a later cell and must already exist in the kernel.
start=time.time()
final=[]
for i in range (0,945):
    file_name="C:\\Users\\YsfDS\\Documents\\AI Project\\trigrams\\"+'fr_trigrams'+str(i+1)
    with open(file_name,'rb') as pickle_in:
        final.append(frequency_dataframe(pickle.load(pickle_in)))
    # The with-block has already closed the file; this only unbinds the handle's name.
    del pickle_in
end=time.time()
print(end-start)

4858.647314071655


Okay, 4858 seconds: fair enough, I expected it to take longer. pickle_in is deleted after each load so that the garbage collector can reclaim the space it holds.
Because I'm paranoid, I'll also pickle final to disk right now, before this crashes and I lose everything yet again.

In [50]:
# Checkpoint final: entry i is written to ...\fr_frequency\fr_frequency<i+1>.
start=time.time()
for i in range (0,945):
    file_name="C:\\Users\\YsfDS\\Documents\\AI Project\\fr_frequency\\"+'fr_frequency'+str(i+1)
    with open (file_name,'wb') as pickle_out32:
        pickle.dump(final[i],pickle_out32)
end=time.time()
print(end-start)

34.31194090843201


In [1]:
def frequency_dataframe(trigram_file):
    """Return one nltk.FreqDist per element of `trigram_file`, in order.

    Despite the name, the result is a plain list of frequency distributions,
    not a pandas DataFrame.
    """
    return [nltk.FreqDist(trigrams) for trigrams in trigram_file]

In [22]:
# Rebuild final from the 945 fr_frequency checkpoints instead of recomputing it.
start=time.time()
final=[]
for i in range (0,945):
    file_name="C:\\Users\\YsfDS\\Documents\\AI Project\\fr_frequency\\"+'fr_frequency'+str(i+1)
    with open (file_name,'rb') as pickle_in332:
        final.append(pickle.load(pickle_in332))
    # The with-block has already closed the file; this only unbinds the handle's name.
    del(pickle_in332)
end=time.time()
print(end-start)

313.90294790267944


In [11]:
def extract_max_trigrams(freq_dictt, threshold=300):
    """Return the entries of `freq_dictt` whose count reaches `threshold`.

    Parameters:
        freq_dictt: mapping of trigram -> occurrence count.
        threshold: minimum count (inclusive) an entry needs to be kept.
            Defaults to 300, the cut-off this notebook uses.

    Returns:
        A list of (trigram, count) tuples sorted by ascending count. Entries
        with equal counts keep the mapping's iteration order.
    """
    frequent = [(key, count) for key, count in freq_dictt.items() if count >= threshold]
    return sorted(frequent, key=operator.itemgetter(1))

To be frank, we are only interested in trigrams that recur often in the dataset, which means that they are very representative of the language. So I chose a threshold of 300 occurrences; in the literature, 300 is the commonly chosen cut-off. We can still modify it later on if our model overfits; that is the only risk we are taking right now.

In [12]:
# For every per-file frequency distribution in the 945 batches, keep only the
# (trigram, count) pairs returned by extract_max_trigrams(); one result per file.
mn=[]
start=time.time()
for batch_index in range(945):
    mn.extend(extract_max_trigrams(dictio) for dictio in final[batch_index])
end=time.time()
print(end-start)

8.093190908432007


In [13]:
# NOTE(review): later cells that build df_dict read `final` again, so it has to be reloaded before they run.
del(final) #Release the frequency distributions

In [14]:
# mnn is a shallow copy of mn: a new outer list holding the same per-file lists.
start=time.time()
mnn=list(mn)
end=time.time()
print(end-start)

0.015572309494018555


Now we create a list that contains, for each file, every trigram that reached the 300-occurrence threshold in that file (duplicates across files included).

In [15]:
# Flatten mnn into a single list holding just the trigram of every (trigram, count) pair.
start=time.time()
list_trigrams=[pair[0] for per_file in mnn for pair in per_file]
end=time.time()
print(end-start)

1.4530220031738281


Now let's remove the duplicates!

In [16]:
# A set keeps one copy of each trigram; print how many there are, then the trigrams themselves.
fr_features=set(list_trigrams)
print(len(fr_features))
print(fr_features)

512
{(' ', 'v', 'i'), ('s', ' ', 's'), ('n', 't', 'i'), ('r', 's', ' '), ('i', 'd', 'e'), ('s', ' ', 'o'), (' ', 'd', 'u'), (' ', 's', 'e'), ('d', 'e', 'm'), ('s', 'i', 'd'), ('o', 't', 'r'), ('u', 'e', 'l'), ('p', 'e', 'e'), ('r', 'a', 't'), ('p', 'o', 'l'), ('d', 'u', ' '), ('t', 'i', 'e'), ('s', 'u', 'r'), ('s', ' ', 'c'), (' ', 'i', 'n'), ('n', 'i', 'o'), ('r', 'e', 'c'), ('f', 'o', 'r'), ('t', 'r', 'o'), ('s', 'e', 's'), ('t', 't', 'e'), ('a', 't', 'i'), ('o', 'i', 's'), ('s', 'e', 'm'), ('l', 'a', 't'), (' ', 'd', ' '), ('l', 'u', 's'), ('u', 'r', 's'), ('s', 'o', 'c'), ('t', ' ', 'q'), ('e', ' ', 'j'), ('a', 'n', 't'), ('i', 't', ' '), ('r', 'a', 'i'), ('u', 'e', ' '), (' ', 'n', 'a'), ('i', 'l', 'l'), ('s', ' ', 'e'), ('l', ' ', 'a'), ('a', ' ', 'r'), ('u', 't', 'i'), ('a', 'g', 'e'), ('r', 't', ' '), ('g', 'r', 'a'), (' ', 'u', 'n'), ('n', ' ', 'c'), (' ', 'j', 'u'), ('e', 'a', 'u'), ('m', 'i', 's'), ('p', 'r', 'o'), ('n', 'i', 'e'), ('e', ' ', 't'), ('a', 'i', 'r'), ('n', 'a'

In [17]:
del(mn)
del(mnn) #These intermediates are no longer needed once fr_features has been built

Saving the features vector for French language.

In [18]:
# Persist the French feature set to ...\features\fr_features.
file_name="C:\\Users\\YsfDS\\Documents\\AI Project\\features\\fr_features"
with open (file_name,'wb') as pickle_out32:
    pickle.dump(fr_features,pickle_out32)
del(pickle_out32) #The with-block already closed the file; this only removes the name

In [23]:
# Reload the French feature set from its pickle.
# NOTE: the handle is named pickle_out32 although the file is opened for reading.
file_name="C:\\Users\\YsfDS\\Documents\\AI Project\\features\\fr_features"
with open (file_name,'rb') as pickle_out32:
    fr_features=pickle.load(pickle_out32)
del(pickle_out32)

In [24]:
# Load the Spanish feature set from its pickle.
# NOTE(review): the cell that writes sp_features is not in this part of the notebook;
# presumably it was produced the same way as fr_features. Confirm before relying on it.
file_name="C:\\Users\\YsfDS\\Documents\\AI Project\\features\\sp_features"
with open (file_name,'rb') as pickle_out32:
    sp_features=pickle.load(pickle_out32)
del(pickle_out32)

In [25]:
# Load the Italian feature set from its pickle.
# NOTE(review): the cell that writes it_features is not in this part of the notebook;
# presumably it was produced the same way as fr_features. Confirm before relying on it.
file_name="C:\\Users\\YsfDS\\Documents\\AI Project\\features\\it_features"
with open (file_name,'rb') as pickle_out32:
    it_features=pickle.load(pickle_out32)
del(pickle_out32)

In [26]:
# Load the Portuguese feature set from its pickle.
# NOTE(review): the cell that writes pt_features is not in this part of the notebook;
# presumably it was produced the same way as fr_features. Confirm before relying on it.
file_name="C:\\Users\\YsfDS\\Documents\\AI Project\\features\\pt_features"
with open (file_name,'rb') as pickle_out32:
    pt_features=pickle.load(pickle_out32)
del(pickle_out32)

In [27]:
# Count the Italian feature trigrams that are also Spanish features.
b=sum(1 for trigram in it_features if trigram in sp_features)
print(b)

161


In [28]:
# French-only features: trigrams found in none of the Italian, Portuguese or Spanish feature sets.
ffr_features=[
    trigram for trigram in fr_features
    if not (trigram in it_features or trigram in pt_features or trigram in sp_features)
]
c=len(ffr_features)
print(c)

286


In [29]:
# Portuguese-only features: trigrams found in none of the Italian, French or Spanish feature sets.
fpt_features=[
    trigram for trigram in pt_features
    if not (trigram in it_features or trigram in fr_features or trigram in sp_features)
]
d=len(fpt_features)
print(d)

57


In [30]:
# Spanish-only features: trigrams found in none of the Italian, Portuguese or French feature sets.
fsp_features=[
    trigram for trigram in sp_features
    if not (trigram in it_features or trigram in pt_features or trigram in fr_features)
]
e=len(fsp_features)
print(e)

51


In [31]:
# Italian-only features: trigrams found in none of the French, Portuguese or Spanish feature sets.
fit_features=[
    trigram for trigram in it_features
    if not (trigram in fr_features or trigram in pt_features or trigram in sp_features)
]
f=len(fit_features)
print(f)

70


In this phase we represent each text file as a vector over the trigrams we extracted earlier. Each of the following cells builds, column by column, the relative frequency of each trigram in every file of the French folder.

In [32]:
# Scratch check: a dict value of 1 is truthy, so the message is printed.
dif={'g':1,'h':0}
if dif['g']:
    print('haaaaaaaa')

haaaaaaaa


In [33]:
# Start df_dict and add one column per Spanish-only trigram: the trigram's
# relative frequency (count / total count) in each file's distribution in final.
df_dict={}
start=time.time()
# A file's total does not depend on the trigram being looked up, so compute it
# once per file instead of once per (trigram, file) pair.
doc_totals=[(freq, sum(freq.values())) for batch in final for freq in batch]
for trigram in fsp_features:
    # A zero count yields 0 directly; a non-zero count implies a non-zero total.
    df_dict[trigram]=[freq[trigram]/total if freq[trigram] else 0 for freq, total in doc_totals]
end=time.time()
print(end-start)

278.469407081604


In [34]:
# Add to df_dict one column per Italian-only trigram: the trigram's relative
# frequency (count / total count) in each file's distribution in final.
start=time.time()
# A file's total does not depend on the trigram being looked up, so compute it
# once per file instead of once per (trigram, file) pair.
doc_totals=[(freq, sum(freq.values())) for batch in final for freq in batch]
for trigram in fit_features:
    # A zero count yields 0 directly; a non-zero count implies a non-zero total.
    df_dict[trigram]=[freq[trigram]/total if freq[trigram] else 0 for freq, total in doc_totals]
end=time.time()
print(end-start)

149.2558102607727


In [35]:
# Add to df_dict one column per Portuguese-only trigram: the trigram's relative
# frequency (count / total count) in each file's distribution in final.
start=time.time()
# A file's total does not depend on the trigram being looked up, so compute it
# once per file instead of once per (trigram, file) pair.
doc_totals=[(freq, sum(freq.values())) for batch in final for freq in batch]
for trigram in fpt_features:
    # A zero count yields 0 directly; a non-zero count implies a non-zero total.
    df_dict[trigram]=[freq[trigram]/total if freq[trigram] else 0 for freq, total in doc_totals]
end=time.time()
print(end-start)

54.30870342254639


In [36]:
# Add to df_dict one column per French-only trigram: the trigram's relative
# frequency (count / total count) in each file's distribution in final.
start=time.time()
# A file's total does not depend on the trigram being looked up, so compute it
# once per file instead of once per (trigram, file) pair.
doc_totals=[(freq, sum(freq.values())) for batch in final for freq in batch]
for trigram in ffr_features:
    # Guard the division: a file with no trigrams has total 0, which previously raised ZeroDivisionError.
    df_dict[trigram]=[freq[trigram]/total if total else 0.0 for freq, total in doc_totals]
end=time.time()
print(end-start)

340.3534960746765


In [37]:
# All df_dict columns have been built, so the frequency distributions can be released.
del(final)

In [38]:
# Concatenate the four per-language feature sets and print how many distinct trigrams they contain.
features=list(fr_features)+list(sp_features)+list(it_features)+list(pt_features)
print(len(set(features)))

789


In [39]:
# Build the French DataFrame from df_dict: each trigram key becomes a column holding its per-file values.
fr_dataFrame=pd.DataFrame.from_dict(df_dict)

In [40]:
# Display the frame as the cell's output.
fr_dataFrame

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,...,v,v,v,v,v,v,y,z,z,z
Unnamed: 0_level_1,Unnamed: 1_level_1,a,a,a,a,a,a,a,b,c,...,e,i,o,o,o,o,s,a,i,z
Unnamed: 0_level_2,s,g,i,m,p,s,u,v,i,e,...,s,c,i,n,r,u,Unnamed: 18_level_2,Unnamed: 19_level_2,o,a
0,0.000555,0.000174,0.000445,0.000361,0.000555,0.000258,0.002536,0.001065,0.000394,0.004161,...,0.000200,0.000110,0.000548,0.000413,0.000065,0.001245,0.000329,0.000000,0.000000,0
1,0.000466,0.000119,0.000694,0.000203,0.000576,0.000278,0.002923,0.001081,0.000374,0.004044,...,0.000283,0.000166,0.000586,0.000583,0.000063,0.000699,0.000188,0.000008,0.000008,0
2,0.000459,0.000155,0.000367,0.000227,0.000405,0.000279,0.002875,0.001370,0.000425,0.004164,...,0.000191,0.000074,0.000639,0.000585,0.000054,0.000961,0.000317,0.000011,0.000007,0
3,0.000509,0.000236,0.000584,0.000322,0.000618,0.000285,0.003268,0.001219,0.000308,0.003346,...,0.000285,0.000092,0.000667,0.000451,0.000034,0.000497,0.000425,0.000000,0.000000,0
4,0.000583,0.000051,0.000406,0.000203,0.000291,0.000330,0.002801,0.001077,0.000330,0.003941,...,0.000241,0.000139,0.000596,0.000406,0.000076,0.000520,0.000266,0.000000,0.000000,0
5,0.000404,0.000083,0.000312,0.000309,0.000574,0.000219,0.003119,0.001346,0.000411,0.004588,...,0.000225,0.000241,0.000790,0.000710,0.000046,0.000892,0.000306,0.000000,0.000000,0
6,0.000546,0.000077,0.000340,0.000604,0.000430,0.000250,0.003294,0.000950,0.000257,0.004161,...,0.000289,0.000116,0.001188,0.000668,0.000083,0.000443,0.000289,0.000000,0.000000,0
7,0.000459,0.000290,0.000492,0.000230,0.000547,0.000334,0.002751,0.001132,0.000399,0.003725,...,0.000514,0.000088,0.000717,0.000388,0.000131,0.000706,0.000208,0.000000,0.000000,0
8,0.000496,0.000229,0.000386,0.000371,0.000502,0.000292,0.002774,0.001232,0.000363,0.004026,...,0.000257,0.000082,0.000564,0.000714,0.000060,0.001088,0.000296,0.000002,0.000002,0
9,0.000420,0.000220,0.000474,0.000217,0.000505,0.000330,0.003088,0.001294,0.000449,0.004168,...,0.000241,0.000120,0.000701,0.000510,0.000049,0.000621,0.000694,0.000000,0.000000,0


In [42]:
# Persist the French DataFrame to ...\DataFrames\fr_df.
file_name="C:\\Users\\YsfDS\\Documents\\AI Project\\DataFrames\\fr_df"
with open (file_name,'wb') as pickle_out32:
    pickle.dump(fr_dataFrame,pickle_out32)
del(pickle_out32) #The with-block already closed the file; this only removes the name