## Create Pickle files

The rest of the code in this repository works from pickle files that hold all the information extracted from the `txt` directory. This notebook creates those pickle files by parsing the `txt` directory.

In [2]:
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
from nltk.tokenize import RegexpTokenizer
import os
import nltk
import random
import time
import string
import pickle
import numpy

In [4]:
# Utility functions
# These functions were taken from attempt 1 and put together for
# easier maintenance and testing.

def print_elapsed_time(start):
    """Return the time elapsed since ``start`` formatted as ``H:MM:SS``.

    Despite its name this function prints nothing; the caller receives the
    formatted string.

    start -- timestamp in seconds, as returned by ``time.time()``.
    """
    total_seconds = time.time() - start
    minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    # %d truncates the float parts, so fractions of a second are dropped.
    return "%d:%02d:%02d" % (hours, minutes, seconds)
    
#Remove any <tags> within text
def extract_text_only(text):
    """Return the textual content of ``text`` with markup tags stripped.

    The input is parsed by BeautifulSoup using the "lxml" parser and the
    text of the parsed document is returned.
    """
    return BeautifulSoup(text, "lxml").get_text()

def tokenize_removepuncuation(text):
    """Split ``text`` into a list of word tokens, discarding punctuation.

    Tokens are the runs of regex word characters found in ``text``;
    everything between those runs is dropped.
    """
    word_pattern = r'\w+'
    return RegexpTokenizer(word_pattern).tokenize(text)

def most_common_words_letter(most_common_words, most_common_letters):
    """Merge per-language "most common" lists into two feature sets.

    Both arguments map a key (one per language) to a sequence of entries
    whose first element is the word or letter itself, e.g. ``(item, count)``
    pairs. Only that first element is kept.

    Returns a ``(word_features, letter_features)`` tuple of sets holding
    every word / letter that appears for any key.
    """
    word_features = {
        entry[0]
        for entries in most_common_words.values()
        for entry in entries
    }
    letter_features = {
        entry[0]
        for entries in most_common_letters.values()
        for entry in entries
    }
    return word_features, letter_features

#Takes two Counter objects, removes common elements
def remove_common_elements(counter1, counter2):
    """Delete from both Counters every key they have in common.

    The two Counters are modified in place and also returned as a
    ``(counter1, counter2)`` tuple. A key counts as shared when it is
    present in the Counter intersection ``counter1 & counter2``.
    """
    shared = counter1 & counter2
    # `shared` is a separate Counter, so deleting while iterating is safe.
    for key in shared:
        del counter1[key]
        del counter2[key]
    return counter1, counter2
    

In [5]:
#Feature extraction and creation functions

#Loop through directory and extract text
#directory is the path to directory to process
def get_text_from_directory(directory, encoding=None):
    """Read every file in ``directory`` and collect its tokens and counts.

    Each file is stripped of markup with ``extract_text_only`` and split
    into word tokens with ``tokenize_removepuncuation``. A file that cannot
    be read or parsed is reported on stdout and skipped.

    Parameters
    ----------
    directory : str
        Path of the folder to process. Its last path component is used as
        the language label attached to every document.
    encoding : str, optional
        Text encoding passed to ``open``. The default (``None``) keeps the
        platform's default encoding.

    Returns
    -------
    documents : list
        One ``(tokens, language_label)`` tuple per file that was read
        successfully, in sorted filename order.
    word_counter : collections.Counter
        Number of occurrences of each token over all files.
    alphabet_counter : collections.Counter
        Number of occurrences of each character found in the tokens.
    """
    # normpath drops a trailing separator, so "txt/bg/" is labelled "bg"
    # rather than the empty string.
    language_label = os.path.basename(os.path.normpath(directory))
    documents = []
    word_counter = Counter()
    alphabet_counter = Counter()

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # document list (and therefore the pickles) reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        try:
            # The context manager closes the handle even when read() fails.
            with open(file_path, "r", encoding=encoding) as text_file:
                raw_text = text_file.read()
            tokenized_text = tokenize_removepuncuation(extract_text_only(raw_text))
        except Exception as error:
            # Best effort: skip unreadable entries (sub-directories, decoding
            # errors, ...) but say why. KeyboardInterrupt and SystemExit are
            # deliberately not caught, so a long run can still be aborted.
            print(directory+" - Issue with filename:"+filename+" Ignoring. ("+repr(error)+")")
            continue

        word_counter.update(tokenized_text)
        # Updating a Counter with a string counts its individual characters.
        for token in tokenized_text:
            alphabet_counter.update(token)
        documents.append((tokenized_text, language_label))

    return documents, word_counter, alphabet_counter

def extract_data_from_corpora_save_pickles(corpora_directory, pickle_directory="pickles"):
    """Process every language folder of a corpus and pickle the results.

    Each sub-directory of ``corpora_directory`` is handed to
    ``get_text_from_directory``. Three pickle files are then written per
    sub-directory, where ``<name>`` is the sub-directory name:

    * ``word_counter_<name>.pickle`` - Counter of all tokens seen
    * ``alphabet_<name>.pickle``     - Counter of the characters seen, with
      ASCII letters and digits removed
    * ``documents_<name>.pickle``    - list of ``(tokens, label)`` tuples

    Parameters
    ----------
    corpora_directory : str
        Folder whose sub-directories each hold the documents of one language.
    pickle_directory : str, optional
        Folder the pickle files are written to; it is created if missing.
        Defaults to ``"pickles"``.
    """
    os.makedirs(pickle_directory, exist_ok=True)
    # Built once instead of on every loop iteration.
    ascii_alphanumerics = set(string.ascii_letters + string.digits)

    # Sorted so that the processing order (and the log) is deterministic.
    for directory in sorted(os.listdir(corpora_directory)):
        # os.path.join works with or without a trailing separator on
        # corpora_directory.
        full_path = os.path.join(corpora_directory, directory)
        if not os.path.isdir(full_path):
            continue

        print("About to process directory "+directory)
        # text holds rows like (['worda1', 'worda2', 'worda3'], 'LANG-A');
        # word_counter holds the count of every word seen.
        text, word_counter, alphabet = get_text_from_directory(full_path)
        print("Number of words for this language:"+str(len(word_counter)))

        # Keep only characters that are not plain ASCII letters or digits.
        # Iterate over a snapshot of the keys because entries are deleted.
        for letter in list(alphabet):
            if letter in ascii_alphanumerics:
                del alphabet[letter]

        # Save to pickle so the data can be reloaded without re-processing.
        _save_pickle(word_counter, os.path.join(pickle_directory, "word_counter_"+directory+".pickle"))
        _save_pickle(alphabet, os.path.join(pickle_directory, "alphabet_"+directory+".pickle"))
        _save_pickle(text, os.path.join(pickle_directory, "documents_"+directory+".pickle"))


def _save_pickle(obj, path):
    """Pickle ``obj`` to ``path``, closing the file even if dumping fails."""
    with open(path, "wb") as pickle_out:
        pickle.dump(obj, pickle_out)


In [6]:
# -----READ CORPORA AND GENERATE PICKLE FILES SYSTEM------
# Root folder of the corpora; each sub-directory is processed as one language.
corpora_directory = "txt/"
# Wall-clock start, used for the elapsed-time report printed at the end.
start = time.time()
# NOTE(review): number_of_words, number_of_letters and save_pickles are not
# read by any code visible in this part of the notebook - confirm whether a
# later cell uses them before removing.
number_of_words  = 10 
number_of_letters = 20

save_pickles = True

# Parses every language folder and writes the pickle files.
extract_data_from_corpora_save_pickles(corpora_directory)

# print_elapsed_time returns the formatted H:MM:SS string; it does not print.
print("Elapsed time directory creating pickles:"+print_elapsed_time(start))


About to process directory bg
Number of words for this language:107850
About to process directory cs
Number of words for this language:190273
About to process directory da
Number of words for this language:350588
About to process directory de
Number of words for this language:359228
About to process directory el
Number of words for this language:235146
About to process directory en
Number of words for this language:101850
About to process directory es
Number of words for this language:173104
About to process directory et
Number of words for this language:315598
About to process directory fi
Number of words for this language:735982
About to process directory fr
Number of words for this language:131694
About to process directory hu
Number of words for this language:309740
About to process directory it
Number of words for this language:178009
About to process directory lt
Number of words for this language:257164
About to process directory lv
Number of words for this language:173923
About 