In [1]:
import time
import pickle
import os
from nltk.tokenize import RegexpTokenizer
import string
from collections import Counter

In [2]:
def print_elapsed_time():
    end = time.time()
    elapsed = end - start
    m, s = divmod(elapsed, 60)
    h, m = divmod(m, 60)
    return ("%d:%02d:%02d" % (h,m,s))

def tokenize_removepuncuation(text):
    #words only 
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(text)

def get_most_frequent_letters(tokenized_text, number_of_letters):
    # get letters and add to alphabet
    alphabet_counter = Counter()
    [alphabet_counter.update(list(n)) for n in tokenized_text]
    most_common = alphabet_counter.most_common(number_of_letters)
    most_common_str = ""
    for l in most_common:
        most_common_str += l[0]
    most_common_str = most_common_str[:number_of_letters]
    return most_common_str

def get_most_frequent_ending_of_words(tokenized_text,number_of_characters):
    ending_counter = Counter()

    for n in tokenized_text:
        ending_counter[n[-number_of_characters:]] += 1

    most_common = ending_counter.most_common(10)
    most_common_list = []
    for l in most_common:
        most_common_list.append(l)

    return most_common_list

#this function receives a document and creates a feature based list
# Input:
#   [(['worda1','worda2,'worda3'],'LANG-A'),
#     ['wordb1','wordb2,'wordb3'],'LANG-B'),...]
# Ouput:
# [('αποφασιστικής': False,
#   'Περιφερειακής': True,
#   'pontot': False,...),'ru'),...]

def document_features_fromwords(document, word_features, number_of_common_letters):
    #normalize words
    document = [w.lower() for w in document]
    document_words = set(document)
    features = {}
    #Set most common letters as a feature
    most_common_letters = get_most_frequent_letters(document_words, number_of_common_letters)
    for n in range(2,number_of_common_letters+1):
        features['common_letters_'+str(n)] = most_common_letters[:n]
    #Set last three letters of words  also as a feature
    most_common_ending = get_most_frequent_ending_of_words(document_words, 3)
    counter = 0
    for n in most_common_ending:
        counter += 1
        features['common_ending_top_' + str(counter)] = n[0]
    # Add word that are part of common words
    for word in word_features:
        features[word] = (word in document_words)
    return features

def classify_document(document, classifier,word_features,number_of_common_letters):
    return classifier.classify(document_features_fromwords(tokenize_removepuncuation(document),word_features,number_of_common_letters))

def test_europarltest_file(eurofile, resultsfile, everyother, classifier,word_features,number_of_common_letters):
    #Read test file and classify each sentence in file
    positive_ctr = 0
    negative_ctr = 0
    total_ctr    = 0 
    #save results to file for processing
    fileout = open(resultsfile,'w')
    #columns
    fileout.write('predicted, language given, correctly classified?\n')
    
    processed_counter = 0
    with open(eurofile,'r') as f:
        for line in f:
            processed_counter +=1
            if(processed_counter%everyother==0):
                total_ctr += 1
                #language is first two letters in line    
                language = line[:2]
                #sentence is rest, clean up spaces
                sentence = line[2:].strip()
                #Detect language based on model
                language_detected = classify_document(sentence, classifier,word_features,number_of_common_letters)
                correctly_classified = language_detected==language
                #tally correct and incorrect
                if(correctly_classified):
                    #correctly classified
                    positive_ctr += 1
                else:
                    #incorrectly classified
                    negative_ctr += 1
            
                fileout.write(language_detected+','+language+','+str(correctly_classified)+'\n')
    fileout.close()
    return total_ctr, positive_ctr, negative_ctr

In [None]:
models_directory = "models"

#Loop through all models save as pickles and output results
for filename in os.listdir(models_directory):
    if("classifier" in filename):
        start = time.time()
        #filename contains number of documents, words, letters
        tokens = filename.split("_")
        number_of_documents       = tokens[1]
        upto_percentage           = tokens[2]
        number_of_common_letters  = tokens[3]
        upto_percentage_int = int(upto_percentage)
        number_of_common_letters_int = int(upto_percentage_int)

        #load classifier
        classifier = pickle.load( open( models_directory+"/"+filename, "rb" ) )
        #load word_features, letter_features
        word_features = pickle.load( open( models_directory+"/word_features"+number_of_documents+"_"+upto_percentage+"_"+number_of_common_letters, "rb" ) )
        letter_features = pickle.load( open( models_directory+"/letter_features"+number_of_documents+"_"+upto_percentage+"_"+number_of_common_letters, "rb" ) )
        print("-------------------------------")
        print("           Processing:"+filename)
        # -------------Step 6-------------
        # Classify all sentences in europarl.test and write results to resultsfile
        # This is the actual deployment of the classifier against challenge data
        europarl_testfile = "europarl.test"
        results_outfile   = "europarl_test_classified_attempt_"+str(number_of_documents)+"_"+str(upto_percentage)+"_"+str(number_of_common_letters)+".csv"
        #use for quick testing, to test just a subset of all documents read
        #every other 1000 would only classifies every other n document on testfile        
        everyother = 1
        start = time.time()
        total_ctr, positive_ctr, negative_ctr = test_europarltest_file(europarl_testfile, results_outfile, everyother, classifier,word_features,number_of_common_letters_int)
        #results
        print("        Number of documents: "+number_of_documents)
        print(" Percentage of common words: "+upto_percentage)
        print("   Number of common letters: "+number_of_common_letters)
        print("            Total attempted: "+str(total_ctr))
        print("       Classified correctly: "+str(positive_ctr))
        print("     Classified incorrectly: "+str(negative_ctr))
        accuracy = (positive_ctr/total_ctr) * 100
        print("      Europar test Accuracy: is",accuracy)
        print("Elapsed time for accuracy testing:"+print_elapsed_time())

-------------------------------
           Processing:classifier_10000_10_10.pickle
        Number of documents: 10000
 Percentage of common words: 10
   Number of common letters: 10.pickle
            Total attempted: 21000
       Classified correctly: 19649
     Classified incorrectly: 1351
      Europar test Accuracy: is 93.56666666666666
Elapsed time for accuracy testing:0:06:44
-------------------------------
           Processing:classifier_10000_15_5.pickle
        Number of documents: 10000
 Percentage of common words: 15
   Number of common letters: 5.pickle
            Total attempted: 21000
       Classified correctly: 19203
     Classified incorrectly: 1797
      Europar test Accuracy: is 91.44285714285715
Elapsed time for accuracy testing:0:14:51
-------------------------------
           Processing:classifier_10000_20_6.pickle
