In [None]:
import os
import re
import math
import nltk
import sklearn.metrics as skm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from more_itertools import locate
from operator import itemgetter
from sklearn import metrics
import pandas as pd
import random

In [None]:
# Directory holding the data files. read_file() concatenates this with a bare
# file name, so it must end with a path separator. os.path.join(..., "") adds
# the separator of the current platform instead of a hard-coded backslash,
# which only worked on Windows.
path = os.path.join(os.getcwd(), "")
print(path)

#### **CODE TO SET UP FOR TRAINING AND CLASSIFICATION**
##### **SET `classification_label` BELOW TO `'bass'` OR `'sake'` (LOWERCASE)**

In [None]:
# Target word whose sense is being classified. The two class labels are this
# word and the same word prefixed with "*"; the data files read by the notebook
# are named <classification_label>.trn and <classification_label>.tst.
classification_label = 'bass'

In [None]:
def read_file(file_name):
    """Read a data file from the notebook's data directory.

    Args:
        file_name: Name of the file, appended directly to the module-level
            ``path`` prefix.

    Returns:
        A list with one string per line of the file, line endings included.
    """
    full_name = path + file_name

    with open(full_name) as handle:
        return list(handle)

In [None]:
# remove punctuation and lowercase all text
def remove_punctuations(dataset):
    """Normalise raw "label:text" lines and count the lines per class.

    For every labelled line the text part is lowercased, stripped of
    "<x>"-style one-character markers, punctuation and underscores, and
    trimmed of surrounding whitespace.

    Args:
        dataset: Iterable of raw lines of the form "label:text".

    Returns:
        A tuple ``(processed, classA_count, classB_count)`` where
        ``processed`` is a list of "label:\\ttext" strings, ``classA_count``
        is the number of lines labelled ``classification_label`` and
        ``classB_count`` is the number of all other labelled lines.
    """
    processed_training_set = []
    classA_count = 0
    classB_count = 0

    for line in dataset:
        # Blank lines (e.g. a trailing newline at the end of the file) and
        # lines without a label separator carry no label. Counting them would
        # skew the majority class, and emitting them would break the
        # "label:\ttext" format that the later cells split on.
        if ':' not in line:
            continue

        sentence_list = line.split(':')
        label = sentence_list[0].strip()
        sentence = ''.join(sentence_list[1:]).lower()

        # count the number of labels for each class
        if label == classification_label:
            classA_count += 1
        else:
            classB_count += 1

        # remove the doc stuff
        # NOTE(review): this pattern only matches "<", ONE non-word character,
        # ">" (e.g. "<@>"). Tags such as "<s>" are not matched and survive as
        # the token "s" after punctuation removal - confirm that is intended.
        sentence = re.sub(r'\<[\W\s]\>', '', sentence)

        # remove underscores and punctuation
        sentence = re.sub(r'[^\w\s]', '', sentence).replace('_', '')

        # Trim the text BEFORE attaching the label so the ":\t" separator
        # survives even when the text is empty.
        processed_training_set.append(label + ':\t' + sentence.strip())

    return processed_training_set, classA_count, classB_count

In [None]:
# bump the per-class tally of a collocation for the class it was seen with
def increment_count(current_collocation, word_class, collocations):
    """Add one observation of ``current_collocation`` for ``word_class``.

    Args:
        current_collocation: Collocation string used as the dictionary key.
        word_class: Class label the collocation was observed with; must be
            ``classification_label`` or ``"*" + classification_label``.
        collocations: Dictionary mapping collocation -> per-class counts;
            updated in place.
    """
    tally = collocations.get(current_collocation)

    if tally is None:
        # first sighting of this collocation: start both classes at zero
        tally = {classification_label: 0, "*" + classification_label: 0}

    tally[word_class] += 1

    # write the (possibly new) tally back under the collocation key
    collocations[current_collocation] = tally

In [None]:
def collocations_in_positions(index, words_list, word_class, k, collocations):
    """Count the word n-grams that reach k words left and right of each target.

    For every target position ``i`` two collocations are recorded when they
    fit inside the sentence: the words from ``i - k`` through ``i`` and the
    words from ``i`` through ``i + k`` (each joined with single spaces).

    Args:
        index: Positions of the target word in ``words_list``.
        words_list: Tokens of one sentence.
        word_class: Class label of the sentence.
        k: How many neighbouring words to include on each side.
        collocations: Dictionary of per-class counts, updated in place.
    """
    for i in index:
        # left n-gram: words i-k .. i. ``i - k == 0`` is a valid start (the
        # n-gram begins at the first word), so the bound must be inclusive.
        if i - k >= 0:
            current_collocation = " ".join(words_list[i - k: i + 1])
            increment_count(current_collocation, word_class, collocations)

        # right n-gram: words i .. i+k
        if i + k < len(words_list):
            current_collocation = " ".join(words_list[i: i + k + 1])
            increment_count(current_collocation, word_class, collocations)

In [None]:
def pos_tagging(words_list):
    """Return the POS tag of every token, keeping the target word itself.

    Tokens are tagged with ``nltk.pos_tag``; a token equal to
    ``classification_label`` is returned verbatim instead of its tag.

    Args:
        words_list: Tokens of one sentence.

    Returns:
        A list with one entry per token: the tag, or the target word.
    """
    tagged = []
    for token, tag in nltk.pos_tag(words_list):
        if token == classification_label:
            tagged.append(token)
        else:
            tagged.append(tag)
    return tagged

In [None]:
# to find the pos collocations
def pos_collocations(index, words_list, word_class, k, collocations):
    """Count POS-tag collocations k positions left and right of each target.

    The recorded keys have the form "<tag> [* ...] <word>" for the left side
    and "<word> [* ...] <tag>" for the right side, where ``<word>`` is the
    token at the target position and one "*" placeholder is inserted for every
    skipped position when ``k > 1``.

    Args:
        index: Positions of the target word in ``words_list``.
        words_list: Tokens of one sentence.
        word_class: Class label of the sentence.
        k: Distance of the tagged neighbour from the target.
        collocations: Dictionary of per-class counts, updated in place.
    """
    positions = list(index)

    # Nothing to record without a target position, so skip the tagger call.
    if not positions:
        return

    pos = pos_tagging(words_list)
    gap = '* ' * (k - 1)

    for i in positions:
        # POS at i - k. ``i - k == 0`` is the first token of the sentence and
        # therefore a valid neighbour, so the bound must be inclusive.
        if i - k >= 0:
            current_collocation = pos[i - k] + " " + gap + words_list[i]
            increment_count(current_collocation, word_class, collocations)

        # POS at i + k
        if i + k < len(words_list):
            current_collocation = words_list[i] + " " + gap + pos[i + k]
            increment_count(current_collocation, word_class, collocations)


In [None]:
def window_collocations(words_list, word_class, collocations):
    """Count every context word of a sentence for the sentence's class.

    Each token of ``words_list`` other than ``classification_label`` is
    counted once per occurrence. No window size is applied here: the whole
    token list passed in is treated as the context.

    Args:
        words_list: Tokens of one sentence.
        word_class: Class label of the sentence.
        collocations: Dictionary of per-class counts, updated in place.
    """
    # the target word itself is not a context word
    context_words = (token for token in words_list if token != classification_label)

    for token in context_words:
        increment_count(token, word_class, collocations)

In [None]:
def build_collocations(dataset):
    """Collect per-class counts of all collocation features in a dataset.

    Args:
        dataset: Iterable of pre-processed "label:\\ttext" lines.

    Returns:
        Dictionary mapping each collocation string to its per-class counts.
    """
    collocations = {}

    for line in dataset:
        word_class, text = line.split(":\t")
        words_list = text.split()

        # every position at which the target word occurs in this sentence
        index = [pos for pos, token in enumerate(words_list)
                 if token == classification_label]

        # word n-grams reaching 1 and then 2 positions around the target
        for distance in (1, 2):
            collocations_in_positions(index, words_list, word_class, distance, collocations)

        # POS tag directly before / after the target
        pos_collocations(index, words_list, word_class, 1, collocations)

        # individual context words of the sentence
        window_collocations(words_list, word_class, collocations)

    return collocations

In [None]:
# smoothing constant added to both class counts (error term suggested in the paper)
error_term = 0.1
def log_likelihood(collocation, count):
    """Score how strongly a collocation favours one class over the other.

    Args:
        collocation: The collocation string (not used in the computation).
        count: Per-class counts keyed by ``classification_label`` and
            ``"*" + classification_label``.

    Returns:
        0 when the collocation was seen fewer than 2 times in total;
        otherwise the base-2 log of the ratio of the smoothed counts,
        positive when ``classification_label`` dominates.
    """
    in_class = count[classification_label]
    out_class = count['*' + classification_label]

    # collocations observed fewer than twice are ignored
    if in_class + out_class < 2:
        return 0

    # error_term keeps the ratio finite when one class count is zero
    return math.log2((in_class + error_term) / (out_class + error_term))

In [None]:
# turns the collocation counts into the rules used to categorize the data
def build_decision_list(current_collocation):
    """Create decision rules from per-class collocation counts.

    Args:
        current_collocation: Dictionary mapping collocation -> per-class
            counts.

    Returns:
        A list of ``(collocation, strength, label)`` tuples in dictionary
        order, where ``strength`` is the absolute log-likelihood and ``label``
        the class it points to. Collocations whose strength is below 2 are
        left out.
    """
    rules = []

    for name, tally in current_collocation.items():
        score = log_likelihood(name, tally)
        strength = abs(score)

        # weak evidence says little about the class, so no rule is created
        if strength < 2:
            continue

        # positive score -> target class, otherwise the "*" class
        winner = classification_label if score > 0 else '*' + classification_label
        rules.append((name, strength, winner))

    return rules

In [None]:
def train_model():
    """Train the decision-list classifier from ``<classification_label>.trn``.

    Returns:
        A tuple ``(decision_list, default_class)``: the rules ordered from
        strongest to weakest, and the fallback class, which is
        ``classification_label`` only when it has strictly more training
        lines than the other class.
    """
    # load and pre-process the training data
    raw_lines = read_file(classification_label + '.trn')
    cleaned_lines, classA_count, classB_count = remove_punctuations(raw_lines)

    # majority class is the fallback prediction; ties go to the "*" class
    if classA_count > classB_count:
        default_class = classification_label
    else:
        default_class = '*' + classification_label

    # count collocations and turn them into rules
    decision_list = build_decision_list(build_collocations(cleaned_lines))

    # strongest rules first, so the first match at test time wins
    decision_list.sort(key=lambda rule: rule[1], reverse=True)

    return decision_list, default_class

#### **CLASSIFICATION TESTING**

In [None]:
# test the sentence and find the rule
def test_sentence(sentence, decision_list, default_class):
    pos = pos_tagging(sentence.split())
    for rules in decision_list: 
        
        # if the current decision rule is found in the test sentence then return that decision rule
        if ' ' + rules[0] + ' ' in sentence or rules[0] in pos:
            return rules[2]

    # return the class with majority count if nothing is in the decision tree that matches
    return default_class


In [None]:
def statistics(actual_list, predicted_list):
    """Print the confusion matrix, accuracy, recall and precision.

    Args:
        actual_list: True labels, one per test sentence.
        predicted_list: Predicted labels, in the same order.
    """
    # confusion matrix
    print("Confusion Metrics: ")
    matrix = skm.confusion_matrix(actual_list, predicted_list)
    print(matrix)

    print()

    # accuracy, shown as a percentage
    accuracy_pct = skm.accuracy_score(actual_list, predicted_list) * 100
    print("Accuracy Score ", accuracy_pct, '%')

    # recall and precision, reported per class (average=None)
    recall = skm.recall_score(actual_list, predicted_list, average=None)
    print("Recall Score ", recall)

    precision = skm.precision_score(actual_list, predicted_list, average=None)
    print("Precision Score ", precision)

In [None]:
def test_model(decision_list, testing_set, default_class, flag=False):
    count = 0
    actual_list = []
    predicted_list = []

    # this is the whole test
    for test in testing_set:
        label, text = test.split(":\t")
        
        # use model for classification
        predicted_label = test_sentence(text, decision_list, default_class)

        actual_list.append(label)
        predicted_list.append(predicted_label)

#         if flag:
#             print("Predicted label %s vs Actual label %s " % (predicted_label, label))

        # how many we got wrong with our test
        if predicted_label != label:
            count += 1

    return count, actual_list, predicted_list

In [None]:
def testing(decision_list, default_class):
    testing_set = read_file(classification_label + '.tst')
    processed_testing_set = remove_punctuations(testing_set)[0]

    # test the model
    incorrect_count, actual_list, predicted_list = test_model(decision_list, processed_testing_set, default_class, True)

    print("Number of incorrect classifications ", incorrect_count)

    statistics(actual_list, predicted_list)


In [None]:
# Train on <classification_label>.trn: yields the sorted decision list and the
# majority-class fallback label.
dec_list, default_class = train_model()
# Evaluate on <classification_label>.tst and print the error count and metrics.
testing(dec_list, default_class)