# Naive Bayes Implementation

## Imports
    

In [1]:
import sys, re, collections, pickle
import os
import matplotlib.pyplot as plt
import pprint
import numpy as np
from copy import deepcopy

# Data Loading

- ```save()``` : Takes an object and makes a pickle file for it
- ```load()``` : Loads objects from pickle files
- ```load_data_test```: Gets the data from the 

In [2]:
PATH_DATA_TRAIN = '../Data/train.txt'
PATH_DATA_TEST = '../Data/dev.txt'

def save(obj, name):
    if 'files' not in os.listdir():
        os.mkdir('files')
    with open('files/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load(name):
    try:
        with open('files/' + name + '.pkl', 'rb') as f:
            return pickle.load(f)
    except:
        return None
       
def load_data_test():
        training_lines = load('training_lines')
        testing_lines = load('testing_lines')
        
        if not (training_lines and testing_lines):
            training_lines = []
            testing_lines = []
            temp = []
            temp1 = []
            f = open(PATH_DATA_TRAIN, 'r')
            k = open(PATH_DATA_TEST, 'r')
            data = f.read()
            data1 = k.read()
            f.close()
            k.close()
            raw_data = data.split('\n')
            raw_data1 = data1.split('\n')
            for i in raw_data:
                if i == '':
                    training_lines.append(temp)
                    temp = []
                else:
                    temp.append(i)
            save(training_lines, 'training_lines')
            for j in raw_data1:
                if j == '':
                    testing_lines.append(temp1)
                    temp1 = []
                else:
                    temp1.append(j)
            save(testing_lines, 'testing_lines')

In [3]:

load_data_test()
training_lines = load('training_lines')
testing_lines = load('testing_lines')

# Classification

In [None]:
```create_grams``` : gets the grams given a sentence
```counts``` : gets the count of the

In [4]:
def create_grams(sent, i, N):
    grams = tuple()
    for j in range(i-N+1, i+1):
        try:
            if j < 0:
                grams += (' ', )
            else:
                grams += (sent[j].split('   ')[0], )
        except:
            grams += (' ', )
    return grams

def counts(data, N):
    counter = {}
    for sentences in data:
        for iter, token_group in enumerate(sentences):
            tokens = token_group.split('   ') # 3 spaces to get the various coloumns
            if len(tokens) == 3: # All the three elements exist
                if tokens[2] in counter:
                    counter[tokens[2]].update([create_grams(sentences, iter, N)])
                else:
                    counter[tokens[2]] = collections.Counter([create_grams(sentences, iter, N)])
    return counter

def make_counts():
    counts_dict = load('count_dict')
    if not counts_dict:
        counts_dict = {}
        for iter in range(1, 4):
            counts_dict[iter] = counts(training_lines, iter)
        save(counts_dict, 'count_dict')
    print("counts made")
    return counts_dict
# counts has the counter of all the error classes with it's various words in it
count = make_counts()

counts made


In [5]:
count[2]

{'Nn': Counter({('house', 'effect'): 1,
          (':', '//www.purdue.edu/uns/x/2007a/070314agrawalbiomass.html'): 1,
          ('faraway', 'situation'): 1,
          ('the', 'way'): 1,
          (',', 'individual'): 10,
          ('migration', 'issue'): 1,
          ('that', 'incident'): 1,
          ('require', 'much'): 1,
          (',', 'it'): 1,
          (' ', 'human'): 2,
          ('design', 'process'): 17,
          ('hidden', 'problem'): 1,
          ('real', 'condition'): 1,
          (' ', 'adult'): 1,
          ('funded', 'field'): 1,
          ('that', 'crimes'): 1,
          ('carbon', 'emission'): 2,
          ('far', 'distance'): 1,
          ('education', 'opportunity'): 1,
          ('the', 'coins'): 1,
          ('of', 'rbmk'): 1,
          ('as', 'human'): 6,
          ('facing', 'illness'): 1,
          ('and', 'inconvenience'): 1,
          ('in', 'the'): 6,
          ('her', 'relative'): 1,
          ('ongoing', 'researches'): 2,
          ('medical', 'cure'): 1

# Probability Calculations

In [17]:
error_classes = count[1].keys() # Stores the error classes
length_of_gram_vocab = {} # Keeps the count of no of tokens for each error class in i grams 
gram_vocabulary = {} # All unique tokens for each gram
error_count_keeper = {} # 

unique_grams_count = {} # Assigning the total number of unique tokens for each i gram

for i in range(1, 4):
    gram_vocabulary[i] = set() # Get the unique grams
    length_of_gram_vocab[i] = dict()
    for error in error_classes:
        gram_vocabulary[i] = gram_vocabulary[i] or set(count[i][error].keys())
        length_of_gram_vocab[i][error] = sum(count[i][error].values())
    unique_grams_count[i] = len(gram_vocabulary[i])
    
for j in range(1, 4):
    error_count_keeper[j] = {}
    for error in error_classes:
        error_count_keeper[j][error] = sum([count[j][error][k] for k in gram_vocabulary[j]]) + unique_grams_count[j]

def smoothed_probability(gram, error_class, n, smoothing_factor=0.1):
    # returns the smoothed probability
    count_of_the_gram_in_error_class = count[n][error_class][gram]
    return (count_of_the_gram_in_error_class + smoothing_factor) / error_count_keeper[n][error_class]

def load_probability():
    # initialization function
    probability = load('probability')
    if not probability:
        probability = deepcopy(count)
        for j in range(1, 4):
            for error_tags in count[j].keys():
                for gram in count[j][error_tags].keys():
                    probability[j][error_tags][gram] = smoothed_probability(gram, error_tags, j)
            save(probability, 'probability')
    print('probabilities loaded')
    


In [18]:
load_probability()
probability = load('probability')

probabilities loaded


In [20]:
print(probability[2])

{'Nn': Counter({('power', 'plant'): 0.0036645396536007296, ('design', 'process'): 0.0031175934366453966, ('nuclear', 'reactor'): 0.0031175934366453966, ('of', 'the'): 0.002388331814038286, ('fossil', 'fuels'): 0.002023701002734731, (',', 'the'): 0.002023701002734731, ('their', 'life'): 0.002023701002734731, (',', 'individual'): 0.0018413855970829535, ('senior', 'citizen'): 0.0018413855970829535, ('high', 'temperature'): 0.0018413855970829535, ('nuclear', 'weapon'): 0.0018413855970829535, ('nuclear', 'plant'): 0.001659070191431176, ('for', 'the'): 0.0014767547857793982, (' ', 'government'): 0.0014767547857793982, ('the', 'rights'): 0.0014767547857793982, ('energy', 'source'): 0.0014767547857793982, ('to', 'the'): 0.0012944393801276207, ('the', 'need'): 0.0012944393801276207, ('human', "'s"): 0.0012944393801276207, ('fossil', 'fuel'): 0.0012944393801276207, ('as', 'human'): 0.0011121239744758431, ('in', 'the'): 0.0011121239744758431, ('family', 'member'): 0.0011121239744758431, ('aging',

In [21]:
def get_prob(gram, error_class, n):
    if gram in probability[n][error_class].keys():
        return probability[n][error_class][gram]
    else:
        return smoothed_probability(gram, error_class, n)

def prob_class(grams, error_class, n, log=True):
    probs_grams = [get_prob(gram, error_class, n) for gram in grams]
    product = np.product(probs_grams)
    final_prob = product * (length_of_gram_vocab[n][error_class] / sum(length_of_gram_vocab[n].values()))
    if log:
        return np.log(final_prob)
    return final_prob

def naive_classifier(groups, n):
    temp = [(error, prob_class(groups, error, n)) for error in error_classes]
    return sorted(temp, key=lambda k: k[1], reverse=True)

In [23]:
sent = testing_lines[5]
print(sent)
sent_grams = lambda sent, n: [create_grams(sent, i, n) for i in range(len(sent))]
naive_classifier(sent_grams(sent, 1), 1)

['if', 'the', 'fertility', 'rate', 'continues', 'to', 'drop', ',', 'there', 'will', 'be', 'not', '-', 'enough', 'young', 'people', 'to', 'replace', 'the', 'older', 'generation', '.']


[('Um', -130.55748133735426),
 ('WOinc', -135.29370065407343),
 ('Rloc-', -138.3916479824075),
 ('Wci', -139.10060319337964),
 ('Wform', -149.9001637264522),
 ('Srun', -150.05472153549732),
 ('Wtone', -150.57646920497564),
 ('Others', -154.70396730534463),
 ('WOadv', -155.26364251327823),
 ('Trans', -155.9737148198254),
 ('Mec', -156.3414501118055),
 ('Sfrag', -157.39481994691184),
 ('Vform', -159.51688894577615),
 ('Cit', -159.62129812550396),
 ('Spar', -159.7527817489583),
 ('Vt', -162.40925297347138),
 ('Ssub', -164.81806542896786),
 ('Vm', -165.92463888177193),
 ('Nn', -166.59745355467874),
 ('Prep', -170.86706237428916),
 ('Pref', -171.95035822994535),
 ('SVA', -173.06835631254123),
 ('ArtOrDet', -173.0687312625282),
 ('Smod', -174.3874199173404),
 ('V0', -180.87962338796612),
 ('Npos', -181.2228534992274),
 ('Pform', -185.31109081296668),
 ('Wa', -204.31691837286874)]