# Importing the Data
Performing attribute reduction by removing empty columns.

In [1]:
DATASET_DIR = './data/' # Datasets to be placed here
# pip install pandas
# pip install xlrd

import os
import pandas as pd
import numpy as np

# Load the essays, drop unusable columns and collect the minimum / maximum
# score that can be given in each of the 8 essay sets.

# reading from tsv file (tab - separated) with Latin alphabet encoding including special symbols
X = pd.read_csv(os.path.join(DATASET_DIR, 'training_set_rel3.tsv'), sep='\t', encoding='ISO-8859-1')
# keep the target before any column is removed
Y = X['domain1_score']
# attribute reduction: dropna(axis=1) removes every column that contains at least one missing value
X = X.dropna(axis=1)
# the individual rater scores are not used as features
X = X.drop(columns=['rater1_domain1', 'rater2_domain1'])
# read the set descriptions from the same dataset directory as the essays
Z = pd.read_excel(os.path.join(DATASET_DIR, 'essay_set_descriptions.xlsx'))

# minimum and maximum scores for each dataset
# a placeholder (-1) is inserted at index 0 so that the lists can be indexed
# directly with the 1-based essay_set number
minimum_scores = Z['min_domain1_score'].to_list()
minimum_scores.insert(0,-1)
maximum_scores = Z['max_domain1_score'].to_list()
maximum_scores.insert(0,-1)
print('\n----------Essay Set Descriptions----------\n')
print(Z[['essay_set','type_of_essay','training_set_size']])

print('\nMinimum scores for each essay set:')
print(minimum_scores[1:9])
print('\nMaximum scores for each essay set:')
print(maximum_scores[1:9])


----------Essay Set Descriptions----------

   essay_set                         type_of_essay  training_set_size
0          1  persuasive / narrative  / expository               1783
1          2  persuasive / narrative  / expository               1800
2          3            source dependent responses               1726
3          4            source dependent responses               1772
4          5            source dependent responses               1805
5          6            source dependent responses               1800
6          7  persuasive / narrative  / expository               1569
7          8  persuasive / narrative  / expository                723

Minimum scores for each essay set:
[2, 1, 0, 0, 0, 0, 0, 0]

Maximum scores for each essay set:
[12, 6, 3, 3, 4, 4, 30, 60]


## Main Dataframe
Our main dataframe consists of 12975 sample essays, with essay_id values going up to 21633, divided into 8 sets.

In [2]:
# Display the cleaned essay dataframe (the notebook output is truncated to the first and last rows)
X

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,1,"Dear @LOCATION1, I know having computers has a...",8
...,...,...,...,...
12971,21626,8,In most stories mothers and daughters are eit...,35
12972,21628,8,I never understood the meaning laughter is th...,32
12973,21629,8,"When you laugh, is @CAPS5 out of habit, or is ...",40
12974,21630,8,Trippin' on fen...,40


## Pre-processing of the Data

These are all helper functions used to clean and tokenize the essays into sentences and wordlists.

In [3]:
# !pip install gensim for Word2vec and Fasttext Model
# !pip install nltk for natural Language Processing 

import nltk
# first time run download these packages
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('popular')

import re # for regular expressions operations 
from nltk.corpus import stopwords

def essay_to_wordlist(essay_v, remove_stopwords):
    """Split a piece of text into a list of lowercase, purely alphabetic words.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lowercased and split on whitespace.

    Args:
        essay_v: the text (essay or single sentence) to tokenize.
        remove_stopwords: when truthy, words found in NLTK's English
            stop-word list are filtered out.

    Returns:
        The list of remaining words, in their original order.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", essay_v)
    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens
    english_stops = set(stopwords.words("english"))
    return [token for token in tokens if token not in english_stops]

def essay_to_sentences(essay_v, remove_stopwords):
    """Split an essay into sentences and tokenize each sentence into words.

    The essay is stripped of surrounding whitespace, cut into sentences with
    NLTK's English punkt tokenizer, and every non-empty sentence is passed to
    essay_to_wordlist().

    Args:
        essay_v: the raw essay text.
        remove_stopwords: boolean forwarded unchanged to essay_to_wordlist().

    Returns:
        A list with one word list per sentence.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    return [
        essay_to_wordlist(sentence, remove_stopwords)
        for sentence in punkt.tokenize(essay_v.strip())
        if len(sentence) > 0
    ]

def makeFeatureVec(words, model, num_features):
    """Average the embedding vectors of the words of one essay.

    Args:
        words: list of words of the essay.
        model: embedding model exposing ``model.wv.index2word`` (its
            vocabulary) and ``model.wv[word]`` (the vector of a word).
        num_features: length of the embedding vectors.

    Returns:
        A float32 vector of length ``num_features``: the mean of the vectors
        of all words found in the vocabulary, or all zeros when none is found.
    """
    vocabulary = set(model.wv.index2word)
    # only words known to the model contribute to the average
    known_words = [w for w in words if w in vocabulary]

    total = np.zeros((num_features,), dtype="float32")
    for token in known_words:
        total = np.add(total, model.wv[token])

    hits = float(len(known_words))
    if hits:
        # element-wise division turns the sum into the mean
        total = np.divide(total, hits)
    return total

def getAvgFeatureVecs(essays, model, num_features):
    """Build the matrix of averaged word vectors, one row per essay.

    Args:
        essays: sequence of essays, each given as a list of words.
        model: embedding model forwarded to makeFeatureVec().
        num_features: length of the embedding vectors.

    Returns:
        A float32 array of shape ``(len(essays), num_features)`` whose row
        ``k`` is makeFeatureVec() applied to ``essays[k]``.
    """
    essayFeatureVecs = np.zeros((len(essays), num_features), dtype="float32")
    for row, essay in enumerate(essays):
        essayFeatureVecs[row] = makeFeatureVec(essay, model, num_features)
    return essayFeatureVecs

## Word embeddings
We will preprocess all essays and convert them to feature vectors using Word2vec and Fasttext model. We would also perform transfer learning by using pre-trained Word2vec, GloVe and Fasttext models.

All the word embeddings used are 300-dimensional vectors.

In [4]:
# Hyper-parameters shared by the Word2Vec and FastText models trained in this notebook.
num_features = 300 # length of every word vector (passed to the models as `size`)
min_word_count = 40 # passed as `min_count`; presumably the minimum corpus frequency for a word to get a vector - confirm in the gensim docs
num_workers = 8 # passed as `workers` (number of worker threads)
context = 10 # passed as `window` (context window size)
downsampling = 1e-3 # passed as `sample` (down-sampling threshold for frequent words)

## Word2Vec Model
Using word2vec model to make embeddings for visualization.

In [5]:
from gensim.models import Word2Vec # for word2vec
from gensim.test.utils import get_tmpfile # for saving model

allsentences = [] # list of all sentences

for essay in X['essay']:
    # obtaining all sentences from the essays.
    allsentences += essay_to_sentences(essay, remove_stopwords=True)

# for visualization of vectors
visualmodel = Word2Vec(
    allsentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling
)

# make sure the output directory exists before anything is written into it
os.makedirs('./word2vecvis', exist_ok=True)

# Save the full model BEFORE normalizing it in place: this file is reloaded
# by the pre-trained embedding cells and trained further, and gensim documents
# that a model cannot continue training after init_sims(replace=True).
visualmodel.save("./word2vecvis/word2vecvisual.model")

visualmodel.init_sims(replace=True) # normalized model (original vectors are discarded)
# export the normalized vectors in the word2vec text and binary formats
visualmodel.wv.save_word2vec_format('./word2vecvis/word2vecvisual.txt', binary=False)
visualmodel.wv.save_word2vec_format('./word2vecvis/word2vecvisual.bin', binary=True)

# embedding dataframe: one row per vocabulary word, one column per dimension
M=visualmodel.wv[visualmodel.wv.vocab]
df=pd.DataFrame(M)
pd.options.display.max_columns=10
df

Unnamed: 0,0,1,2,3,4,...,295,296,297,298,299
0,-0.093336,-0.045639,0.003970,0.031609,0.012581,...,0.053283,-0.043948,0.028168,-0.124830,-0.030559
1,-0.062648,-0.030446,-0.005180,-0.004587,-0.008295,...,0.043100,-0.063334,-0.018510,-0.062218,-0.012708
2,-0.059883,-0.009011,0.013409,0.068408,-0.005265,...,0.081692,-0.041097,0.002538,-0.120984,-0.020349
3,-0.019802,-0.023876,-0.072023,0.028865,-0.067879,...,0.039249,-0.065322,0.004745,-0.004138,-0.027871
4,-0.134069,-0.119417,0.063802,0.087960,-0.047054,...,0.010881,-0.015373,-0.119925,-0.051917,0.003321
...,...,...,...,...,...,...,...,...,...,...,...
2897,-0.029500,-0.041040,0.038128,-0.017910,0.002117,...,-0.006357,0.005293,-0.070519,0.044967,0.096494
2898,-0.082162,0.037860,-0.034531,0.010890,0.050583,...,-0.045344,-0.074076,0.012369,-0.017194,0.063105
2899,-0.014986,-0.003020,-0.075203,0.040039,0.057193,...,0.010275,-0.038743,0.081464,-0.060517,0.013883
2900,0.023518,-0.009254,-0.087496,0.071146,0.017753,...,-0.010846,-0.004426,0.036070,0.033182,-0.021839


## Pre-Trained Word2Vec Model
It is trained on the Google News dataset (about 100 billion words).

In [6]:
from gensim.models import KeyedVectors

# Start from the Word2Vec model saved by the previous cell, so the vocabulary
# is the one built from the essays.
modelbasic = Word2Vec.load('./word2vecvis/word2vecvisual.model')
# getting embeddings from pretrained word2vec model
# (per the gensim docs, only words already in the vocabulary take the file's
# vectors; lockf=1.0 keeps those vectors trainable - confirm against the docs)
modelbasic.intersect_word2vec_format('./word2vec/word2vec.bin', binary=True, lockf=1.0)

# The steps below are kept for reference only: the training cell further down
# performs the train / init_sims / save sequence on modelbasic.
# modelbasic.train(allsentences,total_examples=len(allsentences), epochs=modelbasic.iter)
# modelbasic.init_sims(replace=True) # for normalizing
# modelbasic.save("./word2vec/word2vecPre.model")

## Pre-Trained Glove Model
It is trained on a Wikipedia + Gigaword corpus of about 6 billion tokens (the `glove.6B` release, 400K-word vocabulary); the 300-dimensional vectors are used here.

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec

# Start from the Word2Vec model saved earlier, so the vocabulary is the one
# built from the essays.
modelglove = Word2Vec.load('./word2vecvis/word2vecvisual.model')
# Converting Glove to Word2Vec: writes ./glove/gloveW2V.txt, the return value is discarded
_ = glove2word2vec('./glove/glove.6B.300d.txt', "./glove/gloveW2V.txt")
# Getting embeddings from pretrained Glove model (text format, hence binary=False)
modelglove.intersect_word2vec_format('./glove/gloveW2V.txt', binary=False, lockf=1.0)

# The steps below are kept for reference only: the training cell further down
# performs the train / init_sims / save sequence on modelglove.
# NOTE(review): the first commented line reads the epoch count from modelbasic, not modelglove.
# modelglove.train(allsentences,total_examples=len(allsentences), epochs=modelbasic.iter)
# modelglove.init_sims(replace=True) for normalizing
# modelglove.save("./glove/gloveW2V.model")

## Fasttext
It is similar to the word2vec model but also contains embeddings for character n-grams, which help on datasets with out-of-vocabulary words.

In [None]:
from gensim.models.fasttext import FastText

# Train a FastText model on every sentence of the corpus, with the same
# hyper-parameters as the Word2Vec model above.
ftmodel = FastText(
    allsentences, 
    workers=num_workers, 
    size=num_features, 
    min_count = min_word_count, 
    window = context, 
    sample = downsampling
)
# ftmodel.save("./fasttext/fasttext.model") for saving with ngrams

# embedding dataframe: one row per vocabulary word, one column per dimension
F=ftmodel.wv[ftmodel.wv.vocab]
ft=pd.DataFrame(F)
# show at most 10 columns when the dataframe is displayed
pd.options.display.max_columns=10
ft

## Pre-Trained Fasttext Model
It is the fasttext model pre-trained on Wiki-news also containing sub-words data.

In [None]:
# Start from the Word2Vec model saved earlier (a Word2Vec object, so only whole-word
# vectors of the pre-trained file can be merged in).
ftmodelPre = Word2Vec.load('./word2vecvis/word2vecvisual.model')
# NOTE(review): total_examples is not referenced again in the visible part of the notebook.
total_examples = ftmodelPre.corpus_count
# Getting embeddings from pretrained Fasttext model 
# NOTE(review): unlike the Word2Vec and GloVe cells, no lockf is passed here, so gensim's
# default applies (presumably the merged vectors stay frozen during later training) -
# confirm that this difference is intended.
ftmodelPre.intersect_word2vec_format('./fasttext/wiki-news-300d-1M-subword.vec')

# The steps below are kept for reference only: the training cell further down trains ftmodelPre.
# ftmodelPre.train(allsentences,total_examples=len(allsentences), epochs=ftmodel.iter)
# ftmodelPre.save("./fasttext/fasttextPre.model") for saving with ngrams

## Visualization
Files needed for visualization on Embedding Projector, Tensorflow.

In [None]:
# Storing Visual model tsv files
import io # for input output

# files needed for tensorboard / the Embedding Projector:
#   vecs.tsv - one tab-separated vector per line
#   meta.tsv - the matching word, one per line (same order as vecs.tsv)
# The `with` block closes both files even if writing fails part-way.
with io.open('./word2vecvis/vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('./word2vecvis/meta.tsv', 'w', encoding='utf-8') as out_m:
    # index2word and vectors are parallel: entry k of one belongs to row k of the other
    for word, vec in zip(visualmodel.wv.index2word, visualmodel.wv.vectors):
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")

# Open http://projector.tensorflow.org/
# Click “Load Data” button from the left menu.
# Select “Choose file” in “Load a TSV file of vectors.” and choose “vecs.tsv” file.
# Select “Choose file” in “Load a TSV file of metadata.” and choose “meta.tsv” file.
# The model has been visualized in 3D/2D.
# The projection is done by choosing the directions with the highest variance as dimensions.

## Exploratory Analysis
It is performed using Principal Component Analysis(PCA)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm

#Computing the correlation matrix
M_corr=df.corr()

#Computing eigen values and eigen vectors
# The correlation matrix is symmetric, so eigh is the appropriate solver:
# it always returns real eigenvalues (eig may return complex values and
# gives no ordering guarantee). The plain ndarray is passed so that the
# results are ndarrays as well.
values,vectors=np.linalg.eigh(M_corr.values)

# eigh returns the eigenvalues in ascending order; sort them (and the matching
# eigenvector columns) in descending order so that the first columns are the
# directions explaining the most variance.
args = np.argsort(values)[::-1]
values = values[args]
vectors = vectors[:, args]

# our aim is to cover maximum variance possible
# no. of components selected = no. of plotting dimensions
tot = sum(values) # summation of eigenvalues
print("\n",tot)
var_exp = [(i / tot)*100 for i in values[:5]] # first 5 variance in desc order
print("\n1. Variance Explained\n",var_exp)
cum_var_exp = np.cumsum(var_exp) # first 5 cumulative variance
print("\n2. Cumulative Variance Explained\n",cum_var_exp)

#Taking first 2 components which explain maximum variance for projecting
new_vectors=vectors[:,:2]

new_M=np.dot(M,new_vectors)
print("\n",new_M) # coordinates for 2D plot

plt.figure(figsize=(8,6))
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.title('Variances of various components',size=15)
plt.xlabel("Components",size=15)
plt.ylabel('Variance',size=15)
plt.bar(range(1,6),
        var_exp,
        width=0.8,
        color = cm.rainbow(np.linspace(0, 1, len(var_exp))))

## Word Embedding Plot
Using PCA, we plot the word vectors in a 2-dimensional space.

In [None]:
# Scatter plot of every vocabulary word in the 2-D PCA space
plt.figure(figsize=(15,12))
for side in ('top', 'right'):
    # hide the top and right borders of the axes
    plt.gca().spines[side].set_visible(False)
plt.scatter(new_M[:,0], new_M[:,1], linewidths=1, color='skyblue', alpha=0.75)
plt.xlabel("PC1", size=15)
plt.ylabel("PC2", size=15)
plt.title("Word Embedding Space", size=20)
vocab = list(visualmodel.wv.vocab)
for i, word in enumerate(vocab):
    if i % 37 != 0:
        continue
    # selective annotations: label only every 37th word to keep the plot readable
    plt.annotate(word, xy=(new_M[i,0], new_M[i,1]))
  

## Defining the model 
We have used three types of models namely:
Dual layer LSTM, Bi-directional LSTM, Convolutional neural network LSTM

In [None]:
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, Conv1D
from keras.models import Sequential
import keras.backend as K

# Bidirectional evaluates the input sequence in both directions.
# Conv1D adds a CNN layer instead of a first LSTM layer.
# return_sequences=True makes the first layer output a sequence, as required by the next LSTM layer.
# Dropout layers are for regularization:
# `dropout` is applied to the inputs and `recurrent_dropout` to the recurrent state.
# Instead of using sigmoid activation in the output layer we use ReLU since we are not normalising training labels.
# x is 0-> Dual layer LSTM, 1-> Bidirectional LSTM, 2-> CNN LSTM

def get_model(x):
    """Build, compile and summarize one of the three scoring networks.

    Every network takes inputs of shape (1, 300), i.e. one timestep holding a
    300-dimensional averaged essay vector, and ends with the same head:
    LSTM(128) -> Dropout(0.5) -> Dense(1, relu).

    Args:
        x: architecture selector: 0 -> dual layer LSTM, 1 -> bidirectional
            LSTM, 2 -> CNN + LSTM.

    Returns:
        The compiled Keras ``Sequential`` model (MSE loss, Adam optimizer,
        MAE metric). Its summary is printed as a side effect.

    Raises:
        ValueError: if ``x`` is not 0, 1 or 2.
    """
    model = Sequential()
    if x == 0:
        model.add(LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 300], return_sequences=True))
    elif x == 1:
        # input_shape is given to the Bidirectional wrapper (not the wrapped LSTM)
        # so the model is built as soon as the layer is added and no explicit
        # model.build() call is needed before summary().
        model.add(Bidirectional(LSTM(300, dropout=0.4, recurrent_dropout=0.4, return_sequences=True), input_shape=(1, 300)))
    elif x == 2:
        model.add(Conv1D(64, 3, activation='relu',input_shape=(1,300),padding='same'))
        model.add(Dropout(0.4))
    else:
        # fail early with a clear message instead of at model.summary()
        raise ValueError("x must be 0 (Dual-LSTM), 1 (Bi-LSTM) or 2 (CNN-LSTM), got {!r}".format(x))
    model.add(LSTM(128, recurrent_dropout=0.4))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='relu'))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
    model.summary()
    return model

## Training Phase
Here we train on our data using 5 types of word embeddings and 3 types of LSTM networks, store the resulting Kappa scores in a dataframe, and then do a comparative analysis.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
    
# 70/30 hold-out split with a fixed seed so the split is reproducible
X_train,X_test,y_train,y_test = train_test_split(X, Y, train_size=0.70,test_size=0.30, random_state=0)
print('\nShape of X_train and y_train respectively.\n')
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

train_essays = X_train['essay']
test_essays = X_test['essay']

sentences = []

for essay in train_essays:
    # Obtaining all sentences from the training essays.
    sentences += essay_to_sentences(essay, remove_stopwords = True)

models = ['Dual-LSTM','Bi-LSTM','Cnn-LSTM']
embeddings = ['W2V','W2VP','GloVe','FsTxT','FsTxTP']

# nested dict to store our final result: final[embedding][model] -> kappa score
final = {embedding: {name: 0 for name in models} for embedding in embeddings}

# The word lists of the essays do not depend on the embedding, so they are
# computed once here instead of once per embedding.
clean_train_essays = [essay_to_wordlist(essay_v, remove_stopwords=True) for essay_v in train_essays]
clean_test_essays = [essay_to_wordlist(essay_v, remove_stopwords=True) for essay_v in test_essays]

# NOTE(review): for the pre-trained variants the model is first trained on
# `allsentences` (built from every essay, test essays included) and is trained
# again after init_sims(replace=True); gensim documents that a model should not
# be trained after a replace - confirm this sequence is intended.
for i in range(5): # for every word embedding

    if i == 0:
        # Word2Vec trained from scratch on the training sentences
        model = Word2Vec(
            sentences,
            workers=num_workers,
            size=num_features,
            min_count = min_word_count,
            window = context,
            sample = downsampling
        )
        model.init_sims(replace=True) # for normalizing

    elif i == 1:
        # Word2Vec initialised with the pre-trained Google News vectors
        model = modelbasic
        modelbasic.train(allsentences,total_examples=len(allsentences), epochs=modelbasic.epochs)
        modelbasic.init_sims(replace=True) # for normalizing
        modelbasic.save("./word2vec/word2vecPre.model") # saving model
        model.train(sentences,total_examples=len(sentences), epochs=model.epochs)
        model.init_sims(replace=True) # for normalizing

    elif i == 2:
        # Word2Vec initialised with the pre-trained GloVe vectors
        model = modelglove
        # use the epoch count of the model being trained (was modelbasic.epochs)
        modelglove.train(allsentences,total_examples=len(allsentences), epochs=modelglove.epochs)
        modelglove.init_sims(replace=True) # for normalizing
        modelglove.save("./glove/gloveW2V.model") # saving model
        model.train(sentences,total_examples=len(sentences), epochs=model.epochs)
        model.init_sims(replace=True) # for normalizing

    elif i == 3:
        # FastText trained from scratch on the training sentences
        model = FastText(
            sentences,
            workers=num_workers,
            size=num_features,
            min_count = min_word_count,
            window = context,
            sample = downsampling
        )
        model.init_sims(replace=True) # for normalizing

    elif i == 4:
        # Word2Vec initialised with the pre-trained FastText vectors
        model = ftmodelPre
        model.train(sentences,total_examples=len(sentences), epochs=model.epochs)
        model.init_sims(replace=True) # for normalizing

    # Generate training and testing data word vectors (one averaged vector per essay).
    trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)
    testDataVecs = getAvgFeatureVecs(clean_test_essays, model, num_features)

    # Reshaping train and test vectors to 3 dimensions for LSTM (1 represents one timestep)
    trainDataVecs = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
    testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))

    for j in range(3): # for every model

        print('\n------------------- {} Model with {} Embeddings -------------------\n'.format(models[j],embeddings[i]))

        lstm_model = get_model(j)
        # fitting of the model
        history = lstm_model.fit(
            trainDataVecs,
            y_train,
            validation_data=(testDataVecs,y_test),
            batch_size=32,
            epochs=100,
            shuffle = False
        )

        # This can be used for prediction
        # lstm_model.load_weights('./final_lstm.h5') To load model weight
        # Predicting from test data; scores are rounded to the nearest integer
        y_pred = np.around(lstm_model.predict(testDataVecs))
        # lstm_model.save('./final_lstm.h5')

        # Evaluate the model on the evaluation metric. "Quadratic mean averaged Kappa"
        result = cohen_kappa_score(y_test.values,y_pred,weights='quadratic')
        final[embeddings[i]][models[j]] = result

        print()
        plt.figure(figsize=(10,7))
        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('Training vs Validation Loss', size=25)
        plt.ylabel('Loss', size=25)
        plt.xlabel('No. Of Epochs', size=25)
        plt.legend(['Training','Validation'], loc= 'upper right' )
        plt.show()

        print("\nKappa Score: {}".format(result))
        print('\n------------------- {} Model with {} Embeddings -------------------\n'.format(models[j],embeddings[i]))

## Score Comparison
Here we compare our kappa score for different LSTM - Models

In [None]:
# rows = LSTM models, columns = word embeddings
scoreDb = pd.DataFrame(final)

print('\nFinal Scores Matrix: \n')
print(scoreDb)
print()

plt.figure(figsize=(12,9))
for side in ('top', 'right'):
    # hide the top and right borders of the axes
    plt.gca().spines[side].set_visible(False)
plt.xlabel("Word Embeddings", size=20)
plt.ylabel("Kappa Score", size=20)
plt.title("Scores For Different LSTM Models", size=20)
color = ['red','blue','green']
plt.grid(True, linestyle=':')
plt.tick_params(labelsize=15)
# one line per model (row of scoreDb), each with its own color
for i, line_color in enumerate(color):
    plt.plot(
        scoreDb.columns.values,
        scoreDb.iloc[i].values,
        color=line_color,
        marker='o',
        linestyle='--',
        markersize=10,
    )
plt.legend(scoreDb.index.values, fontsize =15)
plt.show()

## Implementing Multinomial Naive Bayes from Scratch
Implementing multinomial naive Bayes on the basis of word frequency counts, considering only the 3000 most frequent words in the training corpus.


In [None]:
import math as ma
import itertools # used to slice the frequency-sorted dictionary down to its most frequent entries
from nltk.tokenize import word_tokenize # used to tokenize the sentences
# English stop words, built once as a set for fast membership tests
stop_words = set(stopwords.words('english'))

def probability(dictionary,x,score):
    """Return the multinomial naive Bayes log-score of sample ``x`` for class ``score``.

    The value is ``log P(score) + sum_j x[j] * log P(feature j | score)`` where
    the feature likelihoods use Laplace (add-one) smoothing:
    ``(count_j + 1) / (Grand_total + number_of_features)``.

    Args:
        dictionary: trained statistics. ``dictionary["total"]`` is the number
            of training samples and ``dictionary[score]`` holds ``"count"``
            (samples of the class), ``"Grand_total"`` (total word count of the
            class) and one entry per feature index ``0..n-1`` (frequency of
            that feature in the class).
        x: sequence of feature frequencies of the sample, indexed by feature.
        score: the class to evaluate.

    Returns:
        The (unnormalized) log-probability; larger means more likely.
    """
    class_stats = dictionary[score]
    # every key except "count" and "Grand_total" is a feature index
    features_number = len(class_stats.keys()) - 2

    # log prior of the class
    log_prob = ma.log(class_stats["count"]) - ma.log(dictionary["total"])

    # Smoothed denominator: adding the number of features matches the +1 added
    # to every feature count, so the likelihoods sum to one and the logarithm
    # is defined even for a class without any counted word.
    if features_number:
        log_denominator = ma.log(class_stats["Grand_total"] + features_number)
    else:
        log_denominator = 0.0

    for j in range(features_number):
        if x[j] == 0: # a feature absent from the sample contributes nothing
            continue
        # adding one avoids log(0) for features never seen in this class
        log_feature = ma.log(class_stats[j] + 1) - log_denominator
        # multinomial model: every occurrence of the feature counts
        log_prob = log_prob + x[j] * log_feature
    return log_prob

def singlecol(dictionary,x):
    """Predict the class of a single sample.

    Every key of ``dictionary`` except ``"total"`` is treated as a class; the
    class with the highest value of probability() wins, the first one
    encountered being kept on ties.

    Args:
        dictionary: trained statistics as consumed by probability().
        x: feature frequencies of the sample.

    Returns:
        The best class, or -1 when ``dictionary`` holds no class.
    """
    best_cls = -1
    best_prob = None # None marks "no class evaluated yet"
    for clas in dictionary.keys():
        if clas == "total": # total is not a class so ignore it
            continue
        clas_p = probability(dictionary, x, clas)
        if best_prob is None or clas_p > best_prob:
            best_prob, best_cls = clas_p, clas
    return best_cls

X_naive = X['essay'].tolist()
Y_naive = X['domain1_score'].tolist()
sets = X['essay_set'].tolist()

# normalizing score: map the score range of every essay set onto a common 0-5 scale
for i in range(len(sets)):
    Y_naive[i] = Y_naive[i] - minimum_scores[sets[i]]
    Y_naive[i] = int(np.around((Y_naive[i] * 5) / (maximum_scores[sets[i]] - minimum_scores[sets[i]])))

# doing hold out spliting for train and test data
xtrain,xtest,ytrain,ytest = train_test_split(X_naive,Y_naive,test_size=0.3,random_state=0)

len_data = len(xtrain)


def _tokenize_without_stopwords(text):
    """Word-tokenize `text` and drop the tokens found in `stop_words`."""
    # NOTE(review): tokens are not lowercased before the comparison, so a
    # capitalised stop word such as "The" is kept - confirm this is intended.
    return [w for w in word_tokenize(text) if w not in stop_words]


# every essay is tokenized once and reused for counting and vectorizing
train_tokens = [_tokenize_without_stopwords(data) for data in xtrain]
test_tokens = [_tokenize_without_stopwords(data) for data in xtest]

# in this dictionary we store the frequency of each word of the training essays (stop words removed)
dictionary = dict()
for filtered_sentence in train_tokens:
    for word in filtered_sentence:
        dictionary[word] = dictionary.get(word, 0) + 1

# this is the reverse sorted (most frequent first) form of the dictionary used above
new_dict = dict(sorted(dictionary.items(), key=lambda item: item[1], reverse=True))

# features is the list of the (at most) 3000 most frequent words
features = list(itertools.islice(new_dict, 3000))
# word -> column index, so lookups are O(1) instead of scanning the list
feature_index = {word: position for position, word in enumerate(features)}


def _count_matrix(token_lists):
    """Return the (n_documents, n_features) matrix of feature word frequencies."""
    counts = np.zeros((len(token_lists), len(features)))
    for row, tokens in enumerate(token_lists):
        for token in tokens:
            column = feature_index.get(token)
            if column is not None:
                counts[row][column] += 1
    return counts


# x_train and x_test as 2d arrays holding the frequency of each feature word
xx_train = _count_matrix(train_tokens)
xx_test = _count_matrix(test_tokens)

# this is to train algorithm over training data
# result is a nested dictionary:
#   result["total"]                     -> number of training samples
#   result[class]["count"]              -> number of training samples of the class
#   result[class][j]                    -> frequency of feature j in the class
#   result[class]["Grand_total"]        -> total feature word count of the class
result = {"total": len(xx_train)}
classes = set(ytrain)
ytrain_array = np.asarray(ytrain)

for current_class in classes: # accessing all score classes one by one
    # rows of xx_train are aligned with ytrain (not with the unsplit Y_naive)
    x_train_current = xx_train[ytrain_array == current_class]
    feature_counts = x_train_current.sum(axis=0)

    result[current_class] = {"count": len(x_train_current)}
    for j in range(len(features)):
        result[current_class][j] = feature_counts[j]
    result[current_class]["Grand_total"] = feature_counts.sum()

# predicting the test data row by row
ypred = [singlecol(result, x) for x in xx_test]

from sklearn.metrics import classification_report,confusion_matrix
#importing these to check correctness of y_pred(output)
print('\nThis classification is due to our implementation\n')
print(classification_report(ytest,ypred))
print('\n---------------------COMPARISION---------------------\n')
print('\nThis classification is due to sklearn library\n')

from sklearn.naive_bayes import MultinomialNB # now doing the same fit and predict by MultinomialNB library function
arg1=MultinomialNB()
arg1.fit(xx_train,ytrain)
ypred2=arg1.predict(xx_test)

print(classification_report(ytest,ypred2))

## Confusion Matrix for Naive Bayes 
Brighter cells and larger counts along the diagonal indicate correct predictions, while counts spread away from the diagonal indicate misclassified essays.

In [None]:
import seaborn as sns # for seaborn 

# the font scale has to be set before the heatmap is drawn to have any effect on it
sns.set(font_scale=1.4) # for label size

# normalized scores range from 0 to 5; passing the labels explicitly keeps the
# matrix 6x6 even if one score never occurs in ytest / ypred2
score_labels = list(range(6))

plt.figure(figsize=(10,7))
array = confusion_matrix(ytest,ypred2,labels=score_labels)
df_cm = pd.DataFrame(array, score_labels, score_labels)
ax = sns.heatmap(df_cm, annot=True, annot_kws={"size": 16},fmt='d') # font size
plt.xlabel("Predicted Score", size=25)
plt.ylabel("Actual Score", size=25)
plt.tick_params(labelsize=10)
plt.title("Confusion matrix for Multinomial Naive Bayes")
# plt.show() takes no figure/axes argument (its only parameter is `block`)
plt.show()