In [1]:
import random
import os.path
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import pickle
from gensim.models import Word2Vec, keyedvectors
from sklearn.model_selection import train_test_split


# UNZIP THE WORD2VEC200.txt.zip File

def read_text(filepath, dictionary=None):
    """
    gets the text and add it to the dictionary
    Args:
        filepath: the filepath of the text file|
        dictionary: the dictionary storage for True to get quick lookup

    Returns: the dictionary

    """
    if dictionary is None:
        dictionary = {}

    f = open(filepath, "r", encoding='utf8')
    for line in f:
        line = line.strip()
        if line not in dictionary.keys():
            dictionary[line] = True

    f.close()

    return dictionary


In [2]:

pre_split = pd.read_csv('tokenized_df.csv').sample(n=50000, random_state=44)

tokenized_train, tokenized_test = train_test_split(pre_split, test_size=0.2, random_state=44)

positive_words = read_text("positive-words.txt")
negative_words = read_text("negative-words.txt")
profanity = list(pd.read_csv('profanity_en.csv')['text'])

word_embeddings = None

def featurize50(data, word_embeddings):
    """
    we use this format to make implementation of this class more straightforward and to be
    consistent with what you see in nltk
    Parameters:
      data - str like "I loved the hotel"
    Return: a list of tuples linking features to values
    for BoW, a list of tuples linking every word to True [("I", True), ("loved", True), ("it", True)]
    """
    num_positive = 0
    num_negative = 0
    num_profanity = 0

    num_embedding_pos = 0
    num_embedding_neg = 0

    words = data

    # Number of positive lexicon
    for word in words:
        if word in positive_words:
            num_positive += 1
        if word in negative_words:
            num_negative += 1
        if word in profanity:
            num_profanity += num_profanity
        
        try:
            most_similar = word_embeddings.most_similar(positive=[word])[:5]

            for w in most_similar:
                if w[0] in positive_words:
                    num_embedding_pos += 1
                elif w[0] in negative_words:
                    num_embedding_neg += 1

                if w[0] in profanity:
                    num_profanity += 1
        except:
            pass

    difference = num_positive - num_negative
    embed_difference = num_embedding_pos - num_embedding_neg

    return [difference, num_profanity, embed_difference]

def featurize25(data, word_embeddings):
    """
    we use this format to make implementation of this class more straightforward and to be
    consistent with what you see in nltk
    Parameters:
      data - str like "I loved the hotel"
    Return: a list of tuples linking features to values
    for BoW, a list of tuples linking every word to True [("I", True), ("loved", True), ("it", True)]
    """
    num_positive = 0
    num_negative = 0
    num_profanity = 0

    num_embedding_pos = 0
    num_embedding_neg = 0

    words = data

    # Number of positive lexicon
    for word in words:
        if word in positive_words:
            num_positive += 1
        if word in negative_words:
            num_negative += 1
        if word in profanity:
            num_profanity += num_profanity
        
        try:
            most_similar = word_embeddings.most_similar(positive=[word])[:5]

            for w in most_similar:
                if w[0] in positive_words:
                    num_embedding_pos += 1
                elif w[0] in negative_words:
                    num_embedding_neg += 1

                if w[0] in profanity:
                    num_profanity += 1
        except:
            pass

    difference = num_positive - num_negative
    embed_difference = num_embedding_pos - num_embedding_neg

    return [difference, num_profanity, embed_difference]


def featurize100(data, word_embeddings):
    """
    we use this format to make implementation of this class more straightforward and to be
    consistent with what you see in nltk
    Parameters:
      data - str like "I loved the hotel"
    Return: a list of tuples linking features to values
    for BoW, a list of tuples linking every word to True [("I", True), ("loved", True), ("it", True)]
    """
    num_positive = 0
    num_negative = 0
    num_profanity = 0

    num_embedding_pos = 0
    num_embedding_neg = 0

    words = data

    # Number of positive lexicon
    for word in words:
        if word in positive_words:
            num_positive += 1
        if word in negative_words:
            num_negative += 1
        if word in profanity:
            num_profanity += num_profanity
        
        try:
            most_similar = word_embeddings.most_similar(positive=[word])[:5]

            for w in most_similar:
                if w[0] in positive_words:
                    num_embedding_pos += 1
                elif w[0] in negative_words:
                    num_embedding_neg += 1

                if w[0] in profanity:
                    num_profanity += 1
        except:
            pass

    difference = num_positive - num_negative
    embed_difference = num_embedding_pos - num_embedding_neg

    return [difference, num_profanity, embed_difference]


def featurize200(data, word_embeddings):
    """
    we use this format to make implementation of this class more straightforward and to be
    consistent with what you see in nltk
    Parameters:
      data - str like "I loved the hotel"
    Return: a list of tuples linking features to values
    for BoW, a list of tuples linking every word to True [("I", True), ("loved", True), ("it", True)]
    """
    num_positive = 0
    num_negative = 0
    num_profanity = 0

    num_embedding_pos = 0
    num_embedding_neg = 0

    words = data

    # Number of positive lexicon
    for word in words:
        if word in positive_words:
            num_positive += 1
        if word in negative_words:
            num_negative += 1
        if word in profanity:
            num_profanity += num_profanity
        
        try:
            most_similar = word_embeddings.most_similar(positive=[word])[:5]

            for w in most_similar:
                if w[0] in positive_words:
                    num_embedding_pos += 1
                elif w[0] in negative_words:
                    num_embedding_neg += 1

                if w[0] in profanity:
                    num_profanity += 1
        except:
            pass

    difference = num_positive - num_negative
    embed_difference = num_embedding_pos - num_embedding_neg

    return [difference, num_profanity, embed_difference]

fixed_train = []
fixed_test = []

fixed_train_label = []
fixed_test_label = []

word_start = '<s>'
word_end = '</s>'

token_list_train = list(tokenized_train['tokens'])
token_list_train_label = list(tokenized_train['label'])

token_list_test = list(tokenized_test['tokens'])
token_list_test_label = list(tokenized_test['label'])

for index in range(len(token_list_train)):
    fixed_sentence = [word_start]
    sentence = token_list_train[index]
    
    for s in sentence[1: -1].split(' '):
        fixed_sentence.append(s[1: -2])
    
    fixed_sentence.append(word_end)
    fixed_train.append(fixed_sentence)

for index in range(len(token_list_test)):
    fixed_sentence = [word_start]
    sentence = token_list_test[index]
    
    for s in sentence[1: -1].split(' '):
        fixed_sentence.append(s[1: -2])
    
    fixed_sentence.append(word_end)
    fixed_test.append(fixed_sentence)

In [3]:
# Word Embedding 25
# vocabulary = fixed_train + fixed_test

# word_embeddings25 = None

# if os.path.isfile('word2vec25.txt'):
#     word_embeddings25 = keyedvectors.KeyedVectors.load_word2vec_format('word2vec25.txt', binary=False)
# else:
#     word_embeddings25 = Word2Vec(vocabulary, sg=1, window=5, vector_size=25, min_count=1)
#     word_embeddings25.wv.save_word2vec_format('word2vec25.txt', binary=False)

# train_features25 = []

# count = 0
# for i in fixed_train:
#     train_features25.append(featurize25(i, word_embeddings25))
    
#     if count % 100 == 0:
#         print(count)
#     count += 1
    
# test_features25 = []

# count = 0
# for i in fixed_test:
#     test_features25.append(featurize25(i, word_embeddings25))
    
#     if count % 100 == 0:
#         print(count)
#     count += 1
    
    
# with open('./train_features25.pkl', 'wb') as f:
#     pickle.dump(train_features25, f)
#     f.close()
    
# with open('./test_features25.pkl', 'wb') as f:
#     pickle.dump(test_features25, f)
#     f.close()

In [4]:
# # Word Embedding 50
# vocabulary = fixed_train + fixed_test

# word_embeddings50 = None

# if os.path.isfile('word2vec50.txt'):
#     word_embeddings50 = keyedvectors.KeyedVectors.load_word2vec_format('word2vec50.txt', binary=False)
# else:
#     word_embeddings50 = Word2Vec(vocabulary, sg=1, window=5, vector_size=50, min_count=1)
#     word_embeddings50.wv.save_word2vec_format('word2vec50.txt', binary=False)

# train_features50 = []

# count = 0
# for i in fixed_train:
#     train_features50.append(featurize50(i, word_embeddings50))
    
#     if count % 100 == 0:
#         print(count)
#     count += 1
    
# test_features50 = []

# count = 0
# for i in fixed_test:
#     test_features50.append(featurize50(i, word_embeddings50))
    
#     if count % 100 == 0:
#         print(count)
#     count += 1

# with open('./train_features50.pkl', 'wb') as f:
#     pickle.dump(train_features50, f)
#     f.close()
    
# with open('./test_features50.pkl', 'wb') as f:
#     pickle.dump(test_features50, f)
#     f.close()

In [5]:
# # Word Embedding 100
# vocabulary = fixed_train + fixed_test

# word_embeddings100 = None

# if os.path.isfile('word2vec100.txt'):
#     word_embeddings100 = keyedvectors.KeyedVectors.load_word2vec_format('word2vec100.txt', binary=False)
# else:
#     word_embeddings100 = Word2Vec(vocabulary, sg=1, window=5, vector_size=100, min_count=1)
#     word_embeddings100.wv.save_word2vec_format('word2vec100.txt', binary=False)

# train_features100 = []

# count = 0
# for i in fixed_train:
#     train_features100.append(featurize100(i, word_embeddings100))
    
#     if count % 100 == 0:
#         print(count)
#     count += 1
    
# test_features100 = []

# count = 0
# for i in fixed_test:
#     test_features100.append(featurize100(i, word_embeddings100))
    
#     if count % 100 == 0:
#         print(count)
#     count += 1
    
# with open('./train_features100.pkl', 'wb') as f:
#     pickle.dump(train_features100, f)
#     f.close()
    
# with open('./test_features100.pkl', 'wb') as f:
#     pickle.dump(test_features100, f)
#     f.close()

In [6]:
# # Word Embedding 200
# vocabulary = fixed_train + fixed_test

# word_embeddings200 = None

# if os.path.isfile('word2vec200.txt'):
#     word_embeddings200 = keyedvectors.KeyedVectors.load_word2vec_format('word2vec200.txt', binary=False)
# else:
#     word_embeddings200 = Word2Vec(vocabulary, sg=1, window=5, vector_size=200, min_count=1)
#     word_embeddings200.wv.save_word2vec_format('word2vec200.txt', binary=False)

# train_features200 = []

# count = 0
# for i in fixed_train:
#     train_features200.append(featurize200(i, word_embeddings200))
    
#     if count % 100 == 0:
#         print(count)
#     count += 1
    
# test_features200 = []

# count = 0
# for i in fixed_test:
#     test_features200.append(featurize200(i, word_embeddings200))
    
#     if count % 100 == 0:
#         print(count)
#     count += 1
    
# with open('train_features200.pkl', 'wb') as f:
#     pickle.dump(train_features200, f)
#     f.close()
    
# with open('test_features200.pkl', 'wb') as f:
#     pickle.dump(test_features200, f)
#     f.close()

In [7]:
# # GenSim Twitter 

# def featurize200_twitter(data, word_embeddings):
#     """
#     we use this format to make implementation of this class more straightforward and to be
#     consistent with what you see in nltk
#     Parameters:
#       data - str like "I loved the hotel"
#     Return: a list of tuples linking features to values
#     for BoW, a list of tuples linking every word to True [("I", True), ("loved", True), ("it", True)]
#     """
#     num_positive = 0
#     num_negative = 0
#     num_profanity = 0

#     num_embedding_pos = 0
#     num_embedding_neg = 0

#     words = data

#     # Number of positive lexicon
#     for word in words:
#         if word in positive_words:
#             num_positive += 1
            
#             try:
#                 most_similar = word_embeddings.most_similar(positive=[word])[:5]

#                 for w in most_similar:
#                     if w[0] in positive_words:
#                         num_embedding_pos += 1
#                     elif w[0] in negative_words:
#                         num_embedding_neg += 1

#                     if w[0] in profanity:
#                         num_profanity += 1
#             except:
#                 pass
#         if word in negative_words:
#             num_negative += 1
            
#             try:
#                 most_similar = word_embeddings.most_similar(positive=[word])[:5]

#                 for w in most_similar:
#                     if w[0] in positive_words:
#                         num_embedding_pos += 1
#                     elif w[0] in negative_words:
#                         num_embedding_neg += 1

#                     if w[0] in profanity:
#                         num_profanity += 1
#             except:
#                 pass
#         if word in profanity:
#             num_profanity += num_profanity
        
#             try:
#                 most_similar = word_embeddings.most_similar(positive=[word])[:5]

#                 for w in most_similar:
#                     if w[0] in positive_words:
#                         num_embedding_pos += 1
#                     elif w[0] in negative_words:
#                         num_embedding_neg += 1

#                     if w[0] in profanity:
#                         num_profanity += 1
#             except:
#                 pass

#     difference = num_positive - num_negative
#     embed_difference = num_embedding_pos - num_embedding_neg

#     return [difference, num_profanity, embed_difference]

# vocabulary = fixed_train + fixed_test

# word_embeddings_twitter200 = api.load('glove-twitter-200')
# train_features_twitter200 = []

# count = 0
# for i in fixed_train:
#     train_features_twitter200.append(featurize200_twitter(i, word_embeddings_twitter200))
    
#     if count % 100 == 0:
#         print(count)
#     count += 1
    
# test_features_twitter200 = []

# count = 0
# for i in fixed_test:
#     test_features_twitter200.append(featurize200_twitter(i, word_embeddings_twitter200))
    
#     if count % 100 == 0:
#         print(count)
#     count += 1
    
# with open('./train_features_twitter200.pkl', 'wb') as f:
#     pickle.dump(train_features_twitter200, f)
#     f.close()
    
# with open('./test_features_twitter200.pkl', 'wb') as f:
#     pickle.dump(test_features_twitter200, f)
#     f.close()


In [8]:
# import gensim.downloader as api

# def featurize100_twitter(data, word_embeddings):
#     """
#     we use this format to make implementation of this class more straightforward and to be
#     consistent with what you see in nltk
#     Parameters:
#       data - str like "I loved the hotel"
#     Return: a list of tuples linking features to values
#     for BoW, a list of tuples linking every word to True [("I", True), ("loved", True), ("it", True)]
#     """
#     num_positive = 0
#     num_negative = 0
#     num_profanity = 0

#     num_embedding_pos = 0
#     num_embedding_neg = 0

#     words = data

#     # Number of positive lexicon
#     for word in words:
#         if word in positive_words:
#             num_positive += 1
            
#             try:
#                 most_similar = word_embeddings.most_similar(positive=[word])[:5]

#                 for w in most_similar:
#                     if w[0] in positive_words:
#                         num_embedding_pos += 1
#                     elif w[0] in negative_words:
#                         num_embedding_neg += 1

#                     if w[0] in profanity:
#                         num_profanity += 1
#             except:
#                 pass
#         if word in negative_words:
#             num_negative += 1
            
#             try:
#                 most_similar = word_embeddings.most_similar(positive=[word])[:5]

#                 for w in most_similar:
#                     if w[0] in positive_words:
#                         num_embedding_pos += 1
#                     elif w[0] in negative_words:
#                         num_embedding_neg += 1

#                     if w[0] in profanity:
#                         num_profanity += 1
#             except:
#                 pass
#         if word in profanity:
#             num_profanity += num_profanity
        
#             try:
#                 most_similar = word_embeddings.most_similar(positive=[word])[:5]

#                 for w in most_similar:
#                     if w[0] in positive_words:
#                         num_embedding_pos += 1
#                     elif w[0] in negative_words:
#                         num_embedding_neg += 1

#                     if w[0] in profanity:
#                         num_profanity += 1
#             except:
#                 pass

#     difference = num_positive - num_negative
#     embed_difference = num_embedding_pos - num_embedding_neg

#     return [difference, num_profanity, embed_difference]

# vocabulary = fixed_train + fixed_test

# word_embeddings_twitter100 = api.load('glove-twitter-100')
# train_features_twitter100 = []

# count = 0
# for i in fixed_train:
#     train_features_twitter100.append(featurize100_twitter(i, word_embeddings_twitter100))
    
#     if count % 100 == 0:
#         print(count)
#     count += 1
    
# test_features_twitter100 = []

# count = 0
# for i in fixed_test:
#     test_features_twitter100.append(featurize100_twitter(i, word_embeddings_twitter100))
    
#     if count % 100 == 0:
#         print(count)
#     count += 1
    
# with open('./train_features_twitter100.pkl', 'wb') as f:
#     pickle.dump(train_features_twitter100, f)
#     f.close()
    
# with open('./test_features_twitter100.pkl', 'wb') as f:
#     pickle.dump(test_features_twitter100, f)
#     f.close()

In [9]:
# Loading the Features:

with open('train_features25.pkl', 'rb') as f:
    train_features_25 = pickle.load(f)
with open('test_features25.pkl', 'rb') as f:
    test_features_25 = pickle.load(f)


with open('train_features50.pkl', 'rb') as f:
    train_features_50 = pickle.load(f)
with open('test_features50.pkl', 'rb') as f:
    test_features_50 = pickle.load(f)
    
with open('train_features100.pkl', 'rb') as f:
    train_features_100 = pickle.load(f)
with open('test_features100.pkl', 'rb') as f:
    test_features_100 = pickle.load(f)
    
with open('train_features200.pkl', 'rb') as f:
    train_features_200 = pickle.load(f)
with open('test_features200.pkl', 'rb') as f:
    test_features_200 = pickle.load(f)
    
with open('train_features_twitter200.pkl', 'rb') as f:
    train_features_twitter200 = pickle.load(f)
with open('test_features_twitter200.pkl', 'rb') as f:
    test_features_twitter200 = pickle.load(f)
    
with open('train_features_twitter100.pkl', 'rb') as f:
    train_features_twitter100 = pickle.load(f)
with open('test_features_twitter100.pkl', 'rb') as f:
    test_features_twitter100 = pickle.load(f)

In [10]:
# Making the model

model25 = LogisticRegression()
model50 = LogisticRegression()
model100 = LogisticRegression()
model200 = LogisticRegression()
model_twitter200 = LogisticRegression()
model_twitter100 = LogisticRegression()

# Training the models

model25.fit(train_features_25, token_list_train_label)
model50.fit(train_features_50, token_list_train_label)
model100.fit(train_features_100, token_list_train_label)
model200.fit(train_features_200, token_list_train_label)
model_twitter200.fit(train_features_twitter200, token_list_train_label)
model_twitter100.fit(train_features_twitter100, token_list_train_label)

# Getting the predictions
model25_predictions = model25.predict(test_features_25)
model50_predictions = model50.predict(test_features_50)
model100_predictions = model100.predict(test_features_100)
model200_predictions = model200.predict(test_features_200)
model_twitter_200_predictions = model_twitter200.predict(test_features_twitter200)
model_twitter_100_predictions = model_twitter100.predict(test_features_twitter100)

# Getting the score
model25_score = model25.score(test_features_25, token_list_test_label)
model50_score = model50.score(test_features_50, token_list_test_label)
model100_score = model100.score(test_features_100, token_list_test_label)
model200_score = model200.score(test_features_200, token_list_test_label)
model_twitter_200_test_score = model_twitter200.score(test_features_twitter200, token_list_test_label)
model_twitter_100_test_score = model_twitter100.score(test_features_twitter100, token_list_test_label)

model25_train_score = model25.score(train_features_25, token_list_train_label)
model50_train_score = model50.score(train_features_50, token_list_train_label)
model100_train_score = model100.score(train_features_100, token_list_train_label)
model200_train_score = model200.score(train_features_200, token_list_train_label)
model_twitter_200_train_score = model_twitter200.score(train_features_twitter200, token_list_train_label)
model_twitter_100_train_score = model_twitter100.score(train_features_twitter100, token_list_train_label)

print("Logistic Regression Model Trained on 25 embedding size train score is: ", model25_train_score)
print("Logistic Regression Model Trained on 50 embedding size train score is: ", model50_train_score)
print("Logistic Regression Model Trained on 100 embedding size train score is: ", model100_train_score)
print("Logistic Regression Model Trained on 200 embedding size train score is: ", model200_train_score)
print("Logistic Regression Model Trained on 100 twitter embedding size train score is: ", model_twitter_100_train_score)
print("Logistic Regression Model Trained on 200 twitter embedding size train score is: ", model_twitter_200_train_score, '\n')

print("Logistic Regression Model Trained on 25 embedding size test score is: ", model25_score)
print("Logistic Regression Model Trained on 50 embedding size test score is: ", model50_score)
print("Logistic Regression Model Trained on 100 embedding size test score is: ", model100_score)
print("Logistic Regression Model Trained on 200 embedding size test score is: ", model200_score)
print("Logistic Regression Model Trained on 100 twitter embedding size test score is: ", model_twitter_100_test_score)
print("Logistic Regression Model Trained on 200 twitter embedding size test score is: ", model_twitter_200_test_score)

Logistic Regression Model Trained on 25 embedding size train score is:  0.5361
Logistic Regression Model Trained on 50 embedding size train score is:  0.54565
Logistic Regression Model Trained on 100 embedding size train score is:  0.567925
Logistic Regression Model Trained on 200 embedding size train score is:  0.547975
Logistic Regression Model Trained on 100 twitter embedding size train score is:  0.45395
Logistic Regression Model Trained on 200 twitter embedding size train score is:  0.462775 

Logistic Regression Model Trained on 25 embedding size test score is:  0.5395
Logistic Regression Model Trained on 50 embedding size test score is:  0.5486
Logistic Regression Model Trained on 100 embedding size test score is:  0.5682
Logistic Regression Model Trained on 200 embedding size test score is:  0.5496
Logistic Regression Model Trained on 100 twitter embedding size test score is:  0.4547
Logistic Regression Model Trained on 200 twitter embedding size test score is:  0.4675


In [14]:
precision_25 = 0
total_25 = 0

precision_50 = 0
total_50 = 0

precision_100 = 0
total_100 = 0

precision_200 = 0
total_200 = 0

precision_twitter_100 = 0
total_twitter_100 = 0

precision_twitter_200 = 0
total_twitter_200 = 0

for i in range(len(model25_predictions)):
    correct = token_list_test_label[i]
    
    if correct == 1 and model25_predictions[i] == correct:
        precision_25 += 1
        total_25 += 1
    elif correct == 1:
        total_25 += 1
        
    if correct == 1 and model50_predictions[i] == correct:
        precision_50 += 1
        total_50 += 1
    elif correct == 1:
        total_50 += 1
        
    if correct == 1 and model100_predictions[i] == correct:
        precision_100 += 1
        total_100 += 1
    elif correct == 1:
        total_100 += 1
        
    if correct == 1 and model200_predictions[i] == correct:
        precision_200 += 1
        total_200 += 1
    elif correct == 1:
        total_200 += 1
        
    if correct == 1 and model_twitter_100_predictions[i] == correct:
        precision_twitter_100 += 1
        total_twitter_100 += 1
    elif correct == 1:
        total_twitter_100 += 1
        
    if correct == 1 and model_twitter_200_predictions[i] == correct:
        precision_twitter_200 += 1
        total_twitter_200 += 1
    elif correct == 1:
        total_twitter_200 += 1
        
print("Positive precision for Model of embedding size 25: ", precision_25 / total_25)
print("Positive precision for Model of embedding size 50: ", precision_50 / total_50)
print("Positive precision for Model of embedding size 100: ", precision_100 / total_100)
print("Positive precision for Model of embedding size 200: ", precision_200 / total_200)
print("Positive precision for Model of twitter embedding size 100: ", precision_twitter_100 / total_twitter_100)
print("Positive precision for Model of twitter embedding size 200: ", precision_twitter_200 / total_twitter_200)

Positive precision for Model of embedding size 25:  0.6904536862003781
Positive precision for Model of embedding size 50:  0.6902173913043478
Positive precision for Model of embedding size 100:  0.6143667296786389
Positive precision for Model of embedding size 200:  0.6930529300567108
Positive precision for Model of twitter embedding size 100:  0.7814272211720227
Positive precision for Model of twitter embedding size 200:  0.8213610586011342


In [16]:
precision_25 = 0
total_25 = 0

precision_50 = 0
total_50 = 0

precision_100 = 0
total_100 = 0

precision_200 = 0
total_200 = 0

precision_twitter_100 = 0
total_twitter_100 = 0

precision_twitter_200 = 0
total_twitter_200 = 0

for i in range(len(model25_predictions)):
    correct = token_list_test_label[i]
    
    if correct == -1 and model25_predictions[i] == correct:
        precision_25 += 1
        total_25 += 1
    elif correct == -1:
        total_25 += 1
        
    if correct == -1 and model50_predictions[i] == correct:
        precision_50 += 1
        total_50 += 1
    elif correct == -1:
        total_50 += 1
        
    if correct == -1 and model100_predictions[i] == correct:
        precision_100 += 1
        total_100 += 1
    elif correct == -1:
        total_100 += 1
        
    if correct == -1 and model200_predictions[i] == correct:
        precision_200 += 1
        total_200 += 1
    elif correct == -1:
        total_200 += 1
        
    if correct == -1 and model_twitter_100_predictions[i] == correct:
        precision_twitter_100 += 1
        total_twitter_100 += 1
    elif correct == -1:
        total_twitter_100 += 1
        
    if correct == -1 and model_twitter_200_predictions[i] == correct:
        precision_twitter_200 += 1
        total_twitter_200 += 1
    elif correct == -1:
        total_twitter_200 += 1
        
print("Negative precision for Model of embedding size 25: ", precision_25 / total_25)
print("Negative precision for Model of embedding size 50: ", precision_50 / total_50)
print("Negative precision for Model of embedding size 100: ", precision_100 / total_100)
print("Negative precision for Model of embedding size 200: ", precision_200 / total_200)
print("Negative precision for Model of twitter embedding size 100: ", precision_twitter_100 / total_twitter_100)
print("Negative precision for Model of twitter embedding size 200: ", precision_twitter_200 / total_twitter_200)

Negative precision for Model of embedding size 25:  0.26646706586826346
Negative precision for Model of embedding size 50:  0.26390076988879385
Negative precision for Model of embedding size 100:  0.24807527801539778
Negative precision for Model of embedding size 200:  0.2536355859709153
Negative precision for Model of twitter embedding size 100:  0.26646706586826346
Negative precision for Model of twitter embedding size 200:  0.2797262617621899


In [17]:
precision_25 = 0
total_25 = 0

precision_50 = 0
total_50 = 0

precision_100 = 0
total_100 = 0

precision_200 = 0
total_200 = 0

precision_twitter_100 = 0
total_twitter_100 = 0

precision_twitter_200 = 0
total_twitter_200 = 0

for i in range(len(model25_predictions)):
    correct = token_list_test_label[i]
    
    if correct == 0 and model25_predictions[i] == correct:
        precision_25 += 1
        total_25 += 1
    elif correct == 0:
        total_25 += 1
        
    if correct == 0 and model50_predictions[i] == correct:
        precision_50 += 1
        total_50 += 1
    elif correct == 0:
        total_50 += 1
        
    if correct == 0 and model100_predictions[i] == correct:
        precision_100 += 1
        total_100 += 1
    elif correct == 0:
        total_100 += 1
        
    if correct == 0 and model200_predictions[i] == correct:
        precision_200 += 1
        total_200 += 1
    elif correct == 0:
        total_200 += 1
        
    if correct == 0 and model_twitter_100_predictions[i] == correct:
        precision_twitter_100 += 1
        total_twitter_100 += 1
    elif correct == 0:
        total_twitter_100 += 1
        
    if correct == 0 and model_twitter_200_predictions[i] == correct:
        precision_twitter_200 += 1
        total_twitter_200 += 1
    elif correct == 0:
        total_twitter_200 += 1
        
print("Neutral precision for Model of embedding size 25: ", precision_25 / total_25)
print("Neutral precision for Model of embedding size 50: ", precision_50 / total_50)
print("Neutral precision for Model of embedding size 100: ", precision_100 / total_100)
print("Neutral precision for Model of embedding size 200: ", precision_200 / total_200)
print("Neutral precision for Model of twitter embedding size 100: ", precision_twitter_100 / total_twitter_100)
print("Neutral precision for Model of twitter embedding size 200: ", precision_twitter_200 / total_twitter_200)

Neutral precision for Model of embedding size 25:  0.5393586005830904
Neutral precision for Model of embedding size 50:  0.567930029154519
Neutral precision for Model of embedding size 100:  0.7294460641399417
Neutral precision for Model of embedding size 200:  0.5743440233236151
Neutral precision for Model of twitter embedding size 100:  0.17988338192419825
Neutral precision for Model of twitter embedding size 200:  0.1588921282798834
