In [66]:
def precision(gold_labels, predicted_labels):
    """
    Calculates the precision for a set of predicted labels give the gold (ground truth) labels.
    Parameters:
        gold_labels (list): a list of labels assigned by hand ("truth")
        predicted_labels (list): a corresponding list of labels predicted by the system
    Returns: double precision (a number from 0 to 1)
    """
    true_pos_count = 0
    true_false_pos_count = 0

    for i in range(len(gold_labels)):
        if gold_labels[i] == '1':
            if predicted_labels[i] == '1':
                true_pos_count += 1
                true_false_pos_count += 1
        else:
            if predicted_labels[i] == '1':
                true_false_pos_count += 1

    return true_pos_count / true_false_pos_count


def recall(gold_labels, predicted_labels):
    """
    Calculates the recall for a set of predicted labels give the gold (ground truth) labels.
    Parameters:
        gold_labels (list): a list of labels assigned by hand ("truth")
        predicted_labels (list): a corresponding list of labels predicted by the system
    Returns: double recall (a number from 0 to 1)
    """
    true_pos_count = 0
    true_pos_false_neg_count = 0

    for i in range(len(gold_labels)):
        if gold_labels[i] == '1':
            if predicted_labels[i] == '1':
                true_pos_count += 1
            true_pos_false_neg_count += 1

    return true_pos_count / true_pos_false_neg_count


def f1(gold_labels, predicted_labels):
    """
    Calculates the f1 for a set of predicted labels give the gold (ground truth) labels.
    Parameters:
        gold_labels (list): a list of labels assigned by hand ("truth")
        predicted_labels (list): a corresponding list of labels predicted by the system
    Returns: double f1 (a number from 0 to 1)
    """
    p = precision(gold_labels, predicted_labels)
    r = recall(gold_labels, predicted_labels)

    denominator = p + r
    numerator = 2 * p * r

    if denominator == 0 or numerator == 0:
        return 0

    return numerator / denominator


"""
Implement any other non-required functions here
"""


def get_from_tuple(tuples, string):
    """
    given a list of tuples, find the tuple whose first element matches the string
    Args:
        tuples: a list of tuples [(string, value)...]
        string: the string matcher

    Returns: a tuple if found, an empty list if not

    """
    for t in tuples:
        if t[0] == string:
            return t

    return []


def read_text(filepath, dictionary=None):
    """
    gets the text and add it to the dictionary
    Args:
        filepath: the filepath of the text file
        dictionary: the dictionary storage for True to get quick lookup

    Returns: the dictionary

    """
    if dictionary is None:
        dictionary = {}

    f = open(filepath, "r", encoding='utf8')
    for line in f:
        line = line.strip()
        if line not in dictionary.keys():
            dictionary[line] = True

    f.close()

    return dictionary


def get_labels(examples):
    """
    Returns a list of the labels from the examples
    Args:
        examples: a list of tuples [(id, text, label)...]

    Returns: list of labels [label, label, ...]
    """
    final_list = []
    for example in examples:
        final_list.append(example[2])

    return final_list


def lost_function_for_each_weight(sigmoid, result, weight):
    """
    Returns the lost value for each weight
    Args:
        sigmoid: the sigmoid value
        weight: our current weight value
        result: whether it was 1 or 0
    Returns: the lost value
    """

    difference = sigmoid - result

    return difference * weight


In [86]:
from gensim.models import Word2Vec, keyedvectors
import random
import os.path
import numpy as np
import pandas as pd

# CSV FILE SOURCE FOR BAD WORDS
# https://github.com/surge-ai/profanity/blob/main/profanity_en.csv

class LogisticRegressionModel:

    def __init__(self):
        # Features - Count of Positive Lexicon,
        #            Count of Negative Lexicon,
        #            If exclamation mark "!" is in the doc (1 if in, 0 otherwise)
        #            Difference of positive lexicon and negative lexicon
        #            If the words "another", "typical" is in the doc (1 if in, 0 otherwise)

        self.positive_words = read_text("positive-words.txt")
        self.negative_words = read_text("negative-words.txt")
        
        self.profanity = list(pd.read_csv('profanity_en.csv')['text'])

        # our features weights
        self.weights = {}
        # Learning step
        self.step = 0.1
        # bias
        self.bias = 0.1
        
        self.word_embeddings = None
        
        self.weights['profanity'] = (random.random() * 3 + 1) * random.choice([1, -1])
        self.weights['difference'] = (random.random() * 3 + 1) * random.choice([1, -1])
        self.weights['bias'] = self.bias
        self.weights['difference embeddings'] = (random.random() * 3 + 1) * random.choice([1, -1])
        
        
       
    def train(self, examples):
        """
        Trains the classifier based on the given examples
        Parameters:
          examples - a list of tuples of strings formatted [(id, example_text, label), (id, example_text, label)....]
        Return: None
        """
        if os.path.isfile('word2vec.txt'):
            self.word_embeddings = keyedvectors.KeyedVectors.load_word2vec_format('word2vec.txt', binary=False)
        else:
            self.word_embeddings = Word2Vec(examples, sg=1, window=5, vector_size=200, min_count=1)
            self.word_embeddings.wv.save_word2vec_format('word2vec.txt', binary=False)
        
        # We will be doing Stochastic Gradient Descent on each individual example
        for example in examples:
            # Get the features of each example
            features = self.featurize(example[0])
            label = example[1]

            self.gradient_descent(features, label, iterations=1000)

    def gradient_descent(self, features, label, iterations=20):
        """
        Performs the gradient descent on each feature
        Args:
            features: the features that our data has
            label: "1" or "0"
            iterations: the number of iterations we will run, default 20

        Returns: None
        """

        for i in range(iterations):
            sigmoid_result = self.sigmoid(features)
            y = int(label)

            # Calculating loss for each weight:

            for weight in self.weights.keys():
                loss_value = lost_function_for_each_weight(sigmoid_result, y, get_from_tuple(features, weight)[1])
                change = self.step * loss_value

                self.weights[weight] = self.weights[weight] - change

    def sigmoid(self, features):
        """
        Gets the sigmoid, or P(y=1 | x)
        Args:
            features: the list of tuples that corresponds to the features

        Returns: the sigmoid

        """
        dot_product = 0

        for i in self.weights.keys():
            dot_product += self.weights[i] * get_from_tuple(features, i)[1]
            
        z = dot_product + self.weights['bias']
        
        print(z)
        
        denominator = 1 + np.power(np.e, -1 * z)

        return 1 / denominator

    def score(self, data):
        """
        Score a given piece of text
        you’ll compute e ^ (log(p(c)) + sum(log(p(w_i | c))) here

        Parameters:
          data - str like "I loved the hotel"
        Return: dict of class: score mappings
        return a dictionary of the values of P(data | c)  for each class,
        as in section 4.3 of the textbook e.g. {"0": 0.000061, "1": 0.000032}
        """

        features = self.featurize(data)
        sigmoid = self.sigmoid(features)

        return {"1": sigmoid, "0": 1 - sigmoid}

    def classify(self, data):
        """
        Label a given piece of text
        Parameters:
          data - str like "I loved the hotel"
        Return: string class label
        """
        scores = self.score(data)

        max_score = 0
        classified = ""

        for each_class in scores.keys():
            if scores[each_class] > max_score or classified == "":
                max_score = scores[each_class]
                classified = each_class

        return str(classified)

    def featurize(self, data):
        """
        we use this format to make implementation of this class more straightforward and to be
        consistent with what you see in nltk
        Parameters:
          data - str like "I loved the hotel"
        Return: a list of tuples linking features to values
        for BoW, a list of tuples linking every word to True [("I", True), ("loved", True), ("it", True)]
        """
        num_positive = 0
        num_negative = 0
        num_profanity = 0
        
        num_embedding_pos = 0
        num_embedding_neg = 0

        words = data

        # Number of positive lexicon
        for word in words:
            if word in self.positive_words:
                num_positive += 1
            if word in self.negative_words:
                num_negative += 1
            if word in self.profanity:
                num_profanity += num_profanity
                
            most_similar = self.word_embeddings.most_similar(positive=[word])
            
            for w in most_similar:
                if w[0] in self.positive_words:
                    num_embedding_pos += 1
                elif w[0] in self.negative_words:
                    num_embedding_neg += 1
                
                if w[0] in self.profanity:
                    num_profanity += 1

        difference = num_positive - num_negative
        embed_difference = num_embedding_pos - num_embedding_neg
        
        return [
            # ("num_positive", num_positive),
            # ('num_negative', num_negative),
            ('difference', difference),
            ('profanity', num_profanity),
            ('difference embeddings', embed_difference),
            ('bias', 1)
        ]

    def __str__(self):
        return "Logistic Regression - Stopwords Removed - 5 features"

    def describe_experiments(self):
        s = """
                My improved Training model is a logistic regression model that learns using stochastic gradient descent.
                I experimented with 5 different features, and also normalized the training and testing data by removing 
                stop words.
                
                I initially started with Positive lexicons, negative lexicons, difference of the two, whether an 
                exclamation mark is in it, and whether it has the words "another, typical" in them. 
                
                Sometimes, because of the randomized weights and training data, I will achieve a lower than desirable 
                value for our scores. This is because of the low number of iterations that I have, but this can be 
                solved by increasing the number of iterations that we do.
                
                # Experiment 1: Only count Positive and negative lexicons as features
                Precision: 0.7313
                Recall: 0.467
                F1: 0.570
                
                # Experiment 2: Only count difference as feature
                Precision: 0.554
                Recall: 0.933
                F1: 0.695
                
                # Experiment 3: Only count difference, exclamation, typical/difference as feature
                Precision: 0.551
                Recall: 0.933
                F1: 0.693
                
                # Experiment 4: Leave out typical/another
                Precision: 0.563
                Recall: 0.857
                F1: 0.679
                
                # Experiment 5: Leave out exclamation
                Precision: 0.560
                Recall: 0.848
                F1: 0.674
                
                # Experiment 6: Leave out difference
                Precision: 0.703
                Recall: 0.610
                F1: 0.653
                
                # Experiment 7: Include stop words
                Precision: 0.551
                Recall: 0.933
                F1: 0.693
                
            We are running the experiments on the training data without shuffling so that it will be the same. We 
            conclude that only counting the number of positive and negative lexicon is not as accurate as counting the 
            difference. We are also getting low precision but high recall, meaning that we are getting a lot of false
            positives. 
            """
        return s

In [104]:
# Getting the word embeddings

import pandas as pd
from sklearn.model_selection import train_test_split

pre_split = pd.read_csv('tokenized_df.csv')

tokenized_train, tokenized_test = train_test_split(pre_split[:50000], test_size=0.2, random_state=44)

positive_words = read_text("positive-words.txt")
negative_words = read_text("negative-words.txt")
profanity = list(pd.read_csv('profanity_en.csv')['text'])

word_embeddings = None



def featurize(data):
    """
    we use this format to make implementation of this class more straightforward and to be
    consistent with what you see in nltk
    Parameters:
      data - str like "I loved the hotel"
    Return: a list of tuples linking features to values
    for BoW, a list of tuples linking every word to True [("I", True), ("loved", True), ("it", True)]
    """
    num_positive = 0
    num_negative = 0
    num_profanity = 0

    num_embedding_pos = 0
    num_embedding_neg = 0

    words = data

    # Number of positive lexicon
    for word in words:
        if word in positive_words:
            num_positive += 1
        if word in negative_words:
            num_negative += 1
        if word in profanity:
            num_profanity += num_profanity
        
        try:
            most_similar = word_embeddings.most_similar(positive=[word])[:5]

            for w in most_similar:
                if w[0] in positive_words:
                    num_embedding_pos += 1
                elif w[0] in negative_words:
                    num_embedding_neg += 1

                if w[0] in profanity:
                    num_profanity += 1
        except:
            pass

    difference = num_positive - num_negative
    embed_difference = num_embedding_pos - num_embedding_neg

    return [difference, num_profanity, embed_difference]

fixed_train = []
fixed_test = []

fixed_train_label = []
fixed_test_label = []

word_start = '<s>'
word_end = '</s>'

token_list_train = list(tokenized_train['tokens'])
token_list_train_label = list(tokenized_train['label'])

token_list_test = list(tokenized_test['tokens'])
token_list_test_label = list(tokenized_test['label'])

for index in range(len(token_list_train)):
    fixed_sentence = [word_start]
    sentence = token_list_train[index]
    
    for s in sentence[1: -1].split(' '):
        fixed_sentence.append(s[1: -2])
    
    fixed_sentence.append(word_end)
    fixed_train.append(fixed_sentence)

for index in range(len(token_list_test)):
    fixed_sentence = [word_start]
    sentence = token_list_test[index]
    
    for s in sentence[1: -1].split(' '):
        fixed_sentence.append(s[1: -2])
    
    fixed_sentence.append(word_end)
    fixed_test.append(fixed_sentence)

if os.path.isfile('word2vec.txt'):
    word_embeddings = keyedvectors.KeyedVectors.load_word2vec_format('word2vec.txt', binary=False)
else:
    word_embeddings = Word2Vec(fixed_train, sg=1, window=5, vector_size=200, min_count=1)
    word_embeddings.wv.save_word2vec_format('word2vec.txt', binary=False)
    
# Create List of features [[features...], ...]
# labels = [y1, y2, ...]

train_features = []

count = 0
for i in fixed_train:
    train_features.append(featurize(i))
    
    if count % 100 == 0:
        print('100 checkpoint')
    count += 1
    
test_features = []

for i in fixed_test:
    test_features.append(featurize(i))
    

100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkpoint
100 checkp

In [110]:
from sklearn.linear_model import LogisticRegression
import pickle


# with open('test_features.pkl', 'wb') as f:
#     pickle.dump(test_features, f)
#     f.close()

model = LogisticRegression()
model.fit(train_features, token_list_train_label)

predictions = model.predict(test_features)
print(predictions)

model.score(test_features, token_list_test_label)

[ 0 -1  0 ...  1 -1  0]


0.5419