# Vanilla Bigram Model

#### by Jesse Elliot & Alina Gonzalez

We want to explore how the different neural networks, across the different emotions, compare to a simple vanilla bigram model used as a baseline.

In [10]:
from collections import Counter
import numpy as np
import math
import random
import time
from utils import LanguageModel
import pandas as pd
from tqdm.notebook import tqdm
import json

# constants
# Padding tokens glued onto each tokenized line; they are the default
# sentence_begin / sentence_end arguments of tokenize_line and tokenize.
SENTENCE_BEGIN = "<s>"
SENTENCE_END = "</s>"
# Not referenced by the cells visible in this notebook -- presumably the
# unknown-word token expected by utils.LanguageModel; TODO confirm.
UNK = "<UNK>"
# Integer class label -> emotion name, written by hand. Only the keys are used
# in this notebook, to split the training data by its 'label' column.
# NOTE(review): the df.head() output in this notebook shows
# "i am feeling grouchy" with label 3, which this table names 'love' --
# verify the ordering against the dataset's own label encoding.
EMOTION_KEY = {0: 'anger', 1: 'fear', 2: 'joy', 3:'love', 4:'sadness', 5: 'surprise'} # manually done

### Utility functions
**Some of these functions are provided from HW3.**

In [2]:
def create_ngrams(tokens: list, n: int) -> list:
  """Creates n-grams for the given token sequence.

  Slides a window of width n across tokens from left to right and emits every
  complete window as a tuple. Windows that would run past the end of the
  sequence are not emitted, so a sequence shorter than n yields an empty list.

  Args:
    tokens (list): a list of tokens as strings
    n (int): the length of n-grams to create; values below 1 yield no n-grams

  Returns:
    list: list of tuples of strings, each tuple being one of the individual
      n-grams, ordered by starting position in tokens
  """
  # A window of non-positive width is not an n-gram; return nothing rather
  # than a list of empty tuples.
  if n < 1:
    return []
  # The last complete window starts at len(tokens) - n. When tokens is shorter
  # than n the range is empty, so no bounds check or exception handling is
  # needed.
  return [tuple(tokens[start:start + n]) for start in range(len(tokens) - n + 1)]

In [3]:
def tokenize_line(line: str, ngram: int,
                  by_char: bool = True,
                  sentence_begin: str=SENTENCE_BEGIN,
                  sentence_end: str=SENTENCE_END):
  """
  Split one string into tokens and pad it with sentence boundary markers.

  The padding on each side is (ngram - 1) copies of the boundary token,
  except that ngram == 1 still gets exactly one begin and one end token.
  Args:
    line (str): text to tokenize
    ngram (int): ngram preparation number; controls how much padding is added
    by_char (bool): default value True, if True, every character is a token,
      if False, tokens are the whitespace-separated pieces of the line
    sentence_begin (str): token placed before the line's own tokens
    sentence_end (str): token placed after the line's own tokens

  Returns:
    list of strings - the padded tokens for this single line
  """
  pieces = list(line) if by_char else line.split()

  # unigram models have no context to pad for, but still get one marker per side
  pad = 1 if ngram == 1 else ngram - 1
  return [sentence_begin] * pad + pieces + [sentence_end] * pad

In [4]:
def tokenize(data: list, ngram: int,
             by_char: bool = True,
             sentence_begin: str=SENTENCE_BEGIN,
             sentence_end: str=SENTENCE_END):
  """
  Tokenize every line of a list of strings into one flat token list.

  Each line is stripped of surrounding whitespace, skipped if nothing is
  left, and otherwise tokenized and padded by tokenize_line: (ngram - 1)
  sentence begin and sentence end tokens, or one of each when ngram == 1.
  Args:
    data (list): list of strings to tokenize
    ngram (int): ngram preparation number
    by_char (bool): default value True, if True, tokenize by character, if
      False, tokenize by whitespace
    sentence_begin (str): sentence begin token value
    sentence_end (str): sentence end token value

  Returns:
    list of strings - the tokens of all non-empty lines, concatenated in
      input order
  """
  flat = []
  for raw in data:
    sentence = raw.strip()
    # only lines with content contribute tokens (and boundary markers)
    if sentence:
      flat.extend(
        tokenize_line(sentence, ngram, by_char, sentence_begin, sentence_end)
      )
  return flat

In [5]:
# Read the raw training text: one entry per line of the file, with the
# surrounding whitespace (including the newline) stripped.
text = []
with open('data/training.txt', 'r') as file:
    text.extend(line.strip() for line in file)

### Training and Generating

In [6]:
# reading data
df = pd.read_csv('data/training.csv')

# One sub-frame per emotion label: the rows whose 'label' column equals that key.
emotion_dfs = {label: df.loc[df['label'] == label] for label in EMOTION_KEY}
df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [7]:
# Fit a separate bigram LanguageModel on each emotion's texts and keep what
# it generates, keyed by the integer emotion label.
ngram = 2
gen_sents = {}
for label, df_ in tqdm(emotion_dfs.items()):
    lm = LanguageModel(ngram)
    # by_char=False: split each text on whitespace rather than into characters
    toks = tokenize(df_['text'].tolist(), ngram, by_char=False)
    lm.train(toks)
    gen_sents[label] = lm.generate(10)

  0%|          | 0/6 [00:00<?, ?it/s]

In [11]:
# Save the generated output as JSON text. JSON object keys are always strings,
# so the integer emotion labels come out as "0".."5" in the file.
with open('vanilla_emotion_results.txt', 'w') as file:
    file.write(json.dumps(gen_sents))