In [1]:
import re
from pathlib import Path
import string
from functools import reduce
from math import log
import itertools
import numpy as np
from collections import defaultdict
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [2]:
from transformers import pipeline

classifier = pipeline("text-classification",model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True,)

def emotion_scores(sample):
    emotion=classifier(sample)
    return emotion[0]


  from .autonotebook import tqdm as notebook_tqdm
  return self.fget.__get__(instance, owner)()


In [3]:
filename = "corpus.txt"

In [4]:

def load_file(filename):
    with open(filename) as f:
        lines = [line.rstrip() for line in f]
    print("No of sentences in Corpus: "+str(len(lines)))
    return lines

In [5]:
class BigramLM:
    def __init__(self):
        self.bigram_counts = defaultdict(int)
        self.unigram_counts = defaultdict(int)
        self.vocab = set()
        self.bigrams = set()

    def learn_model(self, dataset):
      for sentence in dataset:
          tokens = sentence.split()
          for i in range(1, len(tokens)):
              bigram = (tokens[i-1], tokens[i])
              self.bigram_counts[bigram] += 1
              self.unigram_counts[tokens[i-1]] += 1
              self.vocab.add(tokens[i-1])
              self.vocab.add(tokens[i])
              self.bigrams.add((tokens[i-1], tokens[i]))


    def calculate_bigram_probability(self, word1, word2):
      if word1 in self.vocab and word2 in self.vocab:
          count_bigram = self.bigram_counts[word1 , word2]
          count_unigram = self.unigram_counts[word1]
          if count_unigram > 0:
              return count_bigram / count_unigram
      return 0.0

    # def calculate_all_bigrams_probabilities(self,bigrams,bigram_counts,unigram_counts):
    #   bigramProbabilities = defaultdict(float)
    #   for bigram in bigrams:
    #       word1 = bigram[0]
    #       word2 = bigram[1]
    #       bigramProbabilities[bigram] = (self.bigram_counts[word1 , word2])/(self.unigram_counts[word1])
    #   return bigramProbabilities

    def calculate_all_bigrams_probabilities(self, corpus):
        bigram_probabilities = {}
        for sentence in corpus:
            tokens = sentence.split()
            for i in range(1, len(tokens)):
                bigram = (tokens[i-1], tokens[i])
                probability = self.calculate_bigram_probability(bigram[0], bigram[1])
                bigram_probabilities[bigram] = probability
        return bigram_probabilities

    def bigram_probability_recursive(self, words):
        if len(words) < 2:
            return 0.0
        word1, word2 = words[0], words[1]
        if word1 in self.vocab and word2 in self.vocab:
            count_bigram = self.bigram_counts[word1, word2]
            count_unigram = self.unigram_counts[word1]
            if count_unigram > 0:
                recursive_probability = self.bigram_probability_recursive(words[1:])
                return count_bigram / count_unigram + recursive_probability
        return 0.0

    def laplace_smoothing(self, words, alpha=1):
        if len(words) < 2:
            return 0.0
        word1, word2 = words[0], words[1]
        count_bigram = self.bigram_counts[word1, word2] + alpha
        count_unigram = self.unigram_counts[word1] + alpha * len(self.vocab)
        recursive_probability = self.laplace_smoothing(words[1:], alpha)
        return count_bigram / count_unigram + recursive_probability

    def calculate_all_laplace_probabilities(self, corpus, alpha=1):
        laplace_probabilities = {}
        for sentence in corpus:
            tokens = sentence.split()
            for i in range(1, len(tokens)):
                bigram = (tokens[i-1], tokens[i])
                probability = self.laplace_smoothing(bigram, alpha)
                laplace_probabilities[bigram] = probability
        return laplace_probabilities

    def calculate_all_kneser_ney_probabilities(self, corpus, d=0.75):
        kneser_ney_probabilities = {}
        for sentence in corpus:
            tokens = sentence.split()
            for i in range(1, len(tokens)):
                bigram = (tokens[i-1], tokens[i])
                probability = self.kneser_ney_smoothing(bigram, d)
                kneser_ney_probabilities[bigram] = probability
            break
        return kneser_ney_probabilities

    def kneser_ney_smoothing(self, words, d=0.75):
        if len(words) < 2:
            return 0.0
        word1, word2 = words[0], words[1]
        count_bigram = self.bigram_counts[word1, word2]
        count_unigram = self.unigram_counts[word1]
        a_word1 = (d * len(set([w for w in self.vocab if self.bigram_counts[word1, w] > 0])))/count_unigram

        p_continuation_word2 = (len(set([w for w in self.vocab if self.bigram_counts[word1, w] > 0])) /
                                len(set([bigram for bigram, count in self.bigram_counts.items() if count > 0])))
        kneser_ney_prob = (max(count_bigram - d, 0) / count_unigram) + a_word1 * p_continuation_word2
        return kneser_ney_prob



    def calculate_bigram_probability_with_emotion(self, word1, word2):
      min_probability = 0.0
      max_probability = 1.0
      base_probability = self.calculate_bigram_probability(word1, word2)
      emotion_scores = self.get_emotion_score(word2)
      print(emotion_scores)
      if emotion_scores is not None:
          beta = max(emotion_scores[0], key=lambda x: x['score'])
          score = beta['score']
          emotion = beta['label']
        #   print("beta",beta)
          normalized_modified_probability = min(max_probability, max(min_probability, base_probability + score))
          return normalized_modified_probability, emotion
      else:
          return base_probability

    def calculate_all_bigram_probabilities_with_emotion(self, corpus):
        bigram_probabilities_with_emotion = {}
        for bigram in self.bigrams:
            probability_array, emotion = self.calculate_bigram_probability_with_emotion(bigram[0], bigram[1])
            bigram_probabilities_with_emotion[bigram] = probability_array, emotion
        return bigram_probabilities_with_emotion

    def bigram_probability_with_emotion(self, words):
        if len(bigram) < 2:
            return 0.0
        word1, word2 = words[0], words[1]
        base_probability = self.calculate_bigram_probability(word1, word2)
        emotion_scores1 = self.get_emotion_score(word1)
        emotion_scores2 = self.get_emotion_score(word2)
        emotion_array = [(score1 + score2) / 2 + base_probability
                             for score1, score2 in zip(emotion_scores1, emotion_scores2)]
        return emotion_array

    def get_emotion_score(self, word):
        emotion_scores = classifier(word)
        return emotion_scores

In [6]:
corpus = load_file(filename)

No of sentences in Corpus: 2400


In [7]:
#top5 bigram probabilities
bigram_model = BigramLM()
bigram_model.learn_model(corpus)
all_probabilities = bigram_model.calculate_all_bigrams_probabilities(corpus)

top_5_bigrams = sorted(all_probabilities.items(), key=lambda x: x[1], reverse=True)[:5]

print("Top 5 Bigrams with Probabilities:")
for bigram, probability in top_5_bigrams:
    print(f"{bigram}: {probability}")


Top 5 Bigrams with Probabilities:
('href', 'http'): 1.0
('tychelle', 'to'): 1.0
('hang', 'out'): 1.0
('nonexistent', 'social'): 1.0
('alex', 'and'): 1.0


In [None]:
#top5 laplace
bigram_model = BigramLM()
bigram_model.learn_model(corpus)
all_probabilities = bigram_model.calculate_all_laplace_probabilities(corpus)
top_5_bigrams = sorted(all_probabilities.items(), key=lambda x: x[1], reverse=True)[:5]

print("Top 5 Bigrams with Probabilities:")
for bigram, probability in top_5_bigrams:
    print(f"{bigram}: {probability}")

Top 5 Bigrams with Probabilities:
('i', 'feel'): 0.11049603820688159
('feel', 'like'): 0.0350976507217662
('i', 'am'): 0.03191142950179095
('that', 'i'): 0.02655721873491067
('and', 'i'): 0.02311382468382032


In [None]:
#top5 kneser_ney
bigram_model = BigramLM()
bigram_model.learn_model(corpus)
all_probabilities = bigram_model.calculate_all_kneser_ney_probabilities(corpus)
top_5_bigrams = sorted(all_probabilities.items(), key=lambda x: x[1], reverse=True)[:5]

print("Top 5 Bigrams with Probabilities:")
for bigram, probability in top_5_bigrams:
    print(f"{bigram}: {probability}")

Top 5 Bigrams with Probabilities:
('href', 'http'): 0.9700000368309052
('i', 'feel'): 0.2687019562027336
('here', 'i'): 0.15934809551413778
('link', 'href'): 0.12504603863152725
('count', 'link'): 0.0625920772630545


In [8]:
emotional_bigram_probabilities = bigram_model.calculate_all_bigram_probabilities_with_emotion(corpus)

[[{'label': 'sadness', 'score': 0.10332212597131729}, {'label': 'joy', 'score': 0.3187467157840729}, {'label': 'love', 'score': 0.0217866413295269}, {'label': 'anger', 'score': 0.3889038860797882}, {'label': 'fear', 'score': 0.14556942880153656}, {'label': 'surprise', 'score': 0.021671120077371597}]]
[[{'label': 'sadness', 'score': 0.03140631690621376}, {'label': 'joy', 'score': 0.12808093428611755}, {'label': 'love', 'score': 0.007331213913857937}, {'label': 'anger', 'score': 0.3398895263671875}, {'label': 'fear', 'score': 0.4773738980293274}, {'label': 'surprise', 'score': 0.01591809280216694}]]
[[{'label': 'sadness', 'score': 0.097732774913311}, {'label': 'joy', 'score': 0.14305515587329865}, {'label': 'love', 'score': 0.013276400044560432}, {'label': 'anger', 'score': 0.30332615971565247}, {'label': 'fear', 'score': 0.4271021783351898}, {'label': 'surprise', 'score': 0.015507309697568417}]]
[[{'label': 'sadness', 'score': 0.09546397626399994}, {'label': 'joy', 'score': 0.4438344538

In [25]:
emotional_bigram_probabilities[('feel', 'sad')]

(0.9966527096877101, 'sadness')

In [9]:
first_word_counts = {}
for sentence in corpus:
    tokens = sentence.split()
    word = tokens[0]
    if word in first_word_counts:
        first_word_counts[word] += 1
    else:
        first_word_counts[word] = 1

first_word_counts_probabilities = {word: count / len(corpus) for word, count in first_word_counts.items()}
first_word_counts_probabilities

{'i': 0.8783333333333333,
 'ill': 0.002916666666666667,
 'im': 0.08833333333333333,
 'ive': 0.017916666666666668,
 'during': 0.0004166666666666667,
 'id': 0.0033333333333333335,
 'the': 0.00125,
 'on': 0.0004166666666666667,
 'is': 0.0004166666666666667,
 'no': 0.0004166666666666667,
 'a': 0.00125,
 'in': 0.0004166666666666667,
 'when': 0.0033333333333333335,
 'heated': 0.0004166666666666667,
 'this': 0.0004166666666666667,
 'occured': 0.0004166666666666667}

In [10]:

def generate_sentence(emotion, first_word_counts_probabilities, emotional_bigram_probabilities):
    sentence = []
    first_word = np.random.choice(list(first_word_counts_probabilities.keys()), p=list(first_word_counts_probabilities.values()))
    sentence.append(first_word)
    max_lengths = range(7, 12)
    max_length = np.random.choice(max_lengths)
    for _ in range(max_length):
        word = sentence[-1]
        # while True:
        bigrams = [(bigram, probability) for bigram, probability in emotional_bigram_probabilities.items() if bigram[0] == word and probability[1] == emotion]
        if len(bigrams) == 0:
            bigrams = [(bigram, probability) for bigram, probability in emotional_bigram_probabilities.items() if bigram[0] == word]
        if len(bigrams) == 0:
            bigrams = bigram_model.bigrams
            # print("bigrams",bigrams)
            potential_next_words = [bigram[1] for bigram in bigrams]
        else:
            potential_next_words = [bigram[1] for bigram, probability in bigrams]
        # print(potential_next_words)
        next_word = np.random.choice(potential_next_words)
        sentence.append(next_word)
    sentence = ' '.join(sentence)
    return sentence


In [17]:
emotions = ['sadness', 'joy', 'anger', 'fear', 'love', 'surprise']
# Generate 50 samples for each of the 6 emotions for which you can get scores.Store these outputs in .txt files for each emotion using the file name format gen_<emotion>.txt.
for emotion in emotions:
    with open(f'gen_{emotion}.txt', 'w') as f:
        for _ in range(50):
            sentence = generate_sentence(emotion, first_word_counts_probabilities, emotional_bigram_probabilities)
            # print(sentence)
            f.write(sentence + '\n')

In [27]:
for emotion in emotions:
    with open(f'gen_{emotion}_labels.txt', 'w') as f:
        for _ in range(50):
            f.write(emotion + '\n')

In [11]:
X_train = corpus
y_train = load_file("labels.txt")

bigram_model = BigramLM()
emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']


X_test, y_test = [], []
for emotion in emotions:
    file_name = f"gen_{emotion}.txt"
    with open(file_name, 'r') as file:
        samples = file.read().splitlines()
        X_test.extend(samples)
        y_test.extend([emotion] * len(samples))


No of sentences in Corpus: 2400


In [29]:

# Create a pipeline with TF-IDF vectorizer and SVC and the bigram model
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svc', SVC()),
])

params = {
    'svc__C': [1, 10, 100, 1000, 5000],
    'svc__kernel': ['linear', 'rbf'],
    'tfidf__max_features': [1000, 10000],
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)]
}

In [30]:
import pickle
model_count = 0
def save_model(model):
    global model_count
    filename = f"best_model.pkl"
    with open(filename, 'wb') as file:
        pickle.dump(model, file)
    print(f"Model saved as {filename}")
    model_count += 1

In [31]:
grid_search = GridSearchCV(pipeline, params, cv=5, verbose=2, n_jobs=-1, scoring='f1_macro', return_train_score=True)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
save_model(best_model)
y_pred = best_model.predict(X_test)
print(f"Classification Report for {params}:")
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 60 candidates, totalling 300 fits
Model saved as best_model.pkl
Classification Report for {'svc__C': [1, 10, 100, 1000, 5000], 'svc__kernel': ['linear', 'rbf'], 'tfidf__max_features': [1000, 10000], 'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)]}:
              precision    recall  f1-score   support

       anger       0.57      0.30      0.35        50
        fear       0.79      0.52      0.63        50
         joy       0.58      0.76      0.66        50
        love       0.81      1.00      0.88        50
     sadness       0.63      0.84      0.72        50
    surprise       0.88      0.90      0.89        50

    accuracy                           0.71       300
   macro avg       0.71      0.71      0.69       300
weighted avg       0.71      0.71      0.69       300



In [14]:
# generate 2 samples of each emotion and print them
for emotion in emotions:
    print(f"Samples for {emotion}:")
    for _ in range(2):
        sentence = generate_sentence(emotion, first_word_counts_probabilities, emotional_bigram_probabilities)
        print(sentence)

Samples for sadness:
i forget to suffer i hurt when ever make
i forget for losing again and suffer from new limitations which dying
Samples for joy:
i remember moments you feel faithful feel gorgeous colors
i perform a smart consumer too cheap so content and
Samples for love:
i love caring people caring people caring eh
im supporting walmart because again its warmth sense for
Samples for anger:
i write it things here that as usual at
i stared up between my bum are heading for and was
Samples for fear:
i could almost weird considering there if id just surreal confusion and
i could almost weird considering everything and suddenly startled me
Samples for surprise:
i wonder why ive gone and curious why should reiterate
i wonder are surprised us no resemblance to
