Code needed for reading the transcript file and preparing it for language model generation

In [246]:
from io import FileIO
import re
from collections import Counter
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
nltk.download('punkt')


def extact_sentences(file_path):
  """Collect each juror's spoken lines from the transcript at ``file_path``.

  Every line of the file is cleaned first: text inside brackets or
  parentheses and the phrase "particular attention to juror NO. 8" are
  removed, and ``FOREMAN:`` is rewritten to ``NO. 1:``. The cleaned line is
  then searched for a ``NO. <n>: <text>`` pattern; only the first match on a
  line is used, and lines without a match are ignored.

  Args:
    file_path: path to the transcript file.

  Returns:
    dict mapping the juror number as a string (e.g. ``'8'``) to the list of
    text fragments attributed to that juror, in file order. For every juror
    except ``'10'`` the first collected fragment (the intro description
    before Act 1) is dropped.
  """
  # Raw strings: the previous non-raw literals relied on invalid escape
  # sequences, which newer Python versions warn about. The compiled patterns
  # are character-for-character the same.
  remove = re.compile(r"[\[\(].*?[\]\)]|particular attention to juror NO\. 8")
  replace = re.compile(r"FOREMAN:")
  juror_line = re.compile(r"NO\.\s?(\d+):\s+(.+?)(?=NO\. \d+|$)")

  # key: juror number, value: list of sentences said
  dictionary_lines = {}
  # NOTE(review): the "unicode_escape" encoding is kept as-is; confirm it is
  # the right choice for the transcript file.
  with open(file_path, 'r', encoding="unicode_escape") as file:
    for line in file:
      line = remove.sub("", line)
      line = replace.sub('NO. 1:', line)
      matched_text = juror_line.search(line)
      if matched_text:
        juror, spoken = matched_text.group(1), matched_text.group(2)
        dictionary_lines.setdefault(juror, []).append(spoken)

  # Remove the intro description of each juror before Act 1.
  # No. 10's intro is formatted without a ':' so it never matched above and
  # there is nothing to drop for that juror.
  for juror, spoken_lines in dictionary_lines.items():
    if juror != '10':
      del spoken_lines[:1]
  return dictionary_lines

def pre_process_inputs(dictionary_lines):
  """Tokenize every juror's lines and count the lower-cased tokens.

  Args:
    dictionary_lines: dict mapping a juror id to a list of sentences (the
      structure returned by ``extact_sentences``).

  Returns:
    A ``(vocabulary, dictionary_juror_words)`` tuple:
      * ``vocabulary`` -- dict ``{token: frequency}`` counted over all jurors.
      * ``dictionary_juror_words`` -- dict ``{juror: {token: frequency}}``
        with the counts for each juror separately.
    Both use plain dicts whose keys appear in first-occurrence order.
  """
  # Build the tokenizer once; the previous version rebuilt an identical one
  # for every sentence inside the nested loop.
  tokenizer = RegexpTokenizer(r'\w+')

  overall_counts = Counter()
  dictionary_juror_words = {}
  for juror, sentences in dictionary_lines.items():
    juror_counts = Counter()
    for sentence in sentences:
      juror_counts.update(tok.lower() for tok in tokenizer.tokenize(sentence))
    # Adding the per-juror counts keeps first-occurrence key order overall.
    overall_counts.update(juror_counts)
    dictionary_juror_words[juror] = dict(juror_counts)

  vocabulary = dict(overall_counts)
  return vocabulary, dictionary_juror_words

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Code to generate the language model

In [247]:
from collections import Counter

def unigram_lm(dic_vocabulary, dic_juror_words):
  """Build an add-one (Laplace) smoothed unigram language model per juror.

  For juror j and vocabulary word w the probability is

      P(w | j) = (count_j(w) + 1) / (N_j + V)

  where count_j(w) is how often j said w, N_j is the total number of
  vocabulary tokens said by j, and V is the vocabulary size. Words a juror
  never said therefore get a small non-zero probability, and each juror's
  probabilities sum to 1 over the vocabulary.

  Args:
    dic_vocabulary: dict ``{token: frequency}`` over all jurors; only its keys
      (the vocabulary) are used here.
    dic_juror_words: dict ``{juror: {token: frequency}}``.

  Returns:
    dict ``{juror: {token: probability}}`` with one entry per vocabulary token
    for every juror, in the vocabulary's key order.
  """
  vocabulary_size = len(dic_vocabulary)
  dic_unigram_lm = {}

  for juror, dic_words_frequency in dic_juror_words.items():
    # Raw counts restricted to the vocabulary, 0 for words this juror never used.
    counts = {word: dic_words_frequency.get(word, 0) for word in dic_vocabulary}
    # The denominator is the same for every word of a juror: N_j + V.
    denom = sum(counts.values()) + vocabulary_size
    dic_unigram_lm[juror] = {
      word: (count + 1) / denom for word, count in counts.items()
    }
  return dic_unigram_lm

Code to generate the t-SNE plot

In [248]:
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
import numpy as np


def tsne_generator(dic_unigram_lm, perplexity=30.0, random_state=None):
  """Project each juror's unigram language model to 2-D with t-SNE and plot it.

  Each juror becomes one sample whose features are the probabilities of the
  tokens, ordered by sorted token so that every column refers to the same
  token for every juror. The 2-D embedding is drawn as a labelled scatter
  plot with one point per juror.

  Args:
    dic_unigram_lm: dict ``{juror: {token: probability}}``.
    perplexity: requested t-SNE perplexity. It is capped at
      ``number_of_jurors - 1`` because scikit-learn requires the perplexity to
      be smaller than the number of samples.
    random_state: forwarded to ``TSNE``; pass an integer for a reproducible
      layout. The default ``None`` keeps the run-to-run randomness.

  Raises:
    ValueError: if fewer than two language models are given.
  """
  lst_names = list(dic_unigram_lm)
  if len(lst_names) < 2:
    raise ValueError("t-SNE needs at least two language models to compare")

  # Shared, sorted token axis. Taking the union keeps the columns aligned
  # even if one model is missing a token (it then contributes 0.0).
  tokens = sorted(set().union(*dic_unigram_lm.values()))
  lst_freq = [
    [dic_unigram_lm[juror].get(token, 0.0) for token in tokens]
    for juror in lst_names
  ]

  # Convert the list to a NumPy array: one row per juror, one column per token
  data = np.array(lst_freq, dtype=float)

  # The default perplexity (30) is larger than the number of jurors, which
  # scikit-learn rejects, so clamp it below the sample count.
  effective_perplexity = min(float(perplexity), len(lst_names) - 1)
  tsne = TSNE(
    n_components=2,
    perplexity=effective_perplexity,
    random_state=random_state,
  )

  # getting vectors with tsne
  vectors = tsne.fit_transform(data)

  fig, ax = plt.subplots()
  for (x, y), juror in zip(vectors, lst_names):
    ax.scatter(x, y, label="Juror " + str(juror))
    ax.annotate(str(juror), (x, y), textcoords="offset points", xytext=(5, 5))
  ax.set_title("t-SNE of juror unigram language models")
  ax.set_xlabel("t-SNE dimension 1")
  ax.set_ylabel("t-SNE dimension 2")
  # making sure the legend is shown
  ax.legend(bbox_to_anchor=(1.1, 1.05))
  ax.grid(True)
  plt.show()

In [249]:
# Location of the transcript file.
# NOTE(review): this looks like a Colab-style absolute path; adjust it when
# running the notebook elsewhere.
path = r"/content/12AngryMen.txt"

# Juror number -> list of that juror's lines
dictionary_lines = extact_sentences(path)

# Debug helper (disabled): number of lines collected per juror
#for juror in dictionary_lines:
  #print("Juror No.", juror, " Lines: ", len(dictionary_lines[juror]))

# Overall token counts and per-juror token counts
vocabulary, dictionary_juror_words = pre_process_inputs(dictionary_lines)

# Prints the complete {token: frequency} dictionary
print("Vocabulary: ", vocabulary)
# Debug helper (disabled): per-juror token counts
#print("Juror_words", dictionary_juror_words)
# One unigram language model per juror
language_models = unigram_lm(vocabulary, dictionary_juror_words)
# Debug helper (disabled): full language model of every juror
#for juror in language_models:
  #print("Juror No.", juror, " Probability of word: ", language_models[juror])

Vocabulary:  {'all': 58, 'right': 81, 'gentlemen': 7, 'let': 39, 's': 193, 'take': 18, 'our': 6, 'seats': 1, 'how': 35, 'about': 37, 'sitting': 3, 'down': 17, 'the': 315, 'gentleman': 5, 'at': 32, 'window': 15, 'is': 63, 'everybody': 3, 'here': 34, 'we': 73, 'd': 20, 'like': 41, 'to': 203, 'get': 19, 'started': 2, 'it': 174, 'find': 2, 'a': 176, 'seat': 1, 'aii': 1, 'now': 38, 'you': 256, 'can': 39, 'handle': 3, 'this': 74, 'any': 8, 'way': 10, 'want': 32, 'i': 216, 'mean': 23, 'm': 33, 'not': 54, 'going': 24, 'make': 10, 'rules': 1, 'if': 26, 'discuss': 3, 'first': 3, 'and': 88, 'then': 8, 'vote': 22, 'that': 127, 'one': 23, 'or': 15, 'see': 22, 'stand': 5, 'anybody': 4, 'doesn': 7, 't': 120, 'okay': 16, 'those': 7, 'voting': 3, 'guilty': 55, 'raise': 3, 'your': 29, 'hands': 3, 'nine': 4, 'ten': 10, 'eleven': 8, 'for': 40, 'know': 46, 'where': 13, 'are': 41, 'think': 49, 'good': 5, 'point': 11, 'have': 49, 'job': 1, 'do': 59, 'sounds': 3, 'fair': 6, 'enough': 10, 'supposing': 5, 'go':