# **Importing Dataset**

## **Importing Libraries**

In [1]:
import numpy as np
import pandas as pd
import re
import pickle
import random
from tqdm import tqdm

## **Mounting the drive**

In [2]:
# Colab-only step: mount Google Drive at /content/drive so the dataset CSV can be read.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


## **Displaying the dataset**

In [3]:
# Load the sample Reuters sentences from the mounted Drive folder.
# NOTE(review): the path is relative, so it presumably relies on the working
# directory being /content (the Colab default) — confirm.
df = pd.read_csv("drive/My Drive/sample_reuters_dataset.csv")
df

Unnamed: 0,sentence_number,sentence_text
0,0,ASIAN EXPORTERS FEAR DAMAGE FROM U . S .- JAPA...
1,1,They told Reuter correspondents in Asian capit...
2,2,But some exporters said that while the conflic...
3,3,The U . S . Has said it will impose 300 mln dl...
4,4,Unofficial Japanese estimates put the impact o...
...,...,...
9995,9995,"In addition , British Printing and Communicati..."
9996,9996,Salomon said in a filing with the Securities a...
9997,9997,If the court decides they should be converted ...
9998,9998,Harcourt is asking the court to rule the compa...


## **Displaying 10 sample dialogs**

In [4]:
# Pull the raw sentence column out of the frame as a plain Python list.
dialogs = df["sentence_text"].tolist()

In [5]:
# Draw the 10 examples from a dedicated, seeded generator so that
# "Restart Kernel -> Run All" shows the same sentences every time.
# A local Random instance is used so the module-level `random` state is untouched.
random.Random(42).sample(dialogs, 10)

['MONSANTO TO BUY RHONE - POULENC POLYPHENYL BUSINESS Monsanto Chemical Company , a unit of Monsanto Co & lt ; MTC . N >, is to acquire the polyphenyls business of Rhone - Poulenc Chimie , a unit of Rhone - Poulenc & lt ; RHON . PA >, Monsanto said in a statement issued from its European headquarters .',
 'Reagan , in imposing the curbs , said they would be lifted as soon as there was evidence of a pattern that Japan was adhering to the pact .',
 'GREAT NORTHERN NEKOOSA & lt ; GNN > 1ST QTR NET Shr 1 . 59 dlrs vs 54 cts Net 43 . 3 mln vs 13 . 9 mln Revs 566 . 7 mln vs 487 . 8 mln Avg shrs 27 . 2 mln vs 25 . 9 mln NOTE : 1986 figures restated for adoption of financial accounting standards board statement 87 " employer \' s accounting for pensions ."',
 'Bilizerian said he was optimistic his offer will be accepted at a meeting of the board of directors Wednesday .',
 'Pentland said it will use proceeds to fund growth and possible acquisitions .',
 'STOREHOUSE REPORTS HIGHER PROFITS IN 19

# **Creating Vocabulary**

## **Text Cleaning**

In [6]:
# Strip every character that is not an ASCII letter, apostrophe or space,
# then lowercase the result. Removed characters are deleted (not replaced by a
# space), so runs of spaces can remain; later `.split()` calls absorb them.
dial_clean = [re.sub("[^a-zA-Z' ]", "", sentence).lower() for sentence in dialogs]

In [7]:
# Draw the 10 cleaned examples from a dedicated, seeded generator so that
# "Restart Kernel -> Run All" shows the same sentences every time.
# A local Random instance is used so the module-level `random` state is untouched.
random.Random(42).sample(dial_clean, 10)

['soviet    coarse grain production is estimated at    mln tonnes  vs    mln tonnes last month ',
 'the accumulated trade surplus over the first two months of  stands at    mln dlrs against    mln dlrs the previous year ',
 'he did not elaborate ',
 'as previously reported  dome is seeking approval in principle for the debt restructuring plan ',
 'the ec tariffs  which would involve renouncing obligations entered into with the world trade body gatt  would be designed to stop a diversion of exports to the ec market from that of the u  s ',
 "harcourt ' s recapitalization will come under scrutiny of a u  s  court in orlando  fla  monday ",
 'the previous target for  was    tonnes ',
 "spokesman fitzwater said  we don ' t want a trade war  but the imposition of sanctions showed the united states would act when it had evidence that trade pacts were being violated ",
 "without giving figures  the report said last month ' s rise  partly linked to efforts to catch up with production lost earl

## **Creating the vocabulary and displaying word counts**

In [8]:
# Flatten the cleaned sentences into one whitespace-separated token stream.
all_words = [token for sentence in dial_clean for token in sentence.split()]

# Tally how often each token occurs (keys keep first-seen order).
word_dict = {}

for token in all_words:
  word_dict[token] = word_dict.get(token, 0) + 1

In [9]:
# Show the raw word -> count mapping.
word_dict

{'asian': 13,
 'exporters': 52,
 'fear': 8,
 'damage': 29,
 'from': 1369,
 'u': 1117,
 's': 2864,
 'japan': 441,
 'rift': 1,
 'mounting': 5,
 'trade': 549,
 'friction': 8,
 'between': 191,
 'the': 12496,
 'and': 4599,
 'has': 974,
 'raised': 70,
 'fears': 13,
 'among': 44,
 'many': 54,
 'of': 6671,
 'asia': 14,
 "'": 2094,
 'exporting': 12,
 'nations': 71,
 'that': 1376,
 'row': 3,
 'could': 291,
 'inflict': 1,
 'far': 55,
 'reaching': 7,
 'economic': 244,
 'businessmen': 15,
 'officials': 190,
 'said': 4649,
 'they': 518,
 'told': 237,
 'reuter': 27,
 'correspondents': 3,
 'in': 5070,
 'capitals': 3,
 'a': 4412,
 'move': 101,
 'against': 270,
 'might': 59,
 'boost': 45,
 'protectionist': 22,
 'sentiment': 10,
 'lead': 96,
 'to': 6337,
 'curbs': 12,
 'on': 1643,
 'american': 126,
 'imports': 242,
 'their': 230,
 'products': 200,
 'but': 650,
 'some': 278,
 'while': 164,
 'conflict': 3,
 'would': 926,
 'hurt': 11,
 'them': 58,
 'long': 119,
 'run': 21,
 'short': 87,
 'term': 120,
 'toky

## **Displaying vocabulary in Dataframe sorted by word-count**

In [10]:
# Two-column vocabulary table (word, count), ordered by ascending count and
# re-indexed from 0 so the row label reflects the rank.
word_df = (
    pd.DataFrame({"word": list(word_dict), "count": list(word_dict.values())})
    .sort_values(by="count")
    .reset_index(drop=True)
)

In [11]:
# Show the vocabulary table (rarest words first, most frequent last).
word_df

Unnamed: 0,word,count
0,ulcer,1
1,gaons,1
2,securitiesd,1
3,unfiltered,1
4,preceeding,1
...,...,...
12575,said,4649
12576,in,5070
12577,to,6337
12578,of,6671


### **Number of words**

In [12]:
# Vocabulary size: word_df has one row per distinct token.
len(word_df)

12580

# **Creating the N-Gram Model**

## **Creating a dataframe for N-Gram model**

In [13]:
# Start the n-gram frame with a single column holding the cleaned sentences.
dataset = pd.DataFrame({"Sentence": dial_clean})
dataset.head(5)

Unnamed: 0,Sentence
0,asian exporters fear damage from u s japan r...
1,they told reuter correspondents in asian capit...
2,but some exporters said that while the conflic...
3,the u s has said it will impose mln dlrs of...
4,unofficial japanese estimates put the impact o...


## **Unigram Model**

In [14]:
def create_unigram(sentence):
  """Split `sentence` on whitespace and return each token wrapped in its own list.

  Example: "a b" -> [["a"], ["b"]]. An empty or blank sentence gives [].
  """
  return [[token] for token in sentence.split()]

## **Bigram Model**

In [15]:
def create_bigram(sentence):
  """Return every pair of adjacent whitespace-separated tokens as a 2-item list.

  Example: "a b c" -> [["a", "b"], ["b", "c"]]. Fewer than two tokens gives [].
  """
  words = sentence.split()
  # Pair each token with its successor; zip stops at the shorter sequence.
  return [list(pair) for pair in zip(words, words[1:])]

## **Trigram Model**

In [16]:
def create_trigram(sentence):
  """Return every run of three adjacent whitespace-separated tokens as a 3-item list.

  Example: "a b c d" -> [["a", "b", "c"], ["b", "c", "d"]].
  Fewer than three tokens gives [].
  """
  words = sentence.split()
  # Align the token list with itself shifted by one and by two positions.
  return [list(triple) for triple in zip(words, words[1:], words[2:])]

## **Display the models in dataframe**

In [17]:
# Add one column per n-gram order, each derived from the cleaned sentence.
for column_name, builder in (
    ("Unigram", create_unigram),
    ("Bigram", create_bigram),
    ("Trigram", create_trigram),
):
  dataset[column_name] = dataset["Sentence"].apply(builder)
dataset

Unnamed: 0,Sentence,Unigram,Bigram,Trigram
0,asian exporters fear damage from u s japan r...,"[[asian], [exporters], [fear], [damage], [from...","[[asian, exporters], [exporters, fear], [fear,...","[[asian, exporters, fear], [exporters, fear, d..."
1,they told reuter correspondents in asian capit...,"[[they], [told], [reuter], [correspondents], [...","[[they, told], [told, reuter], [reuter, corres...","[[they, told, reuter], [told, reuter, correspo..."
2,but some exporters said that while the conflic...,"[[but], [some], [exporters], [said], [that], [...","[[but, some], [some, exporters], [exporters, s...","[[but, some, exporters], [some, exporters, sai..."
3,the u s has said it will impose mln dlrs of...,"[[the], [u], [s], [has], [said], [it], [will],...","[[the, u], [u, s], [s, has], [has, said], [sai...","[[the, u, s], [u, s, has], [s, has, said], [ha..."
4,unofficial japanese estimates put the impact o...,"[[unofficial], [japanese], [estimates], [put],...","[[unofficial, japanese], [japanese, estimates]...","[[unofficial, japanese, estimates], [japanese,..."
...,...,...,...,...
9995,in addition british printing and communicatio...,"[[in], [addition], [british], [printing], [and...","[[in, addition], [addition, british], [british...","[[in, addition, british], [addition, british, ..."
9996,salomon said in a filing with the securities a...,"[[salomon], [said], [in], [a], [filing], [with...","[[salomon, said], [said, in], [in, a], [a, fil...","[[salomon, said, in], [said, in, a], [in, a, f..."
9997,if the court decides they should be converted ...,"[[if], [the], [court], [decides], [they], [sho...","[[if, the], [the, court], [court, decides], [d...","[[if, the, court], [the, court, decides], [cou..."
9998,harcourt is asking the court to rule the compa...,"[[harcourt], [is], [asking], [the], [court], [...","[[harcourt, is], [is, asking], [asking, the], ...","[[harcourt, is, asking], [is, asking, the], [a..."


# **Language modelling using Trigram Model**

## **Finding occurrences of a word given the previous two words**

In [18]:
from collections import Counter,defaultdict

# model[(w1, w2)][w3] holds how many times w3 followed the word pair (w1, w2).
# Both levels are defaultdicts, so unseen keys start at 0 on first access.
model = defaultdict(lambda: defaultdict(lambda: 0))

for sentence_trigrams in dataset["Trigram"]:
  for first, second, third in sentence_trigrams:
    model[first, second][third] += 1

In [19]:
# Inspect the nested mapping: (w1, w2) -> {w3: count}.
model

defaultdict(<function __main__.<lambda>()>,
            {('asian',
              'exporters'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'fear': 1}),
             ('exporters',
              'fear'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'damage': 1}),
             ('fear',
              'damage'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'from': 1}),
             ('damage',
              'from'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'u': 1,
                          'local': 1}),
             ('from',
              'u'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'s': 5,
                          'k': 1}),
             ('u',
              's'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'japan': 6,
                          'and': 34,
                          'move': 1,
                          'has': 7,
                          'said': 4,
          

## **Examples**

In [20]:
# Counts of the words seen after "has raised".
# Note: `model` is a defaultdict, so indexing a pair that was never seen
# would insert an empty entry for it.
dict(model['has','raised'])

{'fears': 1, 'its': 2, 'concern': 1, 'the': 1, 'his': 1, 'prices': 1}

In [21]:
# Counts of the words seen after "japan has".
# Note: `model` is a defaultdict, so indexing a pair that was never seen
# would insert an empty entry for it.
dict(model['japan','has'])

{'raised': 1,
 'been': 2,
 'the': 1,
 'little': 2,
 'violated': 1,
 'said': 3,
 'come': 1,
 'no': 3,
 'rejected': 1,
 'far': 1,
 'resulted': 1}

## **Find the probability of occurrence of a word**

In [22]:
# Count every token across all sentences, using the single-item lists stored
# in the "Unigram" column.
unigram_dict = {}

for sentence_unigrams in tqdm(dataset["Unigram"]):
  for (token,) in sentence_unigrams:
    unigram_dict[token] = unigram_dict.get(token, 0) + 1

100%|██████████| 10000/10000 [00:00<00:00, 37998.18it/s]


In [23]:
# Show the token -> count mapping built from the "Unigram" column.
unigram_dict

{'asian': 13,
 'exporters': 52,
 'fear': 8,
 'damage': 29,
 'from': 1369,
 'u': 1117,
 's': 2864,
 'japan': 441,
 'rift': 1,
 'mounting': 5,
 'trade': 549,
 'friction': 8,
 'between': 191,
 'the': 12496,
 'and': 4599,
 'has': 974,
 'raised': 70,
 'fears': 13,
 'among': 44,
 'many': 54,
 'of': 6671,
 'asia': 14,
 "'": 2094,
 'exporting': 12,
 'nations': 71,
 'that': 1376,
 'row': 3,
 'could': 291,
 'inflict': 1,
 'far': 55,
 'reaching': 7,
 'economic': 244,
 'businessmen': 15,
 'officials': 190,
 'said': 4649,
 'they': 518,
 'told': 237,
 'reuter': 27,
 'correspondents': 3,
 'in': 5070,
 'capitals': 3,
 'a': 4412,
 'move': 101,
 'against': 270,
 'might': 59,
 'boost': 45,
 'protectionist': 22,
 'sentiment': 10,
 'lead': 96,
 'to': 6337,
 'curbs': 12,
 'on': 1643,
 'american': 126,
 'imports': 242,
 'their': 230,
 'products': 200,
 'but': 650,
 'some': 278,
 'while': 164,
 'conflict': 3,
 'would': 926,
 'hurt': 11,
 'them': 58,
 'long': 119,
 'run': 21,
 'short': 87,
 'term': 120,
 'toky

In [24]:
# Copy the unigram counts into a Counter; `unigram_dict` keeps the raw integer
# counts while `counts` is converted to probabilities further down.
counts = Counter(unigram_dict)
counts

Counter({'asian': 13,
         'exporters': 52,
         'fear': 8,
         'damage': 29,
         'from': 1369,
         'u': 1117,
         's': 2864,
         'japan': 441,
         'rift': 1,
         'mounting': 5,
         'trade': 549,
         'friction': 8,
         'between': 191,
         'the': 12496,
         'and': 4599,
         'has': 974,
         'raised': 70,
         'fears': 13,
         'among': 44,
         'many': 54,
         'of': 6671,
         'asia': 14,
         "'": 2094,
         'exporting': 12,
         'nations': 71,
         'that': 1376,
         'row': 3,
         'could': 291,
         'inflict': 1,
         'far': 55,
         'reaching': 7,
         'economic': 244,
         'businessmen': 15,
         'officials': 190,
         'said': 4649,
         'they': 518,
         'told': 237,
         'reuter': 27,
         'correspondents': 3,
         'in': 5070,
         'capitals': 3,
         'a': 4412,
         'move': 101,
         'against': 2

In [25]:
# Total number of tokens in the corpus (sum of all unigram counts).
total_count = sum(unigram_dict.values())
total_count

241583

In [26]:
# Convert raw unigram counts into relative frequencies (count / total tokens).
#
# The probabilities are rebuilt from `unigram_dict` instead of dividing `counts`
# in place, which makes the cell idempotent: re-running it no longer divides
# already-normalised values a second time. The denominator is computed here
# rather than read from `total_count`, because the trigram normalisation cell
# further down reassigns that name.
unigram_total = float(sum(unigram_dict.values()))
counts = Counter({word: freq / unigram_total for word, freq in unigram_dict.items()})

counts

Counter({'asian': 5.3811733441508716e-05,
         'exporters': 0.00021524693376603486,
         'fear': 3.311491288708229e-05,
         'damage': 0.00012004155921567329,
         'from': 0.005666789467801956,
         'u': 0.004623669711858864,
         's': 0.011855138813575458,
         'japan': 0.0018254595729004111,
         'rift': 4.139364110885286e-06,
         'mounting': 2.0696820554426428e-05,
         'trade': 0.002272510896876022,
         'friction': 3.311491288708229e-05,
         'between': 0.0007906185451790896,
         'the': 0.05172549392962253,
         'and': 0.01903693554596143,
         'has': 0.004031740644002268,
         'raised': 0.00028975548776197,
         'fears': 5.3811733441508716e-05,
         'among': 0.00018213202087895256,
         'many': 0.00022352566198780545,
         'of': 0.02761369798371574,
         'asia': 5.7951097552394e-05,
         "'": 0.008667828448193788,
         'exporting': 4.967236933062343e-05,
         'nations': 0.00029389485

## **Find the probability of occurrence of a word given the previous two words**

In [27]:
# Turn the follower counts of each (w1, w2) context into conditional
# probabilities P(w3 | w1, w2) by dividing by that context's total count.
#
# The per-context total uses its own name (`context_total`) so that the global
# `total_count` (the corpus-wide token total computed earlier) is not silently
# overwritten by this loop.
for followers in model.values():
  context_total = float(sum(followers.values()))
  # Only existing keys are reassigned, so mutating while iterating is safe.
  for w3 in followers:
    followers[w3] /= context_total

In [28]:
# Inspect the model again: the inner values are now probabilities, not counts.
model

defaultdict(<function __main__.<lambda>()>,
            {('asian',
              'exporters'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'fear': 1.0}),
             ('exporters',
              'fear'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'damage': 1.0}),
             ('fear',
              'damage'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'from': 1.0}),
             ('damage',
              'from'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'u': 0.5,
                          'local': 0.5}),
             ('from',
              'u'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'s': 0.8333333333333334,
                          'k': 0.16666666666666666}),
             ('u',
              's'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'japan': 0.006586169045005488,
                          'and': 0.03732162458836443,
                          'move': 0.001

## **Examples**

In [29]:
# Probability of each next word given the context "japan has".
# Note: `model` is a defaultdict, so indexing a pair that was never seen
# would insert an empty entry for it.
dict(model['japan','has'])

{'raised': 0.058823529411764705,
 'been': 0.11764705882352941,
 'the': 0.058823529411764705,
 'little': 0.11764705882352941,
 'violated': 0.058823529411764705,
 'said': 0.17647058823529413,
 'come': 0.058823529411764705,
 'no': 0.17647058823529413,
 'rejected': 0.058823529411764705,
 'far': 0.058823529411764705,
 'resulted': 0.058823529411764705}

In [30]:
# Probability of each next word given the context "to go".
# Note: `model` is a defaultdict, so indexing a pair that was never seen
# would insert an empty entry for it.
dict(model['to','go'])

{'ahead': 0.10526315789473684,
 'on': 0.10526315789473684,
 'under': 0.05263157894736842,
 'against': 0.10526315789473684,
 'unnoticed': 0.05263157894736842,
 'into': 0.15789473684210525,
 'to': 0.05263157894736842,
 'down': 0.05263157894736842,
 'through': 0.05263157894736842,
 'up': 0.05263157894736842,
 'away': 0.05263157894736842,
 'for': 0.05263157894736842,
 'public': 0.10526315789473684}

## **Recommending next word**

In [31]:
# Most probable next word after "japan has": take the (word, probability)
# pair with the highest probability and keep the word.
max(model['japan','has'].items(), key = lambda pair : pair[1])[0]

'said'

In [32]:
# Most probable next word after "to go": take the (word, probability)
# pair with the highest probability and keep the word.
max(model[('to','go')].items(), key = lambda pair : pair[1])[0]

'into'

## **Sample Recommender**

In [33]:
# Interactive demo: read a sentence and suggest the most probable next word
# given its last two words.
samp_sentence = input('Enter a sentence to find next word : ')

# Clean the input the same way as the training sentences (letters, apostrophes
# and spaces only, lowercased); otherwise trailing punctuation or digits would
# produce tokens that can never match a model key.
tokens = re.sub("[^a-zA-Z' ]", "", samp_sentence).lower().split()

if len(tokens) < 2:
  # The trigram model needs a two-word context; avoid an IndexError.
  print('Please enter at least two words.')
else:
  context = (tokens[-2], tokens[-1])
  # .get() does not trigger the defaultdict factory, so unseen contexts are not
  # inserted into the model as empty entries.
  candidates = model.get(context)
  if not candidates:
    # max() on an empty mapping would raise ValueError.
    print('No prediction available for : ', ' '.join(context))
  else:
    print('Next word : ', max(candidates, key = candidates.get))

Enter a sentence to find next word : This has to be
Next word :  acquired
