# Next Word Prediction

#### Steps to build the next word recommender system:

1. Loading and exploring the dataset
2. Creating N-grams of the dialogue
3. Building the N-gram Language Model
4. Predicting the next word using N-gram Language Model

### 1. Loading and exploring the dataset

In [1]:
# loading the required libraries
import pandas as pd
import numpy as np
import re
import pickle
import random
from tqdm import tqdm

In [2]:
# loading dataset
data = pd.read_csv(r"C:\Users\TheWhiteWolf\NLP\Module1\Projects\Next_Word_Project\Next_Word\sample_reuters_dataset.csv")
data.head()

Unnamed: 0,sentence_number,sentence_text
0,0,ASIAN EXPORTERS FEAR DAMAGE FROM U . S .- JAPA...
1,1,They told Reuter correspondents in Asian capit...
2,2,But some exporters said that while the conflic...
3,3,The U . S . Has said it will impose 300 mln dl...
4,4,Unofficial Japanese estimates put the impact o...


In [3]:
data['sentence_text'].head()

0    ASIAN EXPORTERS FEAR DAMAGE FROM U . S .- JAPA...
1    They told Reuter correspondents in Asian capit...
2    But some exporters said that while the conflic...
3    The U . S . Has said it will impose 300 mln dl...
4    Unofficial Japanese estimates put the impact o...
Name: sentence_text, dtype: object

In [4]:
# number of text sequences
len(data)

10000

In [5]:
data.shape

(10000, 2)

In [6]:
# function to preprocess (text cleaning)
def clean(text):
    # remove everything except alphabets, ' and white spaces
    text = re.sub("[^a-zA-Z' ]", "", text)
    # convert text to lowercase
    text = text.lower()
    
    text = re.sub(" '", "'", text)
    text = re.sub("' ", "'", text)
    text = re.sub("u  s", "us", text)
    
    return text

In [7]:
# preprocessing speeches
dialogs_clean = data['sentence_text'].apply(clean)

In [8]:
type(dialogs_clean)

pandas.core.series.Series

In [9]:
dialogs_clean[0]

"asian exporters fear damage from us  japan rift mounting trade friction between the us  and japan has raised fears among many of asia's exporting nations that the row could inflict far  reaching economic damage  businessmen and officials said "

In [10]:
random.sample(list(dialogs_clean), 10)

["usda estimates european community crops the us  agriculture department forecast the european community's    wheat crop at    mln tonnes  vs    mln tonnes last month ",
 'spotty germination and winterkill in those fields averaged  to  pct  it said ',
 ' however  that is the normal wording and we expect the hague court to refer questions on the interpretation and application of the levy to the european court of justice in luxembourg  pex added ',
 'garo armen  an analyst with dean witter reynolds  said chemical makers have also benefitted by increasing demand for plastics as prices become more competitive with aluminum  wood and steel products ',
 'arrangements for financing have not yet been made and there can be no assurance that any financing will be received  hmo said ',
 "officials said the complementary character of the two firms'operations was a further reason ",
 'european markets react quietly to g   communique european currency markets reacted quietly to the g   communique  w

In [11]:
# creating the vocabulary
# get list of all the words
all_words = " ".join(dialogs_clean).split()

words_dict = {}

# add word-count pair to the dictionary
for word in all_words:   
    # check if the word is already in dictionary 
    if word in words_dict:
        # increment count of word by 1 
        words_dict[word] = words_dict[word] + 1
    else:
        # add the word to dictionary with count 1 
        words_dict[word] = 1

In [27]:
# word dictionary
for x in list(words_dict.items())[0:5]:
    print(x)

('asian', 13)
('exporters', 49)
('fear', 8)
('damage', 29)
('from', 1369)


In [30]:
# prepare a dataframe
words_df = pd.DataFrame({'word':list(words_dict.keys()), 'count':list(words_dict.values())})

# sort words by their count in increasing order
words_df = words_df.sort_values(by = ['count'])

# reset dataframe index
words_df.reset_index(inplace = True, drop=True)

In [31]:
# words with least frequency
words_df.head()

Unnamed: 0,word,count
0,compute,1
1,baggage,1
2,conergic,1
3,foster's,1
4,improper,1


In [32]:
# words with highest frequency
words_df.tail()

Unnamed: 0,word,count
13165,said,4649
13166,in,5070
13167,to,6337
13168,of,6670
13169,the,12496


In [33]:
# vocabulary size
len(words_df)

13170

### 2. Creating N-grams of the dialogue

In [34]:
# creating an empty dataframe
dataset = pd.DataFrame()

# adding cleaned sentences in the dataframe
dataset['Sentences'] = dialogs_clean

# first 20 cleaned sentences
dataset.head(20)

Unnamed: 0,Sentences
0,asian exporters fear damage from us japan rif...
1,they told reuter correspondents in asian capit...
2,but some exporters said that while the conflic...
3,the us has said it will impose mln dlrs of t...
4,unofficial japanese estimates put the impact o...
5,we wouldn't be able to do business said a sp...
6,if the tariffs remain in place for any length...
7,in taiwan businessmen and officials are also ...
8,we are aware of the seriousness of the us
9,threat against japan because it serves as a wa...


In [35]:
# using .split() to get tokens from the sentence
dataset['Sentences'][0].split()

['asian',
 'exporters',
 'fear',
 'damage',
 'from',
 'us',
 'japan',
 'rift',
 'mounting',
 'trade',
 'friction',
 'between',
 'the',
 'us',
 'and',
 'japan',
 'has',
 'raised',
 'fears',
 'among',
 'many',
 'of',
 "asia's",
 'exporting',
 'nations',
 'that',
 'the',
 'row',
 'could',
 'inflict',
 'far',
 'reaching',
 'economic',
 'damage',
 'businessmen',
 'and',
 'officials',
 'said']

In [36]:
# function to create unigrams
# taking a sentence as input
def create_unigram(sentence):
    # creating tokens from the sentence
    tokens = sentence.split()
    # empty list to store the unigrams
    unigram_list = []
    # number of unigrams is equal to the number of tokens in the sentence
    for i in range(len(tokens)):
        # appending each unigram in the list
        unigram_list.append(tokens[i:i+1])
    # returning the unigram list for a sentence    
    return unigram_list

In [37]:
# function to create bigrams
def create_bigram(sentence):
    tokens = sentence.split()
    bigram_list = []
    # number of bigrams is one less than the number of tokens in the sentence
    for i in range(len(tokens)-1):
        bigram_list.append(tokens[i:i+2])
    return bigram_list

In [38]:
# function to create trigrams
def create_trigram(sentence):
    tokens = sentence.split()
    trigram_list = []
    # number of trigrams is two less than the number of tokens in the sentence
    for i in range(len(tokens)-2):
        trigram_list.append(tokens[i:i+3])
    return trigram_list

In [39]:
# creating unigrams for all the sentences in the dataset 
final_unigram = []
# for each sentence
for i in range(dataset.shape[0]):
    # using the defined unigram function to create unigrams
    final_unigram.append(create_unigram(dataset['Sentences'][i]))

# adding the unigram in a seperate column in the dataset
dataset['unigram'] = final_unigram

In [40]:
# creating bigrams for all the sentences in the dataset
final_bigram = []
for i in range(dataset.shape[0]):
    final_bigram.append(create_bigram(dataset['Sentences'][i]))

dataset['bigram'] = final_bigram

In [41]:
# creating trigrams for all the sentences in the dataset
final_trigram = []
for i in range(dataset.shape[0]):
    final_trigram.append(create_trigram(dataset['Sentences'][i]))

dataset['trigram'] = final_trigram

In [42]:
# first 20 rows of the dataset
dataset.head(20)

Unnamed: 0,Sentences,unigram,bigram,trigram
0,asian exporters fear damage from us japan rif...,"[[asian], [exporters], [fear], [damage], [from...","[[asian, exporters], [exporters, fear], [fear,...","[[asian, exporters, fear], [exporters, fear, d..."
1,they told reuter correspondents in asian capit...,"[[they], [told], [reuter], [correspondents], [...","[[they, told], [told, reuter], [reuter, corres...","[[they, told, reuter], [told, reuter, correspo..."
2,but some exporters said that while the conflic...,"[[but], [some], [exporters], [said], [that], [...","[[but, some], [some, exporters], [exporters, s...","[[but, some, exporters], [some, exporters, sai..."
3,the us has said it will impose mln dlrs of t...,"[[the], [us], [has], [said], [it], [will], [im...","[[the, us], [us, has], [has, said], [said, it]...","[[the, us, has], [us, has, said], [has, said, ..."
4,unofficial japanese estimates put the impact o...,"[[unofficial], [japanese], [estimates], [put],...","[[unofficial, japanese], [japanese, estimates]...","[[unofficial, japanese, estimates], [japanese,..."
5,we wouldn't be able to do business said a sp...,"[[we], [wouldn't], [be], [able], [to], [do], [...","[[we, wouldn't], [wouldn't, be], [be, able], [...","[[we, wouldn't, be], [wouldn't, be, able], [be..."
6,if the tariffs remain in place for any length...,"[[if], [the], [tariffs], [remain], [in], [plac...","[[if, the], [the, tariffs], [tariffs, remain],...","[[if, the, tariffs], [the, tariffs, remain], [..."
7,in taiwan businessmen and officials are also ...,"[[in], [taiwan], [businessmen], [and], [offici...","[[in, taiwan], [taiwan, businessmen], [busines...","[[in, taiwan, businessmen], [taiwan, businessm..."
8,we are aware of the seriousness of the us,"[[we], [are], [aware], [of], [the], [seriousne...","[[we, are], [are, aware], [aware, of], [of, th...","[[we, are, aware], [are, aware, of], [aware, o..."
9,threat against japan because it serves as a wa...,"[[threat], [against], [japan], [because], [it]...","[[threat, against], [against, japan], [japan, ...","[[threat, against, japan], [against, japan, be..."


In [43]:
# sample sentence
dataset['Sentences'][0]

"asian exporters fear damage from us  japan rift mounting trade friction between the us  and japan has raised fears among many of asia's exporting nations that the row could inflict far  reaching economic damage  businessmen and officials said "

In [44]:
# unigram of the sentence
dataset['unigram'][0]

[['asian'],
 ['exporters'],
 ['fear'],
 ['damage'],
 ['from'],
 ['us'],
 ['japan'],
 ['rift'],
 ['mounting'],
 ['trade'],
 ['friction'],
 ['between'],
 ['the'],
 ['us'],
 ['and'],
 ['japan'],
 ['has'],
 ['raised'],
 ['fears'],
 ['among'],
 ['many'],
 ['of'],
 ["asia's"],
 ['exporting'],
 ['nations'],
 ['that'],
 ['the'],
 ['row'],
 ['could'],
 ['inflict'],
 ['far'],
 ['reaching'],
 ['economic'],
 ['damage'],
 ['businessmen'],
 ['and'],
 ['officials'],
 ['said']]

In [45]:
# bigram of the sentence
dataset['bigram'][0]

[['asian', 'exporters'],
 ['exporters', 'fear'],
 ['fear', 'damage'],
 ['damage', 'from'],
 ['from', 'us'],
 ['us', 'japan'],
 ['japan', 'rift'],
 ['rift', 'mounting'],
 ['mounting', 'trade'],
 ['trade', 'friction'],
 ['friction', 'between'],
 ['between', 'the'],
 ['the', 'us'],
 ['us', 'and'],
 ['and', 'japan'],
 ['japan', 'has'],
 ['has', 'raised'],
 ['raised', 'fears'],
 ['fears', 'among'],
 ['among', 'many'],
 ['many', 'of'],
 ['of', "asia's"],
 ["asia's", 'exporting'],
 ['exporting', 'nations'],
 ['nations', 'that'],
 ['that', 'the'],
 ['the', 'row'],
 ['row', 'could'],
 ['could', 'inflict'],
 ['inflict', 'far'],
 ['far', 'reaching'],
 ['reaching', 'economic'],
 ['economic', 'damage'],
 ['damage', 'businessmen'],
 ['businessmen', 'and'],
 ['and', 'officials'],
 ['officials', 'said']]

In [46]:
# trigram of the sentence
dataset['trigram'][0]

[['asian', 'exporters', 'fear'],
 ['exporters', 'fear', 'damage'],
 ['fear', 'damage', 'from'],
 ['damage', 'from', 'us'],
 ['from', 'us', 'japan'],
 ['us', 'japan', 'rift'],
 ['japan', 'rift', 'mounting'],
 ['rift', 'mounting', 'trade'],
 ['mounting', 'trade', 'friction'],
 ['trade', 'friction', 'between'],
 ['friction', 'between', 'the'],
 ['between', 'the', 'us'],
 ['the', 'us', 'and'],
 ['us', 'and', 'japan'],
 ['and', 'japan', 'has'],
 ['japan', 'has', 'raised'],
 ['has', 'raised', 'fears'],
 ['raised', 'fears', 'among'],
 ['fears', 'among', 'many'],
 ['among', 'many', 'of'],
 ['many', 'of', "asia's"],
 ['of', "asia's", 'exporting'],
 ["asia's", 'exporting', 'nations'],
 ['exporting', 'nations', 'that'],
 ['nations', 'that', 'the'],
 ['that', 'the', 'row'],
 ['the', 'row', 'could'],
 ['row', 'could', 'inflict'],
 ['could', 'inflict', 'far'],
 ['inflict', 'far', 'reaching'],
 ['far', 'reaching', 'economic'],
 ['reaching', 'economic', 'damage'],
 ['economic', 'damage', 'businessmen'

### 3. Building the N-gram Language Model

In [47]:
# for defining the N-gram model
from collections import Counter, defaultdict

# Create a placeholder for model
model = defaultdict(lambda: defaultdict(lambda: 0))

# Count frequency of co-occurance  
for i in range(dataset.shape[0]):
    # for each trigram pair
    for w1, w2, w3 in create_trigram(dataset['Sentences'][i]):
        # count the occurance of word 3, given word 1 and word 2
        model[(w1, w2)][w3] += 1

In [51]:
# defined model
for x in list(model.items())[0:5]:
    print(x)

(('asian', 'exporters'), defaultdict(<function <lambda>.<locals>.<lambda> at 0x00000234F33560D0>, {'fear': 1}))
(('exporters', 'fear'), defaultdict(<function <lambda>.<locals>.<lambda> at 0x00000234F3356700>, {'damage': 1}))
(('fear', 'damage'), defaultdict(<function <lambda>.<locals>.<lambda> at 0x00000234F33563A0>, {'from': 1}))
(('damage', 'from'), defaultdict(<function <lambda>.<locals>.<lambda> at 0x00000234F3356280>, {'us': 1, 'local': 1}))
(('from', 'us'), defaultdict(<function <lambda>.<locals>.<lambda> at 0x00000234F3356DC0>, {'japan': 1, 'might': 1, 'ports': 1, 'it': 1, 'plants': 1, "video's": 1}))


### 4. Predicting the next word using N-gram Language Model

In [52]:
# predict the next word
dict(model["between", "the"])

{'us': 5,
 'start': 1,
 'two': 18,
 'employers': 1,
 'united': 4,
 'demands': 1,
 'countries': 1,
 'ec': 1,
 'dollar': 1,
 'white': 1,
 'prime': 3,
 'report': 2,
 'increased': 1,
 'secured': 1,
 'pork': 1,
 'omani': 1,
 'group': 1,
 'purchase': 1,
 'federal': 1,
 'first': 1,
 'heaviest': 1,
 'growth': 1}

In [53]:
# another example
dict(model["from", "us"])

{'japan': 1, 'might': 1, 'ports': 1, 'it': 1, 'plants': 1, "video's": 1}

In [54]:
# another example
dict(model["us", "and"])

{'japan': 6,
 'lead': 1,
 'new': 1,
 'japanese': 3,
 'rising': 1,
 'britain': 2,
 'europe': 2,
 'west': 3,
 'australian': 1,
 'world': 1,
 'european': 1,
 'canada': 2,
 'up': 1,
 'the': 1,
 'hong': 1,
 'its': 1,
 'other': 2,
 'malaysia': 1,
 'tight': 1,
 'on': 1,
 'canadian': 1,
 'raise': 1,
 'we': 1}

In [55]:
# another example
dict(model["short", "term"])

{"tokyo's": 1,
 'commercial': 1,
 'view': 1,
 'fluctuations': 2,
 'interest': 3,
 'impact': 1,
 'refers': 1,
 'differences': 1,
 'discount': 10,
 'investment': 1,
 'potential': 1,
 'given': 1,
 'cut': 1,
 'money': 1,
 'recovery': 1,
 'rates': 2,
 'rally': 1,
 'rate': 2,
 'treasury': 1,
 'foreign': 1,
 'boost': 1}

### Probabilistic Output

In [56]:
# creating the unigram list
unigram_dict = {}
for i in tqdm(range(dataset.shape[0])):
    # add word-count pair to the dictionary
    for word in dataset['unigram'][i]:   
        # check if the word is already in dictionary 
        if word[0] in unigram_dict:
            # increment count of word by 1 
            unigram_dict[word[0]] = unigram_dict[word[0]] + 1
        else:
            # add the word to dictionary with count 1 
            unigram_dict[word[0]] = 1

100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 40699.86it/s]


In [57]:
# unigram list
for x in list(unigram_dict.items())[0:5]:
    print(x)

('asian', 13)
('exporters', 49)
('fear', 8)
('damage', 29)
('from', 1369)


In [64]:
# find the overall frequency of words in the corpus
counts = Counter(unigram_dict)

In [65]:
for x in list(counts.items())[0:5]:
    print(x)

('asian', 13)
('exporters', 49)
('fear', 8)
('damage', 29)
('from', 1369)


In [66]:
# vocabulary size
total_count = len(unigram_dict)
total_count

13170

In [76]:
# relative frequencies of each word
for word in counts:
    counts[word] /= float(total_count)

list(counts.items())[0:5]

[('asian', 3.281062305721333e-20),
 ('exporters', 1.23670809984881e-19),
 ('fear', 2.019115265059282e-20),
 ('damage', 7.319292835839896e-20),
 ('from', 3.4552109973326964e-18)]

In [77]:
# Let's transform the counts to probabilities
for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count

In [78]:
# predict the next word
dict(model["between", "the"])

{'us': 0.10204081632653059,
 'start': 0.020408163265306117,
 'two': 0.36734693877551017,
 'employers': 0.020408163265306117,
 'united': 0.08163265306122447,
 'demands': 0.020408163265306117,
 'countries': 0.020408163265306117,
 'ec': 0.020408163265306117,
 'dollar': 0.020408163265306117,
 'white': 0.020408163265306117,
 'prime': 0.06122448979591835,
 'report': 0.040816326530612235,
 'increased': 0.020408163265306117,
 'secured': 0.020408163265306117,
 'pork': 0.020408163265306117,
 'omani': 0.020408163265306117,
 'group': 0.020408163265306117,
 'purchase': 0.020408163265306117,
 'federal': 0.020408163265306117,
 'first': 0.020408163265306117,
 'heaviest': 0.020408163265306117,
 'growth': 0.020408163265306117}

In [79]:
# another example
dict(model["us", "and"])

{'japan': 0.16666666666666663,
 'lead': 0.02777777777777777,
 'new': 0.02777777777777777,
 'japanese': 0.08333333333333331,
 'rising': 0.02777777777777777,
 'britain': 0.05555555555555554,
 'europe': 0.05555555555555554,
 'west': 0.08333333333333331,
 'australian': 0.02777777777777777,
 'world': 0.02777777777777777,
 'european': 0.02777777777777777,
 'canada': 0.05555555555555554,
 'up': 0.02777777777777777,
 'the': 0.02777777777777777,
 'hong': 0.02777777777777777,
 'its': 0.02777777777777777,
 'other': 0.05555555555555554,
 'malaysia': 0.02777777777777777,
 'tight': 0.02777777777777777,
 'on': 0.02777777777777777,
 'canadian': 0.02777777777777777,
 'raise': 0.02777777777777777,
 'we': 0.02777777777777777}

In [80]:
# another example
dict(model["short", "term"])

{"tokyo's": 0.02857142857142857,
 'commercial': 0.02857142857142857,
 'view': 0.02857142857142857,
 'fluctuations': 0.05714285714285714,
 'interest': 0.08571428571428572,
 'impact': 0.02857142857142857,
 'refers': 0.02857142857142857,
 'differences': 0.02857142857142857,
 'discount': 0.2857142857142857,
 'investment': 0.02857142857142857,
 'potential': 0.02857142857142857,
 'given': 0.02857142857142857,
 'cut': 0.02857142857142857,
 'money': 0.02857142857142857,
 'recovery': 0.02857142857142857,
 'rates': 0.05714285714285714,
 'rally': 0.02857142857142857,
 'rate': 0.05714285714285714,
 'treasury': 0.02857142857142857,
 'foreign': 0.02857142857142857,
 'boost': 0.02857142857142857}