In [27]:
import re
from tqdm import tqdm 
import numpy as np
import pandas as pd

In [4]:
with open('big.txt','r') as fd:
    lines = fd.readlines()
    
    words = []
    for line in lines: 
        words += re.findall('\w+',line.lower())

## Finding the pairs

In [10]:
def get_pairs(words):
    
    data = []

    for i in range(len(words)-1):
        data.append(' '.join(words[i:i+2]))
        
    return data

In [11]:
data = get_pairs(words)

In [12]:
len(data)

1115584

In [13]:
data[:10]

['the project',
 'project gutenberg',
 'gutenberg ebook',
 'ebook of',
 'of the',
 'the adventures',
 'adventures of',
 'of sherlock',
 'sherlock holmes',
 'holmes by']

## Finding Occurence Probabilities

In [21]:
## takes a lot of time (keyboard interrupt)

print(len(data))
unique_pairs = list(set(data))
print(len(unique_pairs))

prob_dist = []

for pair in tqdm(unique_pairs):
    prob_dist.append([pair,data.count(pair)])

1115584
390694


  0%|          | 1003/390694 [00:31<3:26:19, 31.48it/s]


KeyboardInterrupt: 

In [23]:
a = np.array(data)

pair, count = np.unique(a,return_counts=True)
print(len(pair))

390694


In [24]:
pair[:10]

array(['0 05', '0 25', '0 45', '0 5', '0 6', '0 7', '0 9', '0 i', '00 99',
       '00 went'], dtype='<U30')

In [25]:
count[:10]

array([1, 1, 1, 1, 4, 1, 1, 1, 2, 1], dtype=int64)

In [26]:
prob_dist = []

for i in range(len(pair)):
    prob_dist.append([pair[i],count[i],pair[i].split(' ')[-1]])
    
print(len(prob_dist))

390694


## Predicting the Words

In [39]:
df = pd.DataFrame(prob_dist,columns=['pair', 'freq' , 'output'])
df = df[df['freq']>=5]
df.head()

## Probabilistic Method to define next word

def predict(word):
    df_pred = []

    for i in df.values:
        if i[0].split(' ')[0] == word:
            df_pred.append([i[0],i[1],i[2]])

    df_pred = pd.DataFrame(df_pred, columns = ['in', 'freq', 'out'])
    return (list(df_pred.sort_values(by = 'freq', ascending= False).head()['out'].values))

In [40]:
predict('the')

['same', 'french', 'first', 'old', 'emperor']

In [41]:
predict('same')

['time', 'way', 'as', 'lines', 'thing']

In [42]:
word = 'the'

for i in range(10):
    pred = predict(word)
    word = pred[0]
    
    print(word, end = ' ')

same time to the same time to the same time 

In [44]:
word = 'this' 
preds = []
preds.append(word)

for i in range(5):
    pred = predict(word)
    print(pred)
    word = pred[int(input('Enter the index : '))]
    preds.append(word)
    
print(' '.join(preds))

['is', 'was', 'way', 'and', 'time']
Enter the index : 2
['of', 'to', 'and', 'the', 'in']
Enter the index : 2
['the', 'a', 'in', 'that', 'he']
Enter the index : 2
['the', 'a', 'his', 'which', 'this']
Enter the index : 2
['eyes', 'head', 'own', 'face', 'wife']
Enter the index : 2
this way and in his own


# Bi-Gram , Tri-Gram & N-Gram

In [47]:
with open('big.txt','r') as fd:
    lines = fd.readlines()
    
    words = []
    for line in lines: 
        words += re.findall('\w+',line.lower())
        
def get_pairs(words, n):
    
    n = n+1
    data = []

    for i in range(len(words)-n):
        data.append(' '.join(words[i:i+n]))
        
    return data

In [50]:
data = get_pairs(words,3)
data

['the project gutenberg ebook',
 'project gutenberg ebook of',
 'gutenberg ebook of the',
 'ebook of the adventures',
 'of the adventures of',
 'the adventures of sherlock',
 'adventures of sherlock holmes',
 'of sherlock holmes by',
 'sherlock holmes by sir',
 'holmes by sir arthur',
 'by sir arthur conan',
 'sir arthur conan doyle',
 'arthur conan doyle 15',
 'conan doyle 15 in',
 'doyle 15 in our',
 '15 in our series',
 'in our series by',
 'our series by sir',
 'series by sir arthur',
 'by sir arthur conan',
 'sir arthur conan doyle',
 'arthur conan doyle copyright',
 'conan doyle copyright laws',
 'doyle copyright laws are',
 'copyright laws are changing',
 'laws are changing all',
 'are changing all over',
 'changing all over the',
 'all over the world',
 'over the world be',
 'the world be sure',
 'world be sure to',
 'be sure to check',
 'sure to check the',
 'to check the copyright',
 'check the copyright laws',
 'the copyright laws for',
 'copyright laws for your',
 'laws for

## Finding Occurence Probabilities

In [53]:
a = np.array(data)

pair, count = np.unique(a,return_counts=True)
print(len(data))
unique_pairs = list(set(data))
print(len(unique_pairs))

prob_dist = []

for i in range(len(unique_pairs)):
    prob_dist.append([' '.join(unique_pairs[i].split(' ')[:-1]),unique_pairs[i].split(' ')[-1], count[i]])
    
print('Probability Distribution: ' , len(prob_dist))

1115581
1032700
Probability Distribution:  1032700


# Predicting the word

In [56]:
df = pd.DataFrame(prob_dist,columns=['in', 'out' , 'freq'])
# df = df[df['freq']>=5]
# df.head()

## Probabilistic Method to define next word

def get_prediction(word):
    if len(df[df['in'] == word]):
        df_ = df[df['in']==word]
        return list(df_.sort_values(by='freq',ascending = False).head()['out'].values)
    else:
        print('Seq is not present')

In [57]:
get_prediction('this is not')

['at', 'immediately', 'strictly', 'a', 'such']

In [58]:
get_prediction('this is a')

['certain', 'comparatively', 'great', 'map', 'nervous']

# Prediction with Auto sequencing

In [66]:
word = 'ebook of the'

output = []
output.append(word)

for i in range(50):
    
    pred = get_prediction(word)
    if len(word.split(' ')) > 1 and pred:
        word = ' '.join(word.split(' ')[1:]) + ' ' + pred[0]
        output.append(pred[0])
    else:
        break
    
print(' '.join(output))

ebook of the adventures of sherlock holmes this is wanting in the police court i could hardly imagine a more damning case i remarked if it is sufficiently vascular to nourish the grafts placed on it his small leather gloved hands this short man nodded to dolgorukov as to an intimate acquaintance anna
