<a href="https://colab.research.google.com/github/aditichak22/nlp-bigram/blob/main/Bigram_smoothing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the training corpus is later opened
# from a path underneath this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import re
import pandas as pd


def readFromTrainingData(path='/content/drive/My Drive/train.txt'):
  """Read the training corpus and return it as one flat list of tokens.

  Every line is split on whitespace and only tokens whose first character
  is a word character (regex ``\\w``) are kept, so stand-alone punctuation
  such as "," or "." is dropped. Line boundaries are not preserved.

  Args:
    path: Location of the training text file. Defaults to the path on the
      mounted Google Drive that the notebook originally hard-coded.

  Returns:
    A list of token strings in corpus order.
  """
  # Raw string: "\w" in a normal literal is an invalid escape sequence.
  wordPattern = re.compile(r"\w")

  data = []
  with open(path, 'r', encoding='utf-8') as f:
    # Stream line by line instead of loading the whole file with readlines().
    for sentence in f:
      data.extend(word for word in sentence.split() if wordPattern.match(word))
  return data




def createBigramCount(data):
  """Count unigrams and adjacent-token bigrams in a token sequence.

  Args:
    data: List of tokens in corpus order.

  Returns:
    A tuple ``(bigramList, unigramCount, bigramCount)`` where
    ``bigramList`` is every adjacent ``(word, nextWord)`` pair in order
    (duplicates included), ``unigramCount`` maps each token to its number
    of occurrences, and ``bigramCount`` maps each pair to its number of
    occurrences.
  """
  bigramList = list(zip(data, data[1:]))

  # Count every token, including the final one. Looping only over bigram
  # start positions would leave the last token out of the counts (and out
  # of the vocabulary entirely if it occurs nowhere else).
  unigramCount = {}
  for word in data:
    unigramCount[word] = unigramCount.get(word, 0) + 1

  bigramCount = {}
  for bigram in bigramList:
    bigramCount[bigram] = bigramCount.get(bigram, 0) + 1

  return bigramList, unigramCount, bigramCount



def calculateTrainingPorbability(bigramList, unigramCount, bigramCount):
  """Compute the conditional probability of each bigram in ``bigramList``.

  For a pair ``(first, second)`` the value is
  ``bigramCount[pair] / unigramCount[first]``.

  Args:
    bigramList: Iterable of ``(first, second)`` token pairs; repeats are
      allowed and simply map to the same entry.
    unigramCount: Mapping from token to the count used as denominator.
    bigramCount: Mapping from pair to the count used as numerator.

  Returns:
    A dict mapping each pair to its probability.
  """
  return {
    pair: bigramCount.get(pair) / unigramCount.get(pair[0])
    for pair in bigramList
  }


def readTestData(sentence):
  """Tokenise a test sentence.

  The sentence is split on whitespace and only tokens whose first
  character is a word character (regex ``\\w``) are kept, so stand-alone
  punctuation tokens are dropped.

  Args:
    sentence: The raw sentence string.

  Returns:
    A list of the retained tokens in their original order.
  """
  # Raw string: "\w" in a normal literal is an invalid escape sequence.
  wordPattern = re.compile(r"\w")
  return [word for word in sentence.split() if wordPattern.match(word)]

def bigramTestData(data, bigramCount, unigramCount, trainingProbability):
  """Look up training statistics for the bigrams of one test sentence.

  Args:
    data: Tokens of the test sentence, in order.
    bigramCount: Mapping from ``(first, second)`` pair to its (possibly
      smoothed) training count.
    unigramCount: Mapping from token to its (possibly smoothed) training
      count.
    trainingProbability: Mapping from pair to its training probability.

  Returns:
    A tuple ``(unigramTestCount, bigramTestCount, bigramTestProbability,
    sentenceProbability)``:

    * ``unigramTestCount`` - training count of every token that starts a
      bigram in the sentence (0 if absent from ``unigramCount``).
    * ``bigramTestCount`` - training count of every sentence bigram (0 if
      absent from ``bigramCount``).
    * ``bigramTestProbability`` - bigram count divided by the count of its
      first token; 0.0 when that token's count is 0.
    * ``sentenceProbability`` - product of ``trainingProbability`` over
      all sentence bigrams; a bigram missing from that mapping
      contributes a factor of 0. It is 1 when there are no bigrams.
  """
  bigramTestList = list(zip(data, data[1:]))
  bigramTestCount = {}
  unigramTestCount = {}
  bigramTestProbability = {}
  sentenceProbability = 1

  for bigram in bigramTestList:
    firstWord = bigram[0]

    # Plain look-ups: a word or bigram that repeats inside the test
    # sentence must keep its training count rather than being incremented
    # on each repeat.
    historyCount = unigramCount.get(firstWord, 0)
    pairCount = bigramCount.get(bigram, 0)
    unigramTestCount[firstWord] = historyCount
    bigramTestCount[bigram] = pairCount

    # A first word with count 0 would otherwise divide by zero.
    bigramTestProbability[bigram] = pairCount / historyCount if historyCount else 0.0

    # NOTE: bigrams absent from trainingProbability zero the whole product,
    # even when the caller passes smoothed probabilities.
    sentenceProbability *= trainingProbability.get(bigram, 0)

  return unigramTestCount, bigramTestCount, bigramTestProbability, sentenceProbability




def addOneSmoothingTraining(unigramCount, bigramCount, bigramList):
  """Apply add-one smoothing to the training counts.

  Each unigram count is increased by the vocabulary size (the number of
  keys in ``unigramCount``) and each bigram count by one; probabilities
  for ``bigramList`` are then recomputed from the adjusted counts.

  Args:
    unigramCount: Mapping from token to raw count.
    bigramCount: Mapping from pair to raw count.
    bigramList: The pairs for which probabilities are wanted.

  Returns:
    A tuple ``(smoothUnigram, smoothBigram, smoothProbability)``.
  """
  vocabularySize = len(unigramCount)

  smoothUnigram = {
    token: count + vocabularySize for token, count in unigramCount.items()
  }
  smoothBigram = {pair: count + 1 for pair, count in bigramCount.items()}

  smoothProbability = calculateTrainingPorbability(
    bigramList, smoothUnigram, smoothBigram)
  return smoothUnigram, smoothBigram, smoothProbability


def createBigramCountTable(bigramTestCount, data, smoothing):
  """Build a word-by-word table of bigram counts for a test sentence.

  Args:
    bigramTestCount: Mapping from ``(first, second)`` pair to its count.
    data: Tokens of the sentence; used for both row and column labels
      (rows are the first word, columns the second).
    smoothing: 0 fills cells that have no entry with 0; any other value
      fills them with 1.

  Returns:
    A pandas DataFrame indexed and columned by ``data``.
  """
  fillValue = 0 if smoothing == 0 else 1
  # Create the frame already filled, rather than building an all-NaN object
  # frame and relying on fillna() to downcast it to integers.
  df = pd.DataFrame(fillValue, index=data, columns=data)
  for bigram, count in bigramTestCount.items():
    df.loc[bigram[0], bigram[1]] = count
  return df

def createBigramProbabilityTable(bigramTestProbability, data):
  """Build a word-by-word table of bigram probabilities for a sentence.

  Args:
    bigramTestProbability: Mapping from ``(first, second)`` pair to its
      probability.
    data: Tokens of the sentence; used for both row and column labels
      (rows are the first word, columns the second).

  Returns:
    A pandas DataFrame indexed and columned by ``data`` in which cells
    without an entry hold 0.0.
  """
  # Create the frame already filled with 0.0, rather than building an
  # all-NaN object frame and relying on fillna() to downcast it to floats.
  df1 = pd.DataFrame(0.0, index=data, columns=data)
  for bigram, value in bigramTestProbability.items():
    df1.loc[bigram[0], bigram[1]] = value
  return df1


def createSmoothBigramProbabilityTable(bigramTestProbability, data, smoothUnigram):
  """Build the bigram probability table for the add-one smoothed model.

  Cells that have an entry in ``bigramTestProbability`` take that value.
  Every other cell in a row is filled with ``1 / smoothUnigram[rowWord]``.

  Args:
    bigramTestProbability: Mapping from ``(first, second)`` pair to its
      probability.
    data: Tokens of the sentence; used for both row and column labels.
    smoothUnigram: Mapping from token to its smoothed unigram count.

  Returns:
    A pandas DataFrame indexed and columned by ``data``.
  """
  df1 = pd.DataFrame(columns=data, index=data)
  for bigram in bigramTestProbability:
    df1.loc[bigram[0], bigram[1]] = bigramTestProbability.get(bigram)

  # Fallback denominator for a row word missing from smoothUnigram.
  # Assumes smoothUnigram has one entry per vocabulary word (raw count plus
  # vocabulary size), so a word with raw count 0 would have a smoothed
  # count equal to len(smoothUnigram).
  vocabularySize = len(smoothUnigram)
  columnCount = len(df1.columns)

  for i, rowName in enumerate(data):
    denominator = smoothUnigram.get(rowName, vocabularySize)
    if not denominator:
      # Nothing sensible to divide by (empty vocabulary); leave cells empty.
      continue
    newValue = 1 / denominator
    for j in range(columnCount):
      if pd.isnull(df1.iloc[i, j]):
        df1.iloc[i, j] = newValue
  return df1


def main(smoothing):
  """Print bigram count/probability tables for two fixed test sentences.

  The training corpus is read and counted once. For each sentence the
  bigram count table, the bigram probability table and the sentence
  probability are then shown.

  Args:
    smoothing: 0 uses the unsmoothed counts; 1 uses add-one smoothed
      counts. For any other value nothing is printed.

  Note:
    ``display`` is not defined in this file; it is presumably the
    IPython/Colab notebook built-in - confirm before running as a script.
  """
  sentences = ["mark antony , heere take you caesars body : you shall not come to them poet .", 
  "no , sir , there are no comets seen , the heauens speede thee in thine enterprize ."]

  # The training statistics do not depend on the test sentence, so compute
  # them once instead of re-reading the corpus for every sentence.
  bigramList, unigramCount, bigramCount = createBigramCount(readFromTrainingData())
  if (smoothing == 0):
    probability = calculateTrainingPorbability(bigramList, unigramCount, bigramCount)
  elif (smoothing == 1):
    smoothUnigram, smoothBigram, smoothProbability = addOneSmoothingTraining(unigramCount, bigramCount, bigramList)

  for sentence in sentences:
    data = readTestData(sentence)
    if (smoothing == 0):
      unigramTestCount, bigramTestCount, bigramTestProbability, sentenceProbability = bigramTestData(data, bigramCount, unigramCount, probability)
      print("For the sentence: " + sentence)
      print("\nBigram count table without add one smoothing")
      display(createBigramCountTable(bigramTestCount, data, smoothing ))
      print("\nBigram probability table without add one smoothing")
      display(createBigramProbabilityTable(bigramTestProbability, data))
      print("\nSentence probability is ", sentenceProbability)

    elif (smoothing == 1):
      unigramTestCount, bigramTestCount, bigramTestProbability, sentenceProbability = bigramTestData(data, smoothBigram, smoothUnigram, smoothProbability)
      print("For the sentence: " + sentence)
      print("\nBigram count table with add one smoothing")
      display(createBigramCountTable(bigramTestCount, data, smoothing))
      print("\nBigram probability table with add one smoothing")
      display(createSmoothBigramProbabilityTable(bigramTestProbability, data, smoothUnigram))
      print("\nSentence probability with add one smoothing is ", sentenceProbability)


# Run the demo with add-one smoothing (1); pass 0 for the unsmoothed model.
main(1)
    

# if __name__ == '__main__':
#   import argparse
#   parser = argparse.ArgumentParser()
#   parser.add_argument('--smoothing', type=int, metavar='path', required=True,
#                         help='1 for smoothing, 0 for no smoothing')
#   args = parser.parse_args()
#   main(args.smoothing)


For the sentence: mark antony , heere take you caesars body : you shall not come to them poet .

Bigram count table with add one smoothing


Unnamed: 0,mark,antony,heere,take,you,caesars,body,you.1,shall,not,come,to,them,poet
mark,1,14,1,1,1,1,1,1,1,1,1,1,1,1
antony,1,1,2,1,1,1,1,1,1,1,1,1,1,1
heere,1,1,1,3,1,1,1,1,1,1,1,1,1,1
take,1,1,1,1,2,1,1,2,1,1,1,1,1,1
you,1,1,1,1,1,2,1,1,17,1,1,1,1,1
caesars,1,1,1,1,1,1,3,1,1,1,1,1,1,1
body,1,1,1,1,2,1,1,2,1,1,1,1,1,1
you,1,1,1,1,1,2,1,1,17,1,1,1,1,1
shall,1,1,1,1,1,1,1,1,1,11,1,1,1,1
not,1,1,1,1,1,1,1,1,1,1,8,1,1,1



Bigram probability table with add one smoothing


Unnamed: 0,mark,antony,heere,take,you,caesars,body,you.1,shall,not,come,to,them,poet
mark,0.000329707,0.00461589,0.000329707,0.000329707,0.000329707,0.000329707,0.000329707,0.000329707,0.000329707,0.000329707,0.000329707,0.000329707,0.000329707,0.000329707
antony,0.000323206,0.000323206,0.000646412,0.000323206,0.000323206,0.000323206,0.000323206,0.000323206,0.000323206,0.000323206,0.000323206,0.000323206,0.000323206,0.000323206
heere,0.000324886,0.000324886,0.000324886,0.000974659,0.000324886,0.000324886,0.000324886,0.000324886,0.000324886,0.000324886,0.000324886,0.000324886,0.000324886,0.000324886
take,0.000328731,0.000328731,0.000328731,0.000328731,0.000657462,0.000328731,0.000328731,0.000657462,0.000328731,0.000328731,0.000328731,0.000328731,0.000328731,0.000328731
you,0.000293255,0.000293255,0.000293255,0.000293255,0.000293255,0.000586338,0.000293255,0.000293255,0.00498388,0.000293255,0.000293255,0.000293255,0.000293255,0.000293255
caesars,0.000327118,0.000327118,0.000327118,0.000327118,0.000327118,0.000327118,0.000981354,0.000327118,0.000327118,0.000327118,0.000327118,0.000327118,0.000327118,0.000327118
body,0.000329815,0.000329815,0.000329815,0.000329815,0.000659631,0.000329815,0.000329815,0.000659631,0.000329815,0.000329815,0.000329815,0.000329815,0.000329815,0.000329815
you,0.000293255,0.000293255,0.000293255,0.000293255,0.000293255,0.000586338,0.000293255,0.000293255,0.00498388,0.000293255,0.000293255,0.000293255,0.000293255,0.000293255
shall,0.000318066,0.000318066,0.000318066,0.000318066,0.000318066,0.000318066,0.000318066,0.000318066,0.000318066,0.00349873,0.000318066,0.000318066,0.000318066,0.000318066
not,0.00030525,0.00030525,0.00030525,0.00030525,0.00030525,0.00030525,0.00030525,0.00030525,0.00030525,0.00030525,0.002442,0.00030525,0.00030525,0.00030525



Sentence probability with add one smoothing is  8.995392906234366e-38
For the sentence: no , sir , there are no comets seen , the heauens speede thee in thine enterprize .

Bigram count table with add one smoothing


Unnamed: 0,no,sir,there,are,no.1,comets,seen,the,heauens,speede,thee,in,thine,enterprize
no,1,3,1,1,1,2,1,1,1,1,1,1,1,1
sir,1,1,2,1,1,1,1,1,1,1,1,1,1,1
there,1,1,1,4,1,1,1,1,1,1,1,1,1,1
are,3,1,1,1,3,1,1,1,1,1,1,1,1,1
no,1,3,1,1,1,2,1,1,1,1,1,1,1,1
comets,1,1,1,1,1,1,2,1,1,1,1,1,1,1
seen,1,1,1,1,1,1,1,2,1,1,1,1,1,1
the,1,1,1,1,1,1,1,1,6,1,1,1,1,1
heauens,1,1,1,1,1,1,1,1,1,2,1,1,1,1
speede,1,1,1,1,1,1,1,1,1,1,2,1,1,1



Bigram probability table with add one smoothing


Unnamed: 0,no,sir,there,are,no.1,comets,seen,the,heauens,speede,thee,in,thine,enterprize
no,0.000321337,0.000963701,0.000321337,0.000321337,0.000321337,0.000642467,0.000321337,0.000321337,0.000321337,0.000321337,0.000321337,0.000321337,0.000321337,0.000321337
sir,0.000328731,0.000328731,0.000657462,0.000328731,0.000328731,0.000328731,0.000328731,0.000328731,0.000328731,0.000328731,0.000328731,0.000328731,0.000328731,0.000328731
there,0.000325415,0.000325415,0.000325415,0.00130166,0.000325415,0.000325415,0.000325415,0.000325415,0.000325415,0.000325415,0.000325415,0.000325415,0.000325415,0.000325415
are,0.000959079,0.000319693,0.000319693,0.000319693,0.000959079,0.000319693,0.000319693,0.000319693,0.000319693,0.000319693,0.000319693,0.000319693,0.000319693,0.000319693
no,0.000321337,0.000963701,0.000321337,0.000321337,0.000321337,0.000642467,0.000321337,0.000321337,0.000321337,0.000321337,0.000321337,0.000321337,0.000321337,0.000321337
comets,0.000331126,0.000331126,0.000331126,0.000331126,0.000331126,0.000331126,0.000662252,0.000331126,0.000331126,0.000331126,0.000331126,0.000331126,0.000331126,0.000331126
seen,0.000331126,0.000331126,0.000331126,0.000331126,0.000331126,0.000331126,0.000331126,0.000662252,0.000331126,0.000331126,0.000331126,0.000331126,0.000331126,0.000331126
the,0.000277932,0.000277932,0.000277932,0.000277932,0.000277932,0.000277932,0.000277932,0.000277932,0.00166759,0.000277932,0.000277932,0.000277932,0.000277932,0.000277932
heauens,0.000330688,0.000330688,0.000330688,0.000330688,0.000330688,0.000330688,0.000330688,0.000330688,0.000330688,0.000661376,0.000330688,0.000330688,0.000330688,0.000330688
speede,0.000331016,0.000331016,0.000331016,0.000331016,0.000331016,0.000331016,0.000331016,0.000331016,0.000331016,0.000331016,0.000662032,0.000331016,0.000331016,0.000331016



Sentence probability with add one smoothing is  6.482117024550917e-41
