In [3]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
import string

[nltk_data] Downloading package stopwords to /Users/kedar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from collections import defaultdict, Counter


In [5]:
from pathlib import Path
Path.cwd()

PosixPath('/Users/kedar')

In [6]:
with open("train.txt", 'r') as file:
    lines = [line.rstrip() for line in file]

len(lines)

test_sentence = lines[0]
test_sentence = "I booked 2 rooms four months in advance at the Talbott * . We were placed on the 1st floor next to the elevators , which are used all night long ."

In [7]:
df = pd.DataFrame(lines, columns=['reviews'])
df.tail()

Unnamed: 0,reviews
507,Swissotel continues to be a *yawn* As previous...
508,My husband & I stayed at the Fitzpatrick in ea...
509,I stayed at the Hilton Chicago last week and w...
510,Just back from 5 night stay at Omni and would ...
511,We booked our hotel stay thru Yahoo and reques...


In [8]:
df.isnull().sum()

reviews    0
dtype: int64

In [9]:
stop_words = set(stopwords.words("english"))


In [10]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /Users/kedar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /Users/kedar/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [12]:
#remove punctuations and digit
filtered_sent = test_sentence.translate(str.maketrans('', '', string.punctuation))
filtered_sent = ''.join([char for char in filtered_sent if not char.isdigit()])
filtered_sent

'I booked  rooms four months in advance at the Talbott   We were placed on the st floor next to the elevators  which are used all night long '

In [13]:
word_tokens = word_tokenize(filtered_sent)
print(word_tokens)

['I', 'booked', 'rooms', 'four', 'months', 'in', 'advance', 'at', 'the', 'Talbott', 'We', 'were', 'placed', 'on', 'the', 'st', 'floor', 'next', 'to', 'the', 'elevators', 'which', 'are', 'used', 'all', 'night', 'long']


In [14]:
#removing stop words
filtered_tokens = [w for w in word_tokens if not w.lower() in stop_words]
filtered_sent = ' '.join(filtered_tokens)
print(filtered_sent)

booked rooms four months advance Talbott placed st floor next elevators used night long


In [15]:
#stemming
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stem_tokens = [stemmer.stem(word) for word in filtered_tokens]
print(stem_tokens)

['book', 'room', 'four', 'month', 'advanc', 'talbott', 'place', 'st', 'floor', 'next', 'elev', 'use', 'night', 'long']


In [16]:
#lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
lem_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print(lem_tokens)

[nltk_data] Downloading package wordnet to /Users/kedar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['booked', 'room', 'four', 'month', 'advance', 'Talbott', 'placed', 'st', 'floor', 'next', 'elevator', 'used', 'night', 'long']


In [17]:
from collections import Counter


In [18]:
#make a function to remove stop words and convert to lower case
def preprocessing(sentence):
    #print(type(sentence))
    filtered_sent = sentence.translate(str.maketrans('', '', string.punctuation)) #remove punctuations
    filtered_sent = ''.join([char for char in filtered_sent if not char.isdigit()]) #remove digits
    word_tokens = word_tokenize(filtered_sent) #tokenize
    filtered_tokens = [w for w in word_tokens if not w.lower() in stop_words] #removing stopwords
    stem_tokens = [stemmer.stem(word) for word in filtered_tokens] #stemming
    
    filtered_sent = ' '.join(stem_tokens) #joining the tokens
    return filtered_sent



In [19]:
#creating a vocab from training data
def handle_unk(data, threshold=1):
    word_counts = Counter()
    for sentences in data:
        sentences = preprocessing(sentences)
        sentences = sentences.lower().split()
        word_counts.update(sentences)

    vocab = {word for word, count in word_counts.items() if count > threshold}
    return word_counts, vocab

word_counts, vocab = handle_unk(lines,1)

In [20]:
#replacing unknown words with a token <unk>
def replace_unk(sentences, vocab=vocab):

    sentences = preprocessing(sentences)
    sentences = sentences.lower().split()

    x = [word if word in vocab else "unkown_word" for word in sentences]
    x = ' '.join(x)
    return x
# replace unk in training set too
lines = [replace_unk(sentence) for sentence in lines]

In [21]:
test_sentence_clean = preprocessing(test_sentence)
print("Before preprocessing:", test_sentence)
print("After preprocessing:", test_sentence_clean)

Before preprocessing: I booked 2 rooms four months in advance at the Talbott * . We were placed on the 1st floor next to the elevators , which are used all night long .
After preprocessing: book room four month advanc talbott place st floor next elev use night long


In [22]:
df['clean'] = df['reviews'].apply(lambda x: preprocessing(x))
# add a new method to apply replace_unk to all the values in df['clean']
df['clean'].iloc[0]

'book two room four month advanc talbott place top floor next elev use night long speak front desk told simpli honor request upper floor request better view look brick wall get sleep also told receiv complaint guest th floor awar nois problem place us floor hotel total book request upper floor constitut place someon top floor use request justifi decid stay request room lower floor away elev spoke length book two room prefer simpli poor treatment guest believ would complain'

Unigram counts!

In [23]:
# Dictionary to store unigrams
unigrams = defaultdict(int)

# Function to calculate unigrams for a sentence using nltk tokenizer
def calculate_unigrams(sentence):
    tokens = nltk.word_tokenize(sentence)  # Tokenize the sentence using nltk
    unigram_counts = Counter(tokens)  # Count the frequency of each token
    for token, count in unigram_counts.items():
        unigrams[token] += count  # Add the count to the unigrams dictionary

# Apply the function to each sentence in the 'clean' column
df['unigram_counts'] =  df['clean'].apply(lambda x: calculate_unigrams(x))

In [24]:
print(len(unigrams))

4268


In [25]:
from collections import Counter, defaultdict
test_sentence = lines[510]
#test_sentence = "the students like the assignment"
print(test_sentence)

back night stay omni would thoroughli recommend staff bent backward assist anyth matter minut great locat magnific mile lot adult unkown_word daughter find fault would extra charg tax etc room servic bar restaur cheap day would definit recommend especi famili


In [26]:

clean_test_sent = preprocessing(test_sentence)
test_tokens = nltk.word_tokenize(test_sentence)
print(test_tokens)

['back', 'night', 'stay', 'omni', 'would', 'thoroughli', 'recommend', 'staff', 'bent', 'backward', 'assist', 'anyth', 'matter', 'minut', 'great', 'locat', 'magnific', 'mile', 'lot', 'adult', 'unkown_word', 'daughter', 'find', 'fault', 'would', 'extra', 'charg', 'tax', 'etc', 'room', 'servic', 'bar', 'restaur', 'cheap', 'day', 'would', 'definit', 'recommend', 'especi', 'famili']


Bigram Counts!

In [27]:
bigram_probabs = defaultdict(dict)

In [28]:
bigrams = defaultdict(Counter)

def calc_bigrams(sentence):
    tokens = nltk.word_tokenize(sentence)
    
    # Update bigram counts
    for i in range(len(tokens) - 1):
        w1, w2 = tokens[i], tokens[i + 1]
        bigrams[w1][w2] += 1  # Increment the count for the bigram (w1, w2)


bigrams_col = df['clean'].apply(lambda x: pd.Series(calc_bigrams(x)))

In [29]:
vocab_size = len(unigrams)

def calc_laplace_ngrams():
    k =1  # Smaller constant for smoothing
    for w1 in bigrams:
        total_w1 = unigrams[w1]  # Count of the first word in the bigram
        for w2 in bigrams[w1]:
            # Correct Laplace smoothing formula with smaller constant
            #bigram_probabs[w1][w2] = (bigrams[w1][w2] + k) / (total_w1 + k * vocab_size)  
            bigram_probabs[w1][w2] = (bigrams[w1][w2] +k) / (total_w1 +k*vocab_size )  

        # Calculate probabilities for unobserved bigrams
        for w2 in unigrams:  # Iterate over the vocabulary
            if w2 not in bigrams[w1]:  # If the bigram (w1, w2) is not observed
                bigram_probabs[w1][w2] = k / (total_w1 + k*vocab_size)  # Laplace smoothing for unobserved bigrams

    # Create a list of bigrams with probabilities
    list_bigrams = [f"P({w2}|{w1}) = {bigram_probabs[w1][w2]:.4f}" for w1 in bigram_probabs for w2 in bigram_probabs[w1]]
    return list_bigrams


In [30]:
with open("val.txt", 'r') as file:
    test_lines = [line.rstrip() for line in file]

len(test_lines)

70

In [31]:
df_test = pd.DataFrame(test_lines, columns=['reviews'])
df_test.tail()

Unnamed: 0,reviews
65,My husband and I were very excited to be stayi...
66,Had a week long stay at the Hilton on south Mi...
67,"If you love Brick Walls and Alleyways , then t..."
68,Booked a room w/ a queen bed for 2 nights for ...
69,Stayed there three nights from 4/17/09 through...


In [32]:
df_test['clean'] = df_test['reviews'].apply(lambda x: preprocessing(x))
df_test['filtered'] = df_test['clean'].apply(lambda x: replace_unk(x,vocab))
df_test.tail() #test data with all unknown words handled

Unnamed: 0,reviews,clean,filtered
65,My husband and I were very excited to be stayi...,husband excit stay conrad unfortun would never...,husband excit stay conrad unfortun would never...
66,Had a week long stay at the Hilton on south Mi...,week long stay hilton south michigan attend me...,week long stay hilton south michigan attend me...
67,"If you love Brick Walls and Alleyways , then t...",love brick wall alleyway view ask room face ri...,love brick wall unkown_word view ask room face...
68,Booked a room w/ a queen bed for 2 nights for ...,book room w queen bed night wonder nye rather ...,book room w queen bed night wonder unkown_word...
69,Stayed there three nights from 4/17/09 through...,stay three night checheck wait minut special r...,stay three night unkown_word wait minut specia...


In [33]:
unk_counts = df_test['filtered'].apply(lambda x: x.count('<unk>'))
print(df_test['filtered'].tail())

65    husband excit stay conrad unfortun would never...
66    week long stay hilton south michigan attend me...
67    love brick wall unkown_word view ask room face...
68    book room w queen bed night wonder unkown_word...
69    stay three night unkown_word wait minut specia...
Name: filtered, dtype: object


In [34]:
# Define a small constant probability for <unk> tokens
import math
UNK_PROB = 1e-3  # You can adjust this value as needed

def calc_probabilities(sentence):
    tokens = nltk.word_tokenize(sentence)

    # Calculate unigram probabilities for the sentence
    # Calculate bigram probabilities for the sentence
    log_combined_prob = 0
    for i in range(len(tokens) - 1):
        w1 = tokens[i]
        w2 = tokens[i + 1]
        # Check bigram probability with fallback for unknown tokens
        if w1 in bigram_probabs and w2 in bigram_probabs[w1]:
            prob = bigram_probabs[w1][w2]
        else:
            prob = UNK_PROB  # Use small constant probability for unknown bigram

        log_combined_prob += math.log(prob)

    # Convert log probability back to probability

    return log_combined_prob


#result = calc_probabilities('book two room four month advanc talbott place top floor next elev use night long speak front desk told simpli honor request upper floor request better view look brick wall get sleep also told receiv complaint guest th floor awar nois problem place us floor hotel total book request upper floor constitut place someon top floor use request justifi decid stay request room lower floor away elev spoke length book two room prefer simpli poor treatment guest believ would complain')


In [35]:
df_test['log_prob_laplace'] = df_test['filtered'].apply(lambda x: pd.Series(calc_probabilities(x)))

In [36]:
df_test['log_prob_laplace']

0    -283.217966
1    -310.848988
2    -366.111030
3    -676.960017
4    -504.266135
         ...    
65   -628.605730
66   -462.819604
67   -414.465317
68   -511.173891
69   -172.693882
Name: log_prob_laplace, Length: 70, dtype: float64

In [37]:
def calculate_perplexity(df_test, col):
    # Sum of log probabilities (already computed in 'log_prob_laplace')
    total_log_prob = df_test[col].sum()

    # Total number of tokens in the dataset (assuming 'filtered' column has the sentences)
    #total_tokens = df_test['filtered'].apply(lambda x: len(nltk.word_tokenize(x))).sum()
    total_tokens = 9811
    # Calculate perplexity
    perplexity = 2**(-total_log_prob / total_tokens)
    
    return perplexity

# Call the function to calculate the perplexity for all sentences


In [38]:
laplace_perplexity = calculate_perplexity(df_test, 'log_prob_laplace')

In [39]:
print("Laplace perplexity: ", laplace_perplexity)

Laplace perplexity:  7.922823656032202


In [49]:
kn_probs = defaultdict(lambda: defaultdict(float))
def calc_kneser_ney_prob(token, d=0.75):
    # Count of the bigram (w1, w2)
    token = token.split(' ')
    for i in range(len(token)-1):
        w1=token[i]
        w2=token[i+1]
        bigram_count = bigrams[w1][w2]

        # Count of the unigram w1
        unigram_count = unigrams[w1]

        # Continuation probability P(w2)
        continuation_count = sum(1 for w in bigrams if w2 in bigrams[w])
        total_bigrams = sum(len(bigrams[w1]) for w1 in bigrams)  # Total number of unique bigrams
        continuation_prob = continuation_count / total_bigrams if total_bigrams > 0 else 0

        # Calculate Kneser-Ney probability
        if bigram_count > 0:
        # Normal bigram probability with discount
            kneser_ney_prob = max(bigram_count - d, 0) / unigram_count + (d * continuation_prob) / total_bigrams
        else:
        # Backoff to continuation probability
            kneser_ney_prob = (d * continuation_prob) / total_bigrams


        kn_probs[w1][w2] = kneser_ney_prob



In [50]:
    df['kn'] = df['clean'].apply(lambda x:calc_kneser_ney_prob(x))

In [51]:
# Define a small constant probability for <unk> tokens
import math
UNK_PROB = 1e-3  # You can adjust this value as needed

def calc_kn_probabilities(sentence):
    tokens = nltk.word_tokenize(sentence)

    # Initialize the log combined probability
    log_combined_prob = 0
    
    calc_kneser_ney_prob(sentence)  
    # Calculate bigram probabilities for the sentence using Kneser-Ney
    for i in range(len(tokens) - 1):
        w1 = tokens[i]
        w2 = tokens[i + 1]

        # Call the Kneser-Ney function to get the probability for the bigram (w1, w2)
        if w1 == 'unkown_word' or w2 == 'unkown_word':
            prob = UNK_PROB  # Use a small constant probability for any unknown token bigram
        else:
            # For known tokens, calculate the Kneser-Ney probability for the bigram (w1, w2)
            # Ensure the Kneser-Ney probabilities are calculated
            prob = kn_probs[w1][w2]  # Retrieve the Kneser-Ney probability
        if prob == 0:
            print(tokens)
            print(sentence)
        log_combined_prob += math.log(prob)



    # Return the combined log probability
    return log_combined_prob


#result = calc_probabilities('book two room four month advanc talbott place top floor next elev use night long speak front desk told simpli honor request upper floor request better view look brick wall get sleep also told receiv complaint guest th floor awar nois problem place us floor hotel total book request upper floor constitut place someon top floor use request justifi decid stay request room lower floor away elev spoke length book two room prefer simpli poor treatment guest believ would complain')


In [52]:
df_test['log_prob_kn'] = df_test['filtered'].apply(lambda x: pd.Series(calc_kn_probabilities(x)))

In [53]:
df_test['log_prob_kn']

0     -398.652625
1     -488.906002
2     -621.026972
3     -903.615911
4     -693.060096
         ...     
65   -1011.585753
66    -662.674746
67    -670.617220
68    -855.337985
69    -255.918032
Name: log_prob_kn, Length: 70, dtype: float64

In [54]:
kn_perplexity = calculate_perplexity(df_test, 'log_prob_kn')


In [55]:
print("Kneser ney perplexity: ", kn_perplexity)

Kneser ney perplexity:  27.196438409134966


In [59]:
#add k smoothing
vocab_size = len(unigrams)
bigram_probabs_addk=defaultdict(dict)
def calc_addk_ngrams():
    k =0.01  # Smaller constant for smoothing
    for w1 in bigrams:
        total_w1 = unigrams[w1]  # Count of the first word in the bigram
        for w2 in bigrams[w1]:
            # Correct addl smoothing formula with smaller constant
            bigram_probabs_addk[w1][w2] = (bigrams[w1][w2] +k) / (total_w1 +k*vocab_size )  

        # Calculate probabilities for unobserved bigrams
        for w2 in unigrams:  # Iterate over the vocabulary
            if w2 not in bigrams[w1]:  # If the bigram (w1, w2) is not observed
                bigram_probabs_addk[w1][w2] = k / (total_w1 + k*vocab_size)  # Laplace smoothing for unobserved bigrams

    # Create a list of bigrams with probabilities
    list_bigrams = [f"P({w2}|{w1}) = {bigram_probabs_addk[w1][w2]:.4f}" for w1 in bigram_probabs_addk for w2 in bigram_probabs_addk[w1]]
    return list_bigrams


In [60]:
calc_addk_ngrams()

['P(two|book) = 0.0187',
 'P(request|book) = 0.0063',
 'P(ambassador|book) = 0.0063',
 'P(us|book) = 0.0187',
 'P(flight|book) = 0.0063',
 'P(pick|book) = 0.0063',
 'P(even|book) = 0.0063',
 'P(via|book) = 0.0187',
 'P(hotel|book) = 0.1307',
 'P(upgrad|book) = 0.0063',
 'P(pricelin|book) = 0.0250',
 'P(reserv|book) = 0.0125',
 'P(amalfi|book) = 0.0125',
 'P(directli|book) = 0.0125',
 'P(room|book) = 0.1121',
 'P(block|book) = 0.0063',
 'P(suit|book) = 0.0063',
 'P(anoth|book) = 0.0063',
 'P(allegro|book) = 0.0063',
 'P(guess|book) = 0.0063',
 'P(solid|book) = 0.0063',
 'P(share|book) = 0.0063',
 'P(dinner|book) = 0.0063',
 'P(thought|book) = 0.0063',
 'P(omni|book) = 0.0063',
 'P(packag|book) = 0.0125',
 'P(five|book) = 0.0063',
 'P(one|book) = 0.0063',
 'P(teen|book) = 0.0063',
 'P(told|book) = 0.0063',
 'P(conrad|book) = 0.0063',
 'P(group|book) = 0.0063',
 'P(sofitel|book) = 0.0063',
 'P(stay|book) = 0.0187',
 'P(cd|book) = 0.0063',
 'P(dumpi|book) = 0.0063',
 'P(internet|book) = 0.

In [61]:
# Define a small constant probability for <unk> tokens
import math
UNK_PROB = 1e-3  # You can adjust this value as needed

def calc_addk_probabilities(sentence):
    tokens = nltk.word_tokenize(sentence)

    # Calculate unigram probabilities for the sentence
    # Calculate bigram probabilities for the sentence
    log_combined_prob = 0
    for i in range(len(tokens) - 1):
        w1 = tokens[i]
        w2 = tokens[i + 1]
        # Check bigram probability with fallback for unknown tokens
        if w1 in bigram_probabs_addk and w2 in bigram_probabs_addk[w1]:
            prob = bigram_probabs_addk[w1][w2]
        else:
            prob = UNK_PROB  # Use small constant probability for unknown bigram

        log_combined_prob += math.log(prob)

    # Convert log probability back to probability

    return log_combined_prob


#result = calc_probabilities('book two room four month advanc talbott place top floor next elev use night long speak front desk told simpli honor request upper floor request better view look brick wall get sleep also told receiv complaint guest th floor awar nois problem place us floor hotel total book request upper floor constitut place someon top floor use request justifi decid stay request room lower floor away elev spoke length book two room prefer simpli poor treatment guest believ would complain')


In [62]:
df_test['log_prob_addk'] = df_test['filtered'].apply(lambda x: pd.Series(calc_addk_probabilities(x)))

In [63]:
addk_perplexity = calculate_perplexity(df_test, 'log_prob_addk')

In [64]:
print("Add-K perplexity: ", addk_perplexity)

Add-K perplexity:  8.886155978047483
