In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset into a pandas dataframe
df = pd.read_csv('rt_reviews.csv' , encoding='iso-8859-1')

# Split the dataset into train and test sets
train, test = train_test_split(df, test_size=0.3, random_state=42)

# Split the test set into test and development sets
test, development = train_test_split(test, test_size=0.5, random_state=42)

# Save the train, development, and test sets to separate CSV files 
train.to_csv('train.csv', index=False)
development.to_csv('development.csv', index=False)
test.to_csv('test.csv', index=False)


In [2]:
# Import the required libraries
import string

# Create an empty list to store the vocabulary
vocab = []

# Iterate over each row in the "Review" column
for row in train['Review']:
    # Convert the text to lowercase
    row = row.lower()
    # Remove punctuation
    row = row.translate(str.maketrans('', '', string.punctuation))
    # Split the text into individual words
    words = row.split()
    # Add the words to the vocabulary
    vocab.extend(words)

# Remove duplicates from the vocabulary
vocab = list((vocab))


In [3]:
vocab

['does',
 'one',
 'really',
 'have',
 'to',
 'so',
 'debase',
 'christs',
 'story',
 'in',
 'order',
 'to',
 'make',
 'it',
 'relevant',
 'to',
 'todays',
 'audiences',
 'people',
 'richer',
 'than',
 'you',
 'get',
 'paid',
 'to',
 'come',
 'up',
 'with',
 'this',
 'stuff',
 'while',
 'it',
 'doesnt',
 'exactly',
 'take',
 'your',
 'breath',
 'away',
 'this',
 'small',
 'comingofage',
 'story',
 'will',
 'make',
 'you',
 'smile',
 'about',
 'how',
 'film',
 'can',
 'romanticise',
 'even',
 'the',
 'slightest',
 'of',
 'ideas',
 'one',
 'really',
 'bad',
 'movie',
 'folks',
 'dont',
 'be',
 'a',
 'fool',
 'and',
 'waste',
 'your',
 'time',
 'and',
 'money',
 'on',
 'it',
 'full',
 'content',
 'review',
 'for',
 'parents',
 'sex',
 'profanity',
 'nudity',
 'etc',
 'also',
 'available',
 'all',
 'hat',
 'no',
 'tentacles',
 'last',
 'flag',
 'flying',
 'lacks',
 'the',
 'casual',
 'livedin',
 'realism',
 'you',
 'usually',
 'find',
 'in',
 'a',
 'linklater',
 'film',
 'you',
 'dont',
 'b

In [4]:
# Calculate the number of occurrences of the word "does" in the vocabulary
num_occurrences = vocab.count("does")
print("The word 'does' appears", num_occurrences, "times in the vocabulary.")


The word 'does' appears 7488 times in the vocabulary.


In [5]:
# Import the required libraries
import string
from collections import Counter

# Create empty dictionaries to store word counts and freshness counts
word_counts = {}
freshness_counts = {'fresh': 0, 'rotten': 0}

# Iterate over each row in the train dataframe
for index, row in train.iterrows():
    # Get the Freshness and Review values for the row
    freshness = row['Freshness']
    review = row['Review']
    # Convert the text to lowercase
    review = review.lower()
    # Remove punctuation
    review = review.translate(str.maketrans('', '', string.punctuation))
    # Split the text into individual words
    words = review.split()
    # Update the word and freshness counts
    for word in words:
        if word in word_counts:
            word_counts[word][freshness] += 1
        else:
            word_counts[word] = {'fresh': 0, 'rotten': 0}
            word_counts[word][freshness] += 1
    freshness_counts[freshness] += 1

# Calculate the total number of reviews
total_reviews = freshness_counts['fresh'] + freshness_counts['rotten']

# Calculate the conditional probability of each word
word_probs = {}
for word in word_counts:
    fresh_prob = word_counts[word]['fresh'] / freshness_counts['fresh']
    rotten_prob = word_counts[word]['rotten'] / freshness_counts['rotten']
    word_probs[word] = {'fresh': fresh_prob, 'rotten': rotten_prob}

# Calculate the prior probabilities of Freshness
fresh_prior_prob = freshness_counts['fresh'] / total_reviews
rotten_prior_prob = freshness_counts['rotten'] / total_reviews


In [6]:
word_counts

{'does': {'fresh': 4000, 'rotten': 3488},
 'one': {'fresh': 14406, 'rotten': 11911},
 'really': {'fresh': 2701, 'rotten': 4136},
 'have': {'fresh': 8996, 'rotten': 12882},
 'to': {'fresh': 73877, 'rotten': 85280},
 'so': {'fresh': 8431, 'rotten': 12010},
 'debase': {'fresh': 1, 'rotten': 3},
 'christs': {'fresh': 4, 'rotten': 2},
 'story': {'fresh': 9553, 'rotten': 8473},
 'in': {'fresh': 56484, 'rotten': 50397},
 'order': {'fresh': 328, 'rotten': 377},
 'make': {'fresh': 4877, 'rotten': 5837},
 'it': {'fresh': 42605, 'rotten': 44924},
 'relevant': {'fresh': 290, 'rotten': 98},
 'todays': {'fresh': 198, 'rotten': 93},
 'audiences': {'fresh': 1197, 'rotten': 1124},
 'people': {'fresh': 2784, 'rotten': 2667},
 'richer': {'fresh': 83, 'rotten': 26},
 'than': {'fresh': 10134, 'rotten': 13056},
 'you': {'fresh': 16240, 'rotten': 14947},
 'get': {'fresh': 3194, 'rotten': 3933},
 'paid': {'fresh': 91, 'rotten': 137},
 'come': {'fresh': 2005, 'rotten': 1979},
 'up': {'fresh': 6775, 'rotten': 8

In [7]:
word_probs

{'does': {'fresh': 0.023809098647047968, 'rotten': 0.020762275516824705},
 'one': {'fresh': 0.08574846877734327, 'rotten': 0.07090007559658804},
 'really': {'fresh': 0.01607709386141914, 'rotten': 0.02461948725274856},
 'have': {'fresh': 0.05354666285721088, 'rotten': 0.07667994071322702},
 'to': {'fresh': 0.4397361951869907, 'rotten': 0.5076281124067692},
 'so': {'fresh': 0.050183627673315356, 'rotten': 0.07148937183402085},
 'debase': {'fresh': 5.952274661761993e-06, 'rotten': 1.7857461740388223e-05},
 'christs': {'fresh': 2.380909864704797e-05, 'rotten': 1.1904974493592147e-05},
 'story': {'fresh': 0.056862079843812315, 'rotten': 0.05043542444210313},
 'in': {'fresh': 0.3362082819949644, 'rotten': 0.29998749977678174},
 'order': {'fresh': 0.0019523460890579335, 'rotten': 0.00224408769204212},
 'make': {'fresh': 0.029029243525413237, 'rotten': 0.034744668059548685},
 'it': {'fresh': 0.2535966619643697, 'rotten': 0.2674095370750668},
 'relevant': {'fresh': 0.0017261596519109778, 'rott

In [8]:
fresh_prior_prob

0.5000089285714285

In [9]:
rotten_prior_prob

0.4999910714285714

In [10]:
# Create a list to store the predicted Freshness values
predictions = []

# Iterate over each row in the dev dataframe
for index, row in development.iterrows():
    # Get the Review text for the row
    review = row['Review']
    # Convert the text to lowercase
    review = review.lower()
    # Remove punctuation
    review = review.translate(str.maketrans('', '', string.punctuation))
    # Split the text into individual words
    words = review.split()
    # Initialize the probabilities for each Freshness value
    fresh_prob = fresh_prior_prob
    rotten_prob = rotten_prior_prob
    # Update the probabilities based on the words in the review
    for word in words:
        if word in word_probs:
            fresh_prob *= word_probs[word]['fresh']
            rotten_prob *= word_probs[word]['rotten']
    # Determine the predicted Freshness value
    if fresh_prob > rotten_prob:
        predictions.append('fresh')
    else:
        predictions.append('rotten')

# Calculate the accuracy of the predictions
correct_predictions = (predictions == development['Freshness']).sum()
total_predictions = len(predictions)
accuracy = correct_predictions / total_predictions
print(f'Dev accuracy: {accuracy:.2%}')


Dev accuracy: 80.20%


In [11]:
predictions

['rotten',
 'rotten',
 'fresh',
 'fresh',
 'fresh',
 'rotten',
 'fresh',
 'fresh',
 'fresh',
 'fresh',
 'fresh',
 'rotten',
 'fresh',
 'rotten',
 'fresh',
 'fresh',
 'fresh',
 'fresh',
 'fresh',
 'fresh',
 'fresh',
 'rotten',
 'fresh',
 'fresh',
 'rotten',
 'rotten',
 'fresh',
 'fresh',
 'fresh',
 'rotten',
 'fresh',
 'fresh',
 'fresh',
 'rotten',
 'rotten',
 'fresh',
 'rotten',
 'rotten',
 'rotten',
 'fresh',
 'rotten',
 'rotten',
 'fresh',
 'fresh',
 'fresh',
 'rotten',
 'fresh',
 'fresh',
 'rotten',
 'fresh',
 'rotten',
 'rotten',
 'rotten',
 'rotten',
 'fresh',
 'fresh',
 'rotten',
 'rotten',
 'fresh',
 'rotten',
 'rotten',
 'rotten',
 'rotten',
 'fresh',
 'rotten',
 'fresh',
 'rotten',
 'fresh',
 'rotten',
 'fresh',
 'fresh',
 'fresh',
 'rotten',
 'fresh',
 'fresh',
 'fresh',
 'fresh',
 'fresh',
 'fresh',
 'rotten',
 'fresh',
 'rotten',
 'fresh',
 'fresh',
 'fresh',
 'rotten',
 'fresh',
 'rotten',
 'fresh',
 'fresh',
 'fresh',
 'fresh',
 'fresh',
 'fresh',
 'fresh',
 'fresh',
 'ro

In [12]:
# Set the value of k for Laplace smoothing
k = 1

# Calculate the conditional probability of each word with Laplace smoothing
word_probs = {}
for word in word_counts:
    fresh_count = word_counts[word]['fresh'] + k
    fresh_prob = fresh_count / (freshness_counts['fresh'] + k * len(word_counts))
    rotten_count = word_counts[word]['rotten'] + k
    rotten_prob = rotten_count / (freshness_counts['rotten'] + k * len(word_counts))
    word_probs[word] = {'fresh': fresh_prob, 'rotten': rotten_prob}


In [13]:
# Create a list to store the predicted Freshness values
predictions_smooth = []

# Iterate over each row in the dev dataframe
for index, row in development.iterrows():
    # Get the Review text for the row
    review_smooth = row['Review']
    # Convert the text to lowercase
    review_smooth = review_smooth.lower()
    # Remove punctuation
    review_smooth = review_smooth.translate(str.maketrans('', '', string.punctuation))
    # Split the text into individual words
    words_smooth = review_smooth.split()
    # Initialize the probabilities for each Freshness value
    fresh_prob_smooth = fresh_prior_prob
    rotten_prob_smooth = rotten_prior_prob
    # Update the probabilities based on the words in the review
    for word in words_smooth:
        if word in word_probs:
            fresh_prob_smooth *= word_probs[word]['fresh']
            rotten_prob_smooth *= word_probs[word]['rotten']
    # Determine the predicted Freshness value
    if fresh_prob_smooth > rotten_prob_smooth:
        predictions_smooth.append('fresh')
    else:
        predictions_smooth.append('rotten')

# Calculate the accuracy of the predictions
correct_predictions_smooth = (predictions_smooth == development['Freshness']).sum()
total_predictions_smooth = len(predictions_smooth)
accuracy_smooth = correct_predictions_smooth / total_predictions_smooth
print(f'Dev accuracy_smooth: {accuracy_smooth:.2%}')


Dev accuracy_smooth: 80.71%


In [14]:
import numpy as np
# Create empty dictionaries to store log-odds ratio for each word
log_odds_ratio = {'fresh': {}, 'rotten': {}}

# Calculate the log-odds ratio for each word
for word in word_probs:
    fresh_prob = word_probs[word]['fresh']
    rotten_prob = word_probs[word]['rotten']
    log_odds_ratio['fresh'][word] = np.log(fresh_prob / rotten_prob)
    log_odds_ratio['rotten'][word] = np.log(rotten_prob / fresh_prob)

# Sort the words by their log-odds ratio and take the top 10 for each class
top_words = {'fresh': [], 'rotten': []}
for freshness in ['fresh', 'rotten']:
    top_words[freshness] = sorted(log_odds_ratio[freshness], key=log_odds_ratio[freshness].get, reverse=True)[:10]
    
# Print the top 10 words for each class
print("Top 10 words that predict 'Fresh' reviews:")
print(top_words['fresh'])
print("\nTop 10 words that predict 'Rotten' reviews:")
print(top_words['rotten'])


Top 10 words that predict 'Fresh' reviews:
['unmissable', 'sensuous', 'zvyagintsev', 'ida', 'dardennes', 'spiderverse', 'holofcener', 'vega', 'bresson', 'iannucci']

Top 10 words that predict 'Rotten' reviews:
['charmless', 'unexciting', 'unfunny', 'laughless', 'squanders', 'friedberg', 'farrago', 'thirdrate', 'stinker', 'unrewarding']


In [15]:
top_words

{'fresh': ['unmissable',
  'sensuous',
  'zvyagintsev',
  'ida',
  'dardennes',
  'spiderverse',
  'holofcener',
  'vega',
  'bresson',
  'iannucci'],
 'rotten': ['charmless',
  'unexciting',
  'unfunny',
  'laughless',
  'squanders',
  'friedberg',
  'farrago',
  'thirdrate',
  'stinker',
  'unrewarding']}

In [16]:
word_probs['unmissable']

{'fresh': 0.00015476010425203386, 'rotten': 3.517349325548267e-06}

In [17]:
# Create a list to store the predicted Freshness values
predictions_smooth = []

# Iterate over each row in the dev dataframe
for index, row in test.iterrows():
    # Get the Review text for the row
    review_smooth = row['Review']
    # Convert the text to lowercase
    review_smooth = review_smooth.lower()
    # Remove punctuation
    review_smooth = review_smooth.translate(str.maketrans('', '', string.punctuation))
    # Split the text into individual words
    words_smooth = review_smooth.split()
    # Initialize the probabilities for each Freshness value
    fresh_prob_smooth = fresh_prior_prob
    rotten_prob_smooth = rotten_prior_prob
    # Update the probabilities based on the words in the review
    for word in words_smooth:
        if word in word_probs:
            fresh_prob_smooth *= word_probs[word]['fresh']
            rotten_prob_smooth *= word_probs[word]['rotten']
    # Determine the predicted Freshness value
    if fresh_prob_smooth > rotten_prob_smooth:
        predictions_smooth.append('fresh')
    else:
        predictions_smooth.append('rotten')

# Calculate the accuracy of the predictions
correct_predictions_smooth = (predictions_smooth == test['Freshness']).sum()
total_predictions_smooth = len(predictions_smooth)
accuracy_smooth = correct_predictions_smooth / total_predictions_smooth
print(f'Test accuracy_smooth: {accuracy_smooth:.2%}')


Test accuracy_smooth: 80.54%
