In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns
# import scipy.stats as stats
# import math
# import random


In [None]:

# Load the review dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('./IMDB_Dataset.csv')

# Quick sanity check of what was loaded: dimensions and a sample of rows.
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

# Class balance of the 'sentiment' column. The header line is printed so that
# a fresh run reproduces the output recorded for this cell.
print("\nSentiment distribution:")
print(df['sentiment'].value_counts())



Dataset shape: (50000, 2)

First few rows:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Sentiment distribution:
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [None]:

# Two hand-picked keyword groups, named after the sentiment each is expected to signal.
positive_keywords = [
    'wonderful',
    'amazing',
    'excellent',
    'fantastic',
]
negative_keywords = [
    'terrible',
    'awful',
    'boring',
    'horrible',
]

# Echo the selection so the choice is visible in the notebook output.
print("Selected keywords:")
print("Positive:", positive_keywords)
print("Negative:", negative_keywords)

# P(Positive|keyword) is the quantity evaluated for every keyword listed above.
print("\nWe will compute P(Positive|keyword) for each keyword using Bayes' Theorem")


Selected keywords:
Positive: ['wonderful', 'amazing', 'excellent', 'fantastic']
Negative: ['terrible', 'awful', 'boring', 'horrible']

We will compute P(Positive|keyword) for each keyword using Bayes' Theorem


In [6]:

# Class counts over the whole dataset; the prior is derived from them below.
total_reviews = df.shape[0]
positive_reviews = int(df['sentiment'].eq('positive').sum())
negative_reviews = int(df['sentiment'].eq('negative').sum())

# Prior P(Positive): share of all reviews labelled 'positive'.
prior_positive = positive_reviews / total_reviews

# Single print call; sep="\n" yields the same lines as one print per value.
print(
    f"Total reviews: {total_reviews}",
    f"Positive reviews: {positive_reviews}",
    f"Negative reviews: {negative_reviews}",
    f"\nPrior P(Positive): {prior_positive:.4f}",
    f"Prior P(Negative): {1 - prior_positive:.4f}",
    sep="\n",
)


Total reviews: 50000
Positive reviews: 25000
Negative reviews: 25000

Prior P(Positive): 0.5000
Prior P(Negative): 0.5000


In [7]:
def keyword_in_review(review_text, keyword):
    """Return True when `keyword` occurs anywhere in `review_text`, ignoring case.

    The check is a plain substring test on the lower-cased strings, so a
    keyword also matches inside a longer word (e.g. 'boring' is found in
    'neighboring').
    """
    needle = keyword.lower()
    haystack = review_text.lower()
    return haystack.find(needle) != -1

# Function to implement Bayes' Theorem
def bayes_theorem(keyword, df, prior_positive):
    """
    Calculate P(Positive|keyword) using Bayes' Theorem.

        P(Positive|keyword) = P(keyword|Positive) * P(Positive) / P(keyword)

    Every count is taken from the `df` argument itself (not from notebook-level
    variables), so the function works for any frame with the expected columns
    and does not depend on which other cells have been executed.

    Parameters
    ----------
    keyword : str
        Term to look for. Matching is a case-insensitive substring test on the
        lower-cased text, the same rule `keyword_in_review` applies.
    df : pandas.DataFrame
        Must provide a 'review' column (text) and a 'sentiment' column in which
        positive rows carry the label 'positive'.
    prior_positive : float
        Prior probability P(Positive) to plug into the formula.

    Returns
    -------
    dict
        Keys: 'keyword', 'prior', 'likelihood' (P(keyword|Positive)),
        'marginal' (P(keyword)), 'posterior' (P(Positive|keyword)),
        'total_with_keyword', 'positive_with_keyword'.
        When no review contains the keyword (or `df` is empty) every
        probability and count other than 'prior' is 0.
    """
    # Default answer for "keyword never seen"; filled in below otherwise.
    result = {
        'keyword': keyword,
        'prior': prior_positive,
        'likelihood': 0,
        'marginal': 0,
        'posterior': 0,
        'total_with_keyword': 0,
        'positive_with_keyword': 0
    }

    total_reviews = len(df)
    if total_reviews == 0:
        return result

    # Vectorised substring match; na=False treats missing reviews as "no match"
    # instead of raising.
    has_keyword = df['review'].str.lower().str.contains(
        keyword.lower(), regex=False, na=False
    )
    total_with_keyword = int(has_keyword.sum())
    if total_with_keyword == 0:
        return result

    is_positive = df['sentiment'] == 'positive'
    positive_reviews = int(is_positive.sum())
    positive_with_keyword = int((has_keyword & is_positive).sum())

    # Likelihood: P(keyword|Positive). With no positive reviews at all the
    # numerator is necessarily 0 as well, so report 0 rather than divide by zero.
    likelihood = positive_with_keyword / positive_reviews if positive_reviews else 0.0

    # Marginal: P(keyword). Strictly positive here because total_with_keyword > 0.
    marginal = total_with_keyword / total_reviews

    # Posterior: P(Positive|keyword) using Bayes' Theorem
    posterior = (likelihood * prior_positive) / marginal

    result.update({
        'likelihood': likelihood,
        'marginal': marginal,
        'posterior': posterior,
        'total_with_keyword': total_with_keyword,
        'positive_with_keyword': positive_with_keyword
    })
    return result

print("Bayes' Theorem implementation ready!")


Bayes' Theorem implementation ready!


In [8]:
# Evaluate every keyword (positive group first, then negative) and keep the
# result dicts in `results`.
all_keywords = [*positive_keywords, *negative_keywords]
results = []

print("BAYESIAN PROBABILITY ANALYSIS")
print("=" * 50)

for keyword in all_keywords:
    result = bayes_theorem(keyword, df, prior_positive)
    results.append(result)

    # One report per keyword; the joined lines print exactly as separate prints would.
    print("\n".join([
        f"\nKeyword: '{keyword}'",
        f"  Prior P(Positive): {result['prior']:.4f}",
        f"  Likelihood P({keyword}|Positive): {result['likelihood']:.4f}",
        f"  Marginal P({keyword}): {result['marginal']:.4f}",
        f"  Posterior P(Positive|{keyword}): {result['posterior']:.4f}",
        f"  Reviews containing '{keyword}': {result['total_with_keyword']}",
        f"  Positive reviews containing '{keyword}': {result['positive_with_keyword']}",
    ]))


BAYESIAN PROBABILITY ANALYSIS

Keyword: 'wonderful'
  Prior P(Positive): 0.5000
  Likelihood P(wonderful|Positive): 0.1066
  Marginal P(wonderful): 0.0650
  Posterior P(Positive|wonderful): 0.8203
  Reviews containing 'wonderful': 3249
  Positive reviews containing 'wonderful': 2665

Keyword: 'amazing'
  Prior P(Positive): 0.5000
  Likelihood P(amazing|Positive): 0.0740
  Marginal P(amazing): 0.0496
  Posterior P(Positive|amazing): 0.7463
  Reviews containing 'amazing': 2479
  Positive reviews containing 'amazing': 1850

Keyword: 'excellent'
  Prior P(Positive): 0.5000
  Likelihood P(excellent|Positive): 0.1174
  Marginal P(excellent): 0.0725
  Posterior P(Positive|excellent): 0.8099
  Reviews containing 'excellent': 3625
  Positive reviews containing 'excellent': 2936

Keyword: 'fantastic'
  Prior P(Positive): 0.5000
  Likelihood P(fantastic|Positive): 0.0459
  Marginal P(fantastic): 0.0290
  Posterior P(Positive|fantastic): 0.7905
  Reviews containing 'fantastic': 1451
  Positive rev

In [9]:
# Summary table: one row per keyword with its four probabilities.
import pandas as pd  # also imported in the notebook's first cell

# Each result dict collected in `results` becomes one row.
results_df = pd.DataFrame(results)

print("\nSUMMARY TABLE")
print("=" * 80)
# Only the probability columns are shown, rounded to 4 decimals for display.
print(results_df.loc[:, ['keyword', 'prior', 'likelihood', 'marginal', 'posterior']].round(4))

# Legend for the columns above; sep="\n" emits one line per entry.
print(
    "\nINTERPRETATION:",
    "-" * 40,
    "• Prior P(Positive): Overall probability of positive sentiment",
    "• Likelihood P(keyword|Positive): Probability of keyword appearing given positive sentiment",
    "• Marginal P(keyword): Overall probability of keyword appearing",
    "• Posterior P(Positive|keyword): Probability of positive sentiment given keyword appears",
    "\nHigher posterior values indicate stronger association with positive sentiment.",
    sep="\n",
)



SUMMARY TABLE
     keyword  prior  likelihood  marginal  posterior
0  wonderful    0.5      0.1066    0.0650     0.8203
1    amazing    0.5      0.0740    0.0496     0.7463
2  excellent    0.5      0.1174    0.0725     0.8099
3  fantastic    0.5      0.0459    0.0290     0.7905
4   terrible    0.5      0.0154    0.0541     0.1419
5      awful    0.5      0.0136    0.0624     0.1093
6     boring    0.5      0.0247    0.0623     0.1983
7   horrible    0.5      0.0130    0.0421     0.1549

INTERPRETATION:
----------------------------------------
• Prior P(Positive): Overall probability of positive sentiment
• Likelihood P(keyword|Positive): Probability of keyword appearing given positive sentiment
• Marginal P(keyword): Overall probability of keyword appearing
• Posterior P(Positive|keyword): Probability of positive sentiment given keyword appears

Higher posterior values indicate stronger association with positive sentiment.


In [None]:
# Verify the Bayes' Theorem calculation for one keyword in two ways:
#   1. re-apply the formula to the components returned by bayes_theorem, and
#   2. compare against the raw counts, which is an independent check.
print("\nVERIFICATION OF BAYES' THEOREM")
print("=" * 50)

# Let's verify with the keyword 'wonderful'
keyword = 'wonderful'
result = bayes_theorem(keyword, df, prior_positive)

print(f"Manual verification for keyword: '{keyword}'")
print(f"P(Positive) = {result['prior']:.4f}")
print(f"P({keyword}|Positive) = {result['likelihood']:.4f}")
print(f"P({keyword}) = {result['marginal']:.4f}")

# Manual calculation (same formula as inside the function, so this alone only
# confirms the returned components are mutually consistent)
manual_posterior = (result['likelihood'] * result['prior']) / result['marginal']
print(f"\nManual calculation:")
print(f"P(Positive|{keyword}) = P({keyword}|Positive) × P(Positive) / P({keyword})")
print(f"P(Positive|{keyword}) = {result['likelihood']:.4f} × {result['prior']:.4f} / {result['marginal']:.4f}")
print(f"P(Positive|{keyword}) = {manual_posterior:.4f}")

print(f"\nFunction result: {result['posterior']:.4f}")
print(f"Manual result: {manual_posterior:.4f}")
print(f"Match: {abs(result['posterior'] - manual_posterior) < 0.0001}")

# Independent check: when the prior equals the empirical share of positive
# reviews in df, P(Positive|keyword) is simply the fraction of
# keyword-containing reviews that are positive.
direct_posterior = result['positive_with_keyword'] / result['total_with_keyword']
print(f"\nDirect count ratio: {direct_posterior:.4f}")
print(f"Match with direct counts: {abs(result['posterior'] - direct_posterior) < 0.0001}")
