In [9]:
import re

corpus = [
    ("I am happy because I am learning NLP", 1),
    ("I am happy", 1),
    ('"I am sad, I am not learning NLP"', 0),
    ("I am sad", 0)
]

# Tokenization and extracting unique words
vocabulary = set()
for text, _ in corpus:  # Changed to unpack only the text, ignoring the label
    # Tokenize text using regular expression
    words = re.findall(r'\b\w+\b', text.lower())  # Access text and then apply lower()
    vocabulary.update(words)

# Convert set to list for easier indexing
vocabulary = list(vocabulary)

print("Vocabulary V:", vocabulary)


Vocabulary V: ['i', 'sad', 'because', 'learning', 'am', 'not', 'nlp', 'happy']


In [10]:
positive_corpus = []
negative_corpus = []

for text, label in corpus:
    if label == 1:
        positive_corpus.append(text)
    else:
        negative_corpus.append(text)

print("Positive Tweet Corpus:")
for tweet in positive_corpus:
    print(tweet)

print("\nNegative Tweet Corpus:")
for tweet in negative_corpus:
    print(tweet)

Positive Tweet Corpus:
I am happy because I am learning NLP
I am happy

Negative Tweet Corpus:
"I am sad, I am not learning NLP"
I am sad


In [11]:
# Vocabulary from previous step


positive_frequency = {word: 0 for word in vocabulary}
negative_frequency = {word: 0 for word in vocabulary}

# Counting positive word frequencies
for tweet in positive_corpus:
    words = re.findall(r'\b\w+\b', tweet.lower())
    for word in words:
        if word in positive_frequency:
            positive_frequency[word] += 1

# Counting negative word frequencies
for tweet in negative_corpus:
    words = re.findall(r'\b\w+\b', tweet.lower())
    for word in words:
        if word in negative_frequency:
            negative_frequency[word] += 1

print("Positive Frequency:")
print(positive_frequency)
print("\nNegative Frequency:")
print(negative_frequency)


Positive Frequency:
{'i': 3, 'sad': 0, 'because': 1, 'learning': 1, 'am': 3, 'not': 0, 'nlp': 1, 'happy': 2}

Negative Frequency:
{'i': 3, 'sad': 2, 'because': 0, 'learning': 1, 'am': 3, 'not': 1, 'nlp': 1, 'happy': 0}


In [16]:
# Calculate P(wi | class) for each word in the vocabulary
total_positive_words = sum(positive_frequency.values())
total_negative_words = sum(negative_frequency.values())

# Calculate conditional probabilities
positive_probs = {word: (count + 1) / (total_positive_words + len(vocabulary)) for word, count in positive_frequency.items()}
negative_probs = {word: (count + 1) / (total_negative_words + len(vocabulary)) for word, count in negative_frequency.items()}

# Print positive probabilities
print("Positive Probabilities:")
for word, prob in positive_probs.items():
    print(f"{word}: {prob}")

# Print negative probabilities
print("\nNegative Probabilities:")
for word, prob in negative_probs.items():
    print(f"{word}: {prob}")




Positive Probabilities:
i: 0.21052631578947367
sad: 0.05263157894736842
because: 0.10526315789473684
learning: 0.10526315789473684
am: 0.21052631578947367
not: 0.05263157894736842
nlp: 0.10526315789473684
happy: 0.15789473684210525

Negative Probabilities:
i: 0.21052631578947367
sad: 0.15789473684210525
because: 0.05263157894736842
learning: 0.10526315789473684
am: 0.21052631578947367
not: 0.10526315789473684
nlp: 0.10526315789473684
happy: 0.05263157894736842


In [17]:
# Given tweet to predict sentiment
given_tweet = "I am happy because I am learning NLP"

# Tokenize the given tweet
words_in_tweet = re.findall(r'\b\w+\b', given_tweet.lower())

# Calculate the product of ratios
product = 1
for word in words_in_tweet:
    if word in vocabulary:
        ratio = positive_probs[word] / negative_probs[word]
        product *= ratio

# Predict sentiment based on the product
predicted_sentiment = "Positive" if product > 1 else "Negative"

print("Predicted Sentiment:", predicted_sentiment)

Predicted Sentiment: Positive


In [18]:
# Calculate P(wi | class) for each word in the vocabulary with Laplace smoothing
total_positive_words = sum(positive_frequency.values())
total_negative_words = sum(negative_frequency.values())
unique_positive_words = len(positive_frequency)
unique_negative_words = len(negative_frequency)

# Calculate conditional probabilities with Laplace smoothing
positive_probs = {word: (count + 1) / (total_positive_words + unique_positive_words) for word, count in positive_frequency.items()}
negative_probs = {word: (count + 1) / (total_negative_words + unique_negative_words) for word, count in negative_frequency.items()}

# Print positive probabilities
print("Positive Probabilities:")
for word, prob in positive_probs.items():
    print(f"{word}: {prob}")

# Print negative probabilities
print("\nNegative Probabilities:")
for word, prob in negative_probs.items():
    print(f"{word}: {prob}")

Positive Probabilities:
i: 0.21052631578947367
sad: 0.05263157894736842
because: 0.10526315789473684
learning: 0.10526315789473684
am: 0.21052631578947367
not: 0.05263157894736842
nlp: 0.10526315789473684
happy: 0.15789473684210525

Negative Probabilities:
i: 0.21052631578947367
sad: 0.15789473684210525
because: 0.05263157894736842
learning: 0.10526315789473684
am: 0.21052631578947367
not: 0.10526315789473684
nlp: 0.10526315789473684
happy: 0.05263157894736842


In [19]:
# Test tweets
test_tweets = [
    "I am happy and learning NLP",
    "I am sad because I am not learning NLP"
]

# Predict sentiment of test tweets
for tweet in test_tweets:
    words_in_tweet = re.findall(r'\b\w+\b', tweet.lower())
    product = 1
    for word in words_in_tweet:
        if word in vocabulary:
            ratio = positive_probs.get(word, 1) / negative_probs.get(word, 1)
            product *= ratio
    predicted_sentiment = "Positive" if product > 1 else "Negative"
    print(f"Tweet: '{tweet}' - Predicted Sentiment: {predicted_sentiment}")

Tweet: 'I am happy and learning NLP' - Predicted Sentiment: Positive
Tweet: 'I am sad because I am not learning NLP' - Predicted Sentiment: Negative


In [20]:
import math

# Test tweets
test_tweets = [
    "I am happy and learning NLP",
    "I am sad because I am not learning NLP"
]

# Prior probabilities
prior_positive = math.log(len(positive_corpus) / len(corpus))
prior_negative = math.log(len(negative_corpus) / len(corpus))

# Predict sentiment of test tweets
for tweet in test_tweets:
    words_in_tweet = re.findall(r'\b\w+\b', tweet.lower())
    log_likelihood_positive = prior_positive
    log_likelihood_negative = prior_negative
    for word in words_in_tweet:
        if word in vocabulary:
            log_likelihood_positive += positive_probs.get(word, 0)
            log_likelihood_negative += negative_probs.get(word, 0)
    predicted_sentiment = "Positive" if log_likelihood_positive > log_likelihood_negative else "Negative"
    print(f"Tweet: '{tweet}' - Predicted Sentiment: {predicted_sentiment}")

Tweet: 'I am happy and learning NLP' - Predicted Sentiment: Positive
Tweet: 'I am sad because I am not learning NLP' - Predicted Sentiment: Negative
