In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Download NLTK stopwords if not already downloaded.
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')

# Load the CSV file into a DataFrame
df = pd.read_csv('demo1.csv')

# Function to preprocess text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove special characters, numbers, and usernames.
    text = re.sub(r'[@#\w]+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english')) #This line loads the set of stopwords 
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    # Join the filtered tokens back into a string
    filtered_text = ' '.join(filtered_tokens)
    
    return filtered_text

# Apply preprocessing to the 'Text' column
df['Text'] = df['Text'].apply(preprocess_text)



In [3]:
import pandas as pd
from nltk.tokenize import word_tokenize

# Tokenization and extracting unique words
vocabulary = set()
for text in df['Text']:
    tokens = word_tokenize(text)
    vocabulary.update(tokens)

# Convert the set of unique words to a list
vocabulary_list = list(vocabulary)

# Print the vocabulary list
print(vocabulary_list)


['beauty', 'ancient', 'adrenaline', 'workout', 'delighted', 'future', 'Feeling', 'noise', 'performance', 'event', 'notifications', 'exams', 'offer', 'amazing', 'username', 'sad', 'place', 'all', 'sleep', 'our', '!', 'with', 'historical', 'current', 'anxious', 'intriguing', 'music', 'ongoing', 'exhibit', 'disappointed', 'fascinated', 'deadline', 'lack', 'love', 'delays', 'team', 'complex', 'bored', 'solutions', 'message', 'because', 'same', 'next', 'scientific', 'funny', 'enthusiasm', 'back', 'exam', 'NLP', 'civilization', 'good', 'upcoming', 'strenuous', 'art', 'town', 'artifact', 'constant', 'setbacks', 'day', 'thrilled', 'lonely', 'adventure', 'at', 'magical', 'words', 'reading', 'plot', 'for', 'learning', ',', 'success', 'enchanted', 'exhilarated', 'positive', 'silly', 'opportunity', 'outcome', 'slow', 'so', 'vacation', 'clever', 'Ca', 'a', 'surprise', 'thrilling', 'situation', 'hobbies', 'down', 'interruptions', 'feel', 'painting', 'inspired', 'discovery', 'journey', 'results', 'pe

In [4]:

# Filter positive and negative tweets
positive_tweets = df[df['Label'] == 1]['Text'].tolist()
negative_tweets = df[df['Label'] == 0]['Text'].tolist()

# Print positive and negative tweets corpus separately
print("Positive Tweets:")
for tweet in positive_tweets:
    print(tweet)

print("\nNegative Tweets:")
for tweet in negative_tweets:
    print(tweet)


Positive Tweets:
I am happy because I am learning NLP
I am happy
I love coding in Python
Excited for my vacation next week!
Loving this new book I'm reading
Celebrating my birthday today!
Enjoying a relaxing day at the beach
Feeling inspired after watching a motivational speech
Grateful for my supportive family
Can't wait to see my favorite band in concert
Just adopted a new puppy, so happy!
Feeling proud of myself for finishing a project
Excited to start a new job next month!
Enjoying a cozy night in with a good book
Wishing I could spend more time with my kids
Loving the fall weather and changing leaves
Just had a great conversation with an old friend
Excited for the holiday season and family gatherings
Feeling grateful for all the opportunities in my life
Looking forward to my graduation next month!
Enjoying a delicious meal with loved ones
Excited to start learning a new language
Feeling nostalgic listening to old songs
Just got promoted at work, so proud!
Looking forward to a week

In [5]:
# Initialize dictionaries to store word frequencies
positive_freq = {word: 0 for word in vocabulary}
negative_freq = {word: 0 for word in vocabulary}

# Calculate positive word frequencies
for tweet in positive_tweets:
    tokens = word_tokenize(tweet)
    for token in tokens:
        if token in vocabulary:
            positive_freq[token] += 1

# Calculate negative word frequencies
for tweet in negative_tweets:
    tokens = word_tokenize(tweet)
    for token in tokens:
        if token in vocabulary:
            negative_freq[token] += 1

# Print positive and negative word frequencies
print("Positive Word Frequencies:")
for word, freq in positive_freq.items():
    print(f"{word}: {freq}")



Positive Word Frequencies:
strong: 1
motivational: 1
graduation: 1
's: 0
,: 6
during: 1
tonight: 2
reading: 1
in: 8
inspired: 2
without: 0
grandma: 0
fix: 0
language: 1
leaves: 1
because: 1
month: 3
frustrated: 0
reunion: 1
an: 1
mountains: 1
dreams: 1
a: 41
could: 1
birthday: 1
learning: 2
vacation: 2
far: 0
abroad: 0
family: 6
movie: 3
shop: 0
are: 0
forward: 6
meal: 1
spa: 2
productive: 0
about: 0
adventures: 0
beauty: 1
got: 1
Celebrating: 1
season: 2
proud: 4
lonely: 0
news: 0
to: 22
exhausted: 0
delicious: 1
with: 9
spend: 1
Missing: 0
situations: 0
being: 0
my: 15
completed: 1
studying: 0
support: 1
grateful: 4
nothing: 0
canceled: 0
puppy: 1
friend: 1
dinner: 1
cup: 1
lately: 0
songs: 1
am: 3
myself: 3
home: 1
show: 1
new: 15
now: 0
lost: 0
Wishing: 1
speech: 1
more: 1
release: 1
life: 2
of: 12
social: 0
joy: 1
restaurant: 1
!: 13
upcoming: 0
work: 2
days: 0
responsibilities: 0
summer: 1
weather: 1
rainy: 0
ride: 0
that: 1
watching: 1
Bummed: 0
who: 0
simpler: 0
achieve: 1
Feel

In [6]:
print("\nNegative Word Frequencies:")
for word, freq in negative_freq.items():
    print(f"{word}: {freq}")


Negative Word Frequencies:
strong: 0
motivational: 0
graduation: 0
's: 1
,: 2
during: 2
tonight: 1
reading: 0
in: 2
inspired: 0
without: 2
grandma: 1
fix: 1
language: 0
leaves: 0
because: 0
month: 0
frustrated: 1
reunion: 0
an: 5
mountains: 0
dreams: 0
a: 3
could: 8
birthday: 0
learning: 1
vacation: 0
far: 2
abroad: 1
family: 0
movie: 1
shop: 1
are: 1
forward: 1
meal: 0
spa: 0
productive: 1
about: 7
adventures: 1
beauty: 0
got: 1
Celebrating: 0
season: 0
proud: 0
lonely: 3
news: 1
to: 3
exhausted: 1
delicious: 0
with: 4
spend: 0
Missing: 6
situations: 1
being: 1
my: 13
completed: 0
studying: 1
support: 0
grateful: 0
nothing: 1
canceled: 1
puppy: 0
friend: 0
dinner: 0
cup: 0
lately: 1
songs: 0
am: 3
myself: 0
home: 1
show: 1
new: 0
now: 1
lost: 1
Wishing: 8
speech: 0
more: 2
release: 0
life: 1
of: 1
social: 1
joy: 0
restaurant: 0
!: 0
upcoming: 6
work: 3
days: 1
responsibilities: 1
summer: 0
weather: 0
rainy: 1
ride: 1
that: 3
watching: 0
Bummed: 3
who: 2
simpler: 1
achieve: 0
Feeling:

In [7]:
# Calculate P(wi | class) for each word in the vocabulary
total_positive_words = sum(positive_freq.values())
total_negative_words = sum(negative_freq.values())

# Calculate conditional probabilities without Laplace smoothing
positive_probs = {word: count / total_positive_words for word, count in positive_freq.items()}
negative_probs = {word: count / total_negative_words for word, count in negative_freq.items()}

# Print positive probabilities
print("Positive Probabilities:")
for word, prob in positive_probs.items():
    print(f"{word}: {prob}")


Positive Probabilities:
strong: 0.002074688796680498
motivational: 0.002074688796680498
graduation: 0.002074688796680498
's: 0.0
,: 0.012448132780082987
during: 0.002074688796680498
tonight: 0.004149377593360996
reading: 0.002074688796680498
in: 0.016597510373443983
inspired: 0.004149377593360996
without: 0.0
grandma: 0.0
fix: 0.0
language: 0.002074688796680498
leaves: 0.002074688796680498
because: 0.002074688796680498
month: 0.006224066390041493
frustrated: 0.0
reunion: 0.002074688796680498
an: 0.002074688796680498
mountains: 0.002074688796680498
dreams: 0.002074688796680498
a: 0.08506224066390042
could: 0.002074688796680498
birthday: 0.002074688796680498
learning: 0.004149377593360996
vacation: 0.004149377593360996
far: 0.0
abroad: 0.0
family: 0.012448132780082987
movie: 0.006224066390041493
shop: 0.0
are: 0.0
forward: 0.012448132780082987
meal: 0.002074688796680498
spa: 0.004149377593360996
productive: 0.0
about: 0.0
adventures: 0.0
beauty: 0.002074688796680498
got: 0.00207468879668

In [8]:

# Print negative probabilities
print("\nNegative Probabilities:")
for word, prob in negative_probs.items():
    print(f"{word}: {prob}")



Negative Probabilities:
strong: 0.0
motivational: 0.0
graduation: 0.0
's: 0.003703703703703704
,: 0.007407407407407408
during: 0.007407407407407408
tonight: 0.003703703703703704
reading: 0.0
in: 0.007407407407407408
inspired: 0.0
without: 0.007407407407407408
grandma: 0.003703703703703704
fix: 0.003703703703703704
language: 0.0
leaves: 0.0
because: 0.0
month: 0.0
frustrated: 0.003703703703703704
reunion: 0.0
an: 0.018518518518518517
mountains: 0.0
dreams: 0.0
a: 0.011111111111111112
could: 0.02962962962962963
birthday: 0.0
learning: 0.003703703703703704
vacation: 0.0
far: 0.007407407407407408
abroad: 0.003703703703703704
family: 0.0
movie: 0.003703703703703704
shop: 0.003703703703703704
are: 0.003703703703703704
forward: 0.003703703703703704
meal: 0.0
spa: 0.0
productive: 0.003703703703703704
about: 0.025925925925925925
adventures: 0.003703703703703704
beauty: 0.0
got: 0.003703703703703704
Celebrating: 0.0
season: 0.0
proud: 0.0
lonely: 0.011111111111111112
news: 0.003703703703703704


In [10]:
import re

# Given tweet to predict sentiment
given_tweet = "I am happy because I am learning NLP"

# Tokenize the given tweet
words_in_tweet = re.findall(r'\b\w+\b', given_tweet.lower())

# Calculate the product of ratios
product = 1
for word in words_in_tweet:
    if word in vocabulary:
        ratio = positive_probs[word] / negative_probs[word]
        product *= ratio

# Predict sentiment based on the product
predicted_sentiment = "Positive" if product > 1 else "Negative"

print("Predicted Sentiment:", predicted_sentiment)

Predicted Sentiment: Positive


> The above Naive Bayes Inference condition ends.
# π P( wi / pos) / P( wi / neg) 

> Laplace Smoothing begins

> P ( wi / class ) = freq ( wi,class) + 1 / Nclass + Vclass

Nclass --> positive freq ka total count , neg freq ka total count
Vclass --> Nos of Unique words in class

In [11]:
# Calculate P(wi | class) for each word in the vocabulary with Laplace smoothing
total_positive_words = sum(positive_freq.values())
total_negative_words = sum(negative_freq.values())
unique_positive_words = len(positive_freq)
unique_negative_words = len(negative_freq)

# Calculate conditional probabilities with Laplace smoothing
positive_probs = {word: (count + 1) / (total_positive_words + unique_positive_words) for word, count in positive_freq.items()}
negative_probs = {word: (count + 1) / (total_negative_words + unique_negative_words) for word, count in negative_freq.items()}

# Print positive probabilities
print("Positive Probabilities:")
for word, prob in positive_probs.items():
    print(f"{word}: {prob}")



Positive Probabilities:
beach: 0.005369127516778523
finishing: 0.0026845637583892616
that: 0.0026845637583892616
they: 0.0013422818791946308
relaxing: 0.004026845637583893
during: 0.0026845637583892616
Loving: 0.004026845637583893
friends: 0.004026845637583893
time: 0.0026845637583892616
down: 0.0013422818791946308
nature: 0.0026845637583892616
watching: 0.0026845637583892616
recipe: 0.0026845637583892616
learning: 0.004026845637583893
dreams: 0.0026845637583892616
listening: 0.0026845637583892616
job: 0.0026845637583892616
favorite: 0.004026845637583893
with: 0.013422818791946308
nervous: 0.0013422818791946308
situations: 0.0013422818791946308
places: 0.0013422818791946308
to: 0.03087248322147651
evening: 0.0026845637583892616
reading: 0.0026845637583892616
supportive: 0.0026845637583892616
kids: 0.0026845637583892616
who: 0.0013422818791946308
cozy: 0.004026845637583893
semester: 0.0026845637583892616
Missing: 0.0013422818791946308
pet: 0.0013422818791946308
homesick: 0.0013422818791

In [12]:
# Print negative probabilities
print("\nNegative Probabilities:")
for word, prob in negative_probs.items():
    print(f"{word}: {prob}")


Negative Probabilities:
beach: 0.001876172607879925
finishing: 0.001876172607879925
that: 0.0075046904315197
they: 0.00375234521575985
relaxing: 0.001876172607879925
during: 0.005628517823639775
Loving: 0.001876172607879925
friends: 0.005628517823639775
time: 0.0075046904315197
down: 0.00375234521575985
nature: 0.001876172607879925
watching: 0.001876172607879925
recipe: 0.001876172607879925
learning: 0.00375234521575985
dreams: 0.001876172607879925
listening: 0.001876172607879925
job: 0.00375234521575985
favorite: 0.0075046904315197
with: 0.009380863039399626
nervous: 0.00375234521575985
situations: 0.00375234521575985
places: 0.00375234521575985
to: 0.0075046904315197
evening: 0.001876172607879925
reading: 0.001876172607879925
supportive: 0.001876172607879925
kids: 0.001876172607879925
who: 0.005628517823639775
cozy: 0.001876172607879925
semester: 0.001876172607879925
Missing: 0.013133208255159476
pet: 0.00375234521575985
homesick: 0.00375234521575985
quiet: 0.001876172607879925
chil

In [13]:
# Test tweets
test_tweets = [
    "I am happy and learning NLP",
    "I am sad because I am not learning NLP"
]

# Predict sentiment of test tweets
for tweet in test_tweets:
    words_in_tweet = re.findall(r'\b\w+\b', tweet.lower())
    product = 1
    for word in words_in_tweet:
        if word in vocabulary:
            ratio = positive_probs.get(word, 1) / negative_probs.get(word, 1)
            product *= ratio
    predicted_sentiment = "Positive" if product > 1 else "Negative"
    print(f"Tweet: '{tweet}' - Predicted Sentiment: {predicted_sentiment}")

Tweet: 'I am happy and learning NLP' - Predicted Sentiment: Positive
Tweet: 'I am sad because I am not learning NLP' - Predicted Sentiment: Negative
