> Implementing Twitter_Sentiment_Analysis using BOW.

In [10]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Download NLTK stopwords if not already downloaded
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')

# Load the CSV file into a DataFrame
df = pd.read_csv('demo1.csv')

# Function to preprocess text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove special characters, numbers, and usernames
    text = re.sub(r'[@#\w]+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english')) #This line loads the set of stopwords 
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    # Join the filtered tokens back into a string
    filtered_text = ' '.join(filtered_tokens)
    
    return filtered_text

# Apply preprocessing to the 'Text' column
df['Text'] = df['Text'].apply(preprocess_text)



In [12]:
import pandas as pd
from nltk.tokenize import word_tokenize



# Tokenization and extracting unique words
vocabulary = set()
for text in df['Text']:
    tokens = word_tokenize(text)
    vocabulary.update(tokens)

# Convert the set of unique words to a list
vocabulary_list = list(vocabulary)

# Print the vocabulary list
print(vocabulary_list)


['wait', 'nature', 'inspired', 'exhibit', 'place', 'uncertain', 'home', 'slow', 'funny', 'new', 'powerful', 'day', 'down', 'solutions', 'breathtaking', 'Feeling', 'ecstatic', 'Stressed', 'back', 'book', 'Excited', 'setbacks', 'workload', 'scientific', 'sleep', 'words', 'art', 'outcome', 'traffic', "n't", 'project', 'dedication', 'remarkable', 'adrenaline', 'interview', 'surprise', 'irritated', '@', 'vacation', 'noise', 'learning', 'thrilling', 'I', 'ongoing', "'m", 'feel', 'exam', 'issues', 'tedious', 'tonight', 'my', 'to', 'hate', 'enthusiasm', 'heartfelt', 'see', 'determined', 'dull', 'delicious', 'good', 'exhilarated', 'they', 'video', 'positive', 'our', 'grateful', 'exhausted', 'adventure', 'week', 'username', 'long', 'all', 'party', 'intriguing', 'amazing', 'tranquility', 'enchanted', 'passionate', 'delays', 'stressed', 'overwhelmed', 'performance', 'current', 'civilization', 'are', 'team', 'intense', 'unexpected', 'energized', 'situation', 'happy', 'friends', 'after', 'interrupti

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize



# Function to remove usernames and stopwords
def preprocess_text(text):
    # Remove usernames
    text = re.sub(r'@\w+', '', text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    
    # Join the filtered tokens back into a string
    filtered_text = ' '.join(filtered_tokens)
    
    return filtered_text

# Apply preprocessing to each text in the list
processed_texts_with_labels = [(preprocess_text(text), label) for text, label in texts_with_labels]

# Print the processed texts with labels
for processed_text, label in processed_texts_with_labels:
    print(processed_text, label)


In [7]:

# Filter positive and negative tweets
positive_tweets = df[df['Label'] == 1]['Text'].tolist()
negative_tweets = df[df['Label'] == 0]['Text'].tolist()

# Print positive and negative tweets corpus separately
print("Positive Tweets:")
for tweet in positive_tweets:
    print(tweet)

print("\nNegative Tweets:")
for tweet in negative_tweets:
    print(tweet)


Positive Tweets:
I am happy because I am learning NLP
I am happy @username
I love coding in Python
Excited for my vacation next week!
Loving this new book I'm reading
Can't wait to see my friends tomorrow!
I feel so blessed today
I am grateful for this opportunity
I'm ecstatic about the new job offer
Excited to start my new project
I'm passionate about my hobbies
I'm thrilled to announce our success
I'm proud of my accomplishments
I'm inspired by the beauty of nature
I'm fascinated by this new discovery
I'm determined to achieve my goals
I'm intrigued by this mystery novel
I'm captivated by this art exhibit
I'm enchanted by this magical place
I'm delighted by the surprise party
I'm moved by this touching story
I'm impressed by his dedication
I'm energized after a good night's sleep
I'm amused by this funny video
I'm exhilarated by this thrilling adventure
I'm captivated by her performance
I'm fascinated by this ancient civilization
I'm enchanted by the beauty of this place
I'm delighte

In [8]:
# Initialize dictionaries to store word frequencies
positive_freq = {word: 0 for word in vocabulary}
negative_freq = {word: 0 for word in vocabulary}

# Calculate positive word frequencies
for tweet in positive_tweets:
    tokens = word_tokenize(tweet)
    for token in tokens:
        if token in vocabulary:
            positive_freq[token] += 1

# Calculate negative word frequencies
for tweet in negative_tweets:
    tokens = word_tokenize(tweet)
    for token in tokens:
        if token in vocabulary:
            negative_freq[token] += 1

# Print positive and negative word frequencies
print("Positive Word Frequencies:")
for word, freq in positive_freq.items():
    print(f"{word}: {freq}")



Positive Word Frequencies:
wait: 1
nature: 1
inspired: 1
exhibit: 1
place: 3
uncertain: 0
home: 0
slow: 0
funny: 1
new: 4
powerful: 1
day: 0
down: 0
solutions: 1
breathtaking: 1
Feeling: 0
ecstatic: 1
Stressed: 0
back: 0
book: 1
Excited: 2
setbacks: 0
workload: 0
scientific: 1
sleep: 1
words: 1
art: 1
outcome: 0
traffic: 0
n't: 1
project: 1
dedication: 1
remarkable: 1
adrenaline: 1
interview: 0
surprise: 2
irritated: 0
@: 1
vacation: 1
noise: 0
learning: 1
thrilling: 1
I: 47
ongoing: 0
'm: 41
feel: 1
exam: 0
issues: 0
tedious: 0
tonight: 0
my: 6
to: 4
hate: 0
enthusiasm: 1
heartfelt: 1
see: 1
determined: 1
dull: 0
delicious: 1
good: 1
exhilarated: 3
they: 0
video: 1
positive: 1
our: 1
grateful: 1
exhausted: 0
adventure: 1
week: 1
username: 1
long: 0
all: 0
party: 1
intriguing: 1
amazing: 1
tranquility: 1
enchanted: 4
passionate: 1
delays: 0
stressed: 0
overwhelmed: 0
performance: 3
current: 0
civilization: 1
are: 0
team: 1
intense: 0
unexpected: 1
energized: 3
situation: 0
happy: 2
fri

In [9]:
print("\nNegative Word Frequencies:")
for word, freq in negative_freq.items():
    print(f"{word}: {freq}")


Negative Word Frequencies:
changing: 0
conversation: 0
friend: 0
park: 0
so: 1
ones: 1
book: 0
semester: 0
favorite: 3
attend: 0
chapter: 0
kids: 0
being: 1
month: 0
support: 0
rainy: 1
who: 2
concert: 0
mistakes: 1
that: 3
finished: 0
exercise: 0
health: 0
my: 13
sticking: 0
right: 1
recipe: 0
Not: 1
grateful: 0
job: 1
game: 1
tea: 0
meals: 1
well-being: 0
pursue: 0
dreams: 0
graduation: 0
overcoming: 0
with: 4
while: 1
home-cooked: 1
for: 0
Grateful: 0
trip: 1
little: 0
gatherings: 0
Enjoying: 0
dull: 1
car: 1
'm: 0
start: 0
simpler: 1
summer: 0
popcorn: 0
had: 0
road: 0
interview: 1
week: 0
exhausted: 1
travel: 4
lonely: 3
puppy: 0
leaves: 0
sad: 2
confident: 1
completed: 0
without: 2
coffee: 1
an: 5
grandma: 1
team: 1
Bored: 1
moments: 0
workout: 0
because: 0
next: 0
birthday: 0
turn: 2
restaurant: 0
meal: 0
fix: 1
exams: 1
Feeling: 18
cozy: 0
event: 1
Excited: 0
world: 1
bad: 2
happy: 0
learn: 0
finishing: 0
listening: 0
speech: 0
back: 3
family: 0
exam: 1
the: 5
myself: 0
disapp

In [11]:
from nltk.tokenize import word_tokenize

def encode_tweet(tweet, positive_freq, negative_freq):
    # Tokenize the tweet
    tokens = word_tokenize(tweet)

    # Initialize variables to store frequencies
    positive_sum = 0
    negative_sum = 0

    # Calculate the sum of positive and negative frequencies
    for token in tokens:
        if token in positive_freq:
            positive_sum += positive_freq[token]
        if token in negative_freq:
            negative_sum += negative_freq[token]

    # Encode the tweet into the specified format
    encoded_vector = [1, positive_sum, negative_sum]
    return encoded_vector

# Example usage:
tweet = "I am happy because I am learning NLP"
encoded_vector = encode_tweet(tweet, positive_freq, negative_freq)
print("Encoded Vector:", encoded_vector)


Encoded Vector: [1, 25, 32]


In [13]:
from nltk.tokenize import word_tokenize
import numpy as np

def encode_tweet(tweet, positive_freq, negative_freq):
    tokens = word_tokenize(tweet)
    positive_sum = sum(positive_freq.get(token, 0) for token in tokens)
    negative_sum = sum(negative_freq.get(token, 0) for token in tokens)
    return [1, positive_sum, negative_sum]


# Sample tweets
tweets = [
    "I am happy because I am learning NLP",
    "I am sad, I am not learning NLP",
    "I love coding in Python",
    "I hate Mondays, they are so dull"
]

# Initialize matrix X
X = np.zeros((len(tweets), 3))

# Extract features for each tweet and populate the matrix X
for idx, tweet in enumerate(tweets):
    encoded_vector = encode_tweet(tweet, positive_freq, negative_freq)
    X[idx, :] = encoded_vector

print("Matrix X:")
print(X)


Matrix X:
[[ 1. 25. 32.]
 [ 1. 27. 37.]
 [ 1. 17. 14.]
 [ 1. 14. 20.]]


In [15]:
from nltk.tokenize import word_tokenize
import numpy as np

# Sample data
tweets = [
    "I am happy because I am learning NLP",
    "I am sad, I am not learning NLP",
    "I love coding in Python",
    "I hate Mondays, they are so dull"
]

labels = np.array([1, 0, 1, 0])  # 1 for positive, 0 for negative

# Encode tweets
def encode_tweet(tweet, positive_freq, negative_freq):
    tokens = word_tokenize(tweet)
    positive_sum = sum(positive_freq.get(token, 0) for token in tokens)
    negative_sum = sum(negative_freq.get(token, 0) for token in tokens)
    return [1, positive_sum, negative_sum]

X = np.array([encode_tweet(tweet, positive_freq, negative_freq) for tweet in tweets])

# Initialize parameters (theta)
theta = np.zeros(X.shape[1])

# Sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Cost function.
def cost_function(X, y, theta):
    m = len(y)
    h = sigmoid(X.dot(theta))
    epsilon = 1e-5  # to avoid log(0)
    cost = -1/m * np.sum(y * np.log(h + epsilon) + (1 - y) * np.log(1 - h + epsilon))
    return cost

# Gradient descent
def gradient_descent(X, y, theta, alpha, iterations):
    m = len(y)
    costs = []
    for _ in range(iterations):
        h = sigmoid(X.dot(theta))
        gradient = X.T.dot(h - y) / m
        theta -= alpha * gradient
        cost = cost_function(X, y, theta)
        costs.append(cost)
    return theta, costs

# Set hyperparameters
alpha = 0.01
iterations = 10000

# Run gradient descent
theta_optimized, costs = gradient_descent(X, labels, theta, alpha, iterations)

# Make predictions on the training data
predictions = np.round(sigmoid(X.dot(theta_optimized)))

# Calculate accuracy
accuracy = np.mean(predictions == labels)
print("Optimized Theta:", theta_optimized)
print("Accuracy:", accuracy)


Optimized Theta: [-0.25119726  3.89650908 -2.93435979]
Accuracy: 1.0


In [16]:
# Define a threshold
threshold = 0.5

# Make predictions based on the threshold
probabilities = sigmoid(X.dot(theta_optimized))
predictions = (probabilities >= threshold).astype(int)

# Print predictions
for tweet, pred in zip(tweets, predictions):
    sentiment = "positive" if pred == 1 else "negative"
    print(f"Tweet: {tweet} - Sentiment: {sentiment}")

Tweet: I am happy because I am learning NLP - Sentiment: positive
Tweet: I am sad, I am not learning NLP - Sentiment: negative
Tweet: I love coding in Python - Sentiment: positive
Tweet: I hate Mondays, they are so dull - Sentiment: negative


In [18]:
predictions = np.round(sigmoid(X.dot(theta_optimized)))

array([1., 0., 1., 0.])

In [19]:
# Calculate accuracy manually
correct_predictions = np.sum(predictions == labels)
total_samples = len(labels)
accuracy = correct_predictions / total_samples

print("Accuracy:", accuracy)


Accuracy: 1.0
