# Sentiment Analysis

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Data Collection

In [3]:
df = pd.read_csv('/content/sentiment dataset.csv')
df.shape

(732, 15)

In [4]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,2,2,Just finished an amazing workout! ðŸ’ª ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0.1  732 non-null    int64  
 1   Unnamed: 0    732 non-null    int64  
 2   Text          732 non-null    object 
 3   Sentiment     732 non-null    object 
 4   Timestamp     732 non-null    object 
 5   User          732 non-null    object 
 6   Platform      732 non-null    object 
 7   Hashtags      732 non-null    object 
 8   Retweets      732 non-null    float64
 9   Likes         732 non-null    float64
 10  Country       732 non-null    object 
 11  Year          732 non-null    int64  
 12  Month         732 non-null    int64  
 13  Day           732 non-null    int64  
 14  Hour          732 non-null    int64  
dtypes: float64(2), int64(6), object(7)
memory usage: 85.9+ KB


In [6]:
df.describe()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Retweets,Likes,Year,Month,Day,Hour
count,732.0,732.0,732.0,732.0,732.0,732.0,732.0,732.0
mean,366.464481,369.740437,21.508197,42.901639,2020.471311,6.122951,15.497268,15.521858
std,211.513936,212.428936,7.061286,14.089848,2.802285,3.411763,8.474553,4.113414
min,0.0,0.0,5.0,10.0,2010.0,1.0,1.0,0.0
25%,183.75,185.75,17.75,34.75,2019.0,3.0,9.0,13.0
50%,366.5,370.5,22.0,43.0,2021.0,6.0,15.0,16.0
75%,549.25,553.25,25.0,50.0,2023.0,9.0,22.0,19.0
max,732.0,736.0,40.0,80.0,2023.0,12.0,31.0,23.0


## Text Preprocessing



In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

print("NLTK stopwords and wordnet data downloaded.")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


NLTK stopwords and wordnet data downloaded.


In [11]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab') # Added punkt_tab download

print("NLTK stopwords, wordnet, punkt, and punkt_tab data downloaded.")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


NLTK stopwords, wordnet, punkt, and punkt_tab data downloaded.


In [12]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower() # Lowercasing
    text = re.sub(r'\d+', '', text) # Remove numbers
    text = re.sub(r'[\W_]+', ' ', text) # Remove punctuation and replace with space
    tokens = nltk.word_tokenize(text) # Tokenization
    tokens = [word for word in tokens if word not in stop_words] # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens] # Lemmatization
    return ' '.join(tokens)

df['Cleaned_Text'] = df['Text'].apply(clean_text)
print("Text cleaning and lemmatization applied. Displaying first 5 rows of 'Cleaned_Text':")
print(df[['Text', 'Cleaned_Text']].head())

Text cleaning and lemmatization applied. Displaying first 5 rows of 'Cleaned_Text':
                                                Text  \
0   Enjoying a beautiful day at the park!        ...   
1   Traffic was terrible this morning.           ...   
2   Just finished an amazing workout! ðŸ’ª          ...   
3   Excited about the upcoming weekend getaway!  ...   
4   Trying out a new recipe for dinner tonight.  ...   

                       Cleaned_Text  
0       enjoying beautiful day park  
1          traffic terrible morning  
2          finished amazing workout  
3  excited upcoming weekend getaway  
4  trying new recipe dinner tonight  


## Feature Extraction


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Apply fit_transform to the 'Cleaned_Text' column
X = tfidf_vectorizer.fit_transform(df['Cleaned_Text'])

# Print the shape of the TF-IDF feature matrix
print("Shape of TF-IDF feature matrix (X):", X.shape)
print("TF-IDF Vectorization complete.")

Shape of TF-IDF feature matrix (X): (732, 2274)
TF-IDF Vectorization complete.


## Encode Target Variable



In [14]:
from sklearn.preprocessing import LabelEncoder

# Instantiate LabelEncoder
label_encoder = LabelEncoder()

# Apply fit_transform to the 'Sentiment' column and store in a new column
df['Sentiment_Encoded'] = label_encoder.fit_transform(df['Sentiment'])
y = df['Sentiment_Encoded']

# Display the mapping of original sentiment to encoded values and the first 5 rows with the new column
print("Original Sentiment labels and their encoded values:")
for i, sentiment in enumerate(label_encoder.classes_):
    print(f"{sentiment}: {i}")

print("\nFirst 5 rows with 'Sentiment_Encoded':")
print(df[['Sentiment', 'Sentiment_Encoded']].head())

Original Sentiment labels and their encoded values:
 Acceptance   : 0
 Acceptance      : 1
 Accomplishment : 2
 Admiration : 3
 Admiration   : 4
 Admiration    : 5
 Adoration    : 6
 Adrenaline     : 7
 Adventure : 8
 Affection    : 9
 Amazement : 10
 Ambivalence : 11
 Ambivalence     : 12
 Amusement    : 13
 Amusement     : 14
 Anger        : 15
 Anticipation : 16
 Anticipation  : 17
 Anxiety   : 18
 Anxiety         : 19
 Appreciation  : 20
 Apprehensive : 21
 Arousal       : 22
 ArtisticBurst : 23
 Awe : 24
 Awe    : 25
 Awe          : 26
 Awe           : 27
 Bad : 28
 Betrayal : 29
 Betrayal      : 30
 Bitter       : 31
 Bitterness : 32
 Bittersweet : 33
 Blessed       : 34
 Boredom : 35
 Boredom         : 36
 Breakthrough : 37
 Calmness     : 38
 Calmness      : 39
 Captivation : 40
 Celebration : 41
 Celestial Wonder : 42
 Challenge : 43
 Charm : 44
 Colorful : 45
 Compassion: 46
 Compassion    : 47
 Compassionate : 48
 Confidence    : 49
 Confident : 50
 Confusion : 51
 Confusion

## Data Splitting



In [16]:
from sklearn.preprocessing import LabelEncoder

# Helper function to categorize sentiments into 'Positive', 'Negative', or 'Neutral'
def categorize_sentiment(sentiment_label):
    sentiment_label = sentiment_label.strip().lower() # Normalize case and strip spaces

    # Explicit keywords for positive sentiments based on the previous label_encoder output
    positive_keywords = ['positive', 'acceptance', 'accomplishment', 'admiration', 'adoration', 'adrenaline', 'adventure', 'affection', 'amazement', 'amusement', 'anticipation', 'appreciation', 'arousal', 'artisticburst', 'awe', 'blessed', 'breakthrough', 'calmness', 'captivation', 'celebration', 'celestial wonder', 'charm', 'colorful', 'compassion', 'confident', 'connection', 'contentment', 'coziness', 'creative inspiration', 'creativity', 'culinary adventure', 'culinaryodyssey', 'curiosity', 'dazzle', 'dreamchaser', 'ecstasy', 'elation', 'elegance', 'empathetic', 'empowerment', 'enchantment', 'energy', 'engagement', 'enjoyment', 'enthusiasm', 'euphoria', 'excitement', 'festivejoy', 'free-spirited', 'freedom', 'friendship', 'fulfillment', 'grandeur', 'grateful', 'gratitude', 'happiness', 'happy', 'harmony', 'heartwarming', 'hope', 'hopeful', 'hypnotic', 'iconic', 'imagination', 'immersion', 'innerjourney', 'inspiration', 'inspired', 'intrigue', 'joy', 'joy in baking', 'joyfulreunion', 'kind', 'kindness', 'love', 'marvel', 'melodic', 'mesmerizing', 'mindfulness', 'mischievous', 'motivation', 'nature\'s beauty', 'optimism', 'overjoyed', 'pensive', 'playful', 'playfuljoy', 'positivity', 'pride', 'proud', 'radiance', 'reflection', 'rejuvenation', 'relief', 'renewed effort', 'resilience', 'reverence', 'romance', 'runway creativity', 'satisfaction', 'serenity', 'solace', 'solitude', 'spark', 'success', 'surprise', 'sympathy', 'tenderness', 'thrill', 'thrilling journey', 'touched', 'tranquility', 'triumph', 'vibrancy', 'whimsy', 'whispers of the past', 'winter magic', 'wonder', 'wonderment', 'yearning', 'zest']
    # Explicit keywords for negative sentiments
    negative_keywords = ['negative', 'ambivalence', 'anger', 'apprehensive', 'anxiety', 'bad', 'betrayal', 'bitter', 'bitterness', 'bittersweet', 'boredom', 'challenge', 'confusion', 'darkness', 'desolation', 'despair', 'desperation', 'devastated', 'disappointed', 'disappointment', 'disgust', 'dismissive', 'emotionalstorm', 'envious', 'envy', 'exhaustion', 'fear', 'fearful', 'frustrated', 'frustration', 'grief', 'hate', 'heartache', 'heartbreak', 'helplessness', 'intimidation', 'isolation', 'jealous', 'jealousy', 'loneliness', 'loss', 'lostlove', 'melancholy', 'miscalculation', 'numbness', 'obstacle', 'overwhelmed', 'pressure', 'regret', 'resentment', 'ruins', 'sad', 'sadness', 'shame', 'sorrow', 'suffering', 'suspense']
    # Explicit keywords for neutral sentiments
    neutral_keywords = ['neutral', 'contemplation', 'determination', 'envisioning history', 'exploration', 'ocean\'s freedom']

    if sentiment_label in positive_keywords:
        return 'Positive'
    elif sentiment_label in negative_keywords:
        return 'Negative'
    elif sentiment_label in neutral_keywords:
        return 'Neutral'
    else:
        # Fallback for any sentiment label not explicitly matched, assign to Neutral
        return 'Neutral'

# Apply the categorization function to the 'Sentiment' column
df['Sentiment_Categorized'] = df['Sentiment'].apply(categorize_sentiment)

# Instantiate LabelEncoder
label_encoder = LabelEncoder()

# Apply fit_transform to the new 'Sentiment_Categorized' column
df['Sentiment_Encoded'] = label_encoder.fit_transform(df['Sentiment_Categorized'])
y = df['Sentiment_Encoded']

# Display the mapping of original sentiment to encoded values and the first 5 rows with the new column
print("Original Sentiment labels and their encoded values (after categorization):")
for i, sentiment in enumerate(label_encoder.classes_):
    print(f"{sentiment}: {i}")

print("\nFirst 5 rows with 'Sentiment_Categorized' and 'Sentiment_Encoded':")
print(df[['Sentiment', 'Sentiment_Categorized', 'Sentiment_Encoded']].head())

print("\nValue counts for 'Sentiment_Categorized' to confirm class distribution:")
print(df['Sentiment_Categorized'].value_counts())

Original Sentiment labels and their encoded values (after categorization):
Negative: 0
Neutral: 1
Positive: 2

First 5 rows with 'Sentiment_Categorized' and 'Sentiment_Encoded':
     Sentiment Sentiment_Categorized  Sentiment_Encoded
0   Positive                Positive                  2
1   Negative                Negative                  0
2   Positive                Positive                  2
3   Positive                Positive                  2
4   Neutral                  Neutral                  1

Value counts for 'Sentiment_Categorized' to confirm class distribution:
Sentiment_Categorized
Positive    474
Negative    194
Neutral      64
Name: count, dtype: int64


In [17]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Print the shapes of the resulting sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)
print("Dataset split into training and testing sets successfully.")

Shape of X_train: (585, 2274)
Shape of X_test: (147, 2274)
Shape of y_train: (585,)
Shape of y_test: (147,)
Dataset split into training and testing sets successfully.


## Model Training




In [18]:
from sklearn.linear_model import LogisticRegression

# Instantiate the LogisticRegression model
model = LogisticRegression(max_iter=1000, random_state=42)

# Train the model
model.fit(X_train, y_train)
print("Logistic Regression model trained successfully.")

# Make predictions on the test data
y_pred = model.predict(X_test)
print("Predictions made on test data.")

Logistic Regression model trained successfully.
Predictions made on test data.


## Model Evaluation

In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Print a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.7347
Precision: 0.8119
Recall: 0.7347
F1-Score: 0.6737

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.31      0.47        39
           1       1.00      0.08      0.14        13
           2       0.71      1.00      0.83        95

    accuracy                           0.73       147
   macro avg       0.90      0.46      0.48       147
weighted avg       0.81      0.73      0.67       147

