In [5]:
!pip install wordcloud
!pip install nltk
!pip install scikit-learn
!pip install matplotlib
!pip install seaborn
!pip install -U spacy
!pip install vaderSentiment
!python -m spacy download en_core_web_sm


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# **1. Import Libraries and Load Dataset**
Import the necessary libraries and load the tweets dataset (`tweets-data.csv`) for further processing.

In [6]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Fetch the NLTK resources needed by the stop-word list and the tokenizer.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# Read the tweets CSV (path is relative to the notebook's working directory).
df = pd.read_csv('tweets-data.csv')

# Preview the first rows as the cell's output.
df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0.1,Unnamed: 0,Date Created,Number of Likes,Source of Tweet,Tweets,hashtag
0,0,2023-06-25 19:16:20+00:00,0,,@jacksonhinklle #wagner with 6.2 billion dolla...,wagner
1,1,2023-06-25 19:16:18+00:00,0,,Pobrecito es discapacitado\n#Reddetuiterosdemo...,wagner
2,2,2023-06-25 19:16:07+00:00,0,,News from the EIR Daily Alert\n\n“#Putin Addre...,wagner
3,3,2023-06-25 19:15:56+00:00,0,,It's Messi day #Messi𓃵 #Messi36 #Russia #bigst...,wagner
4,4,2023-06-25 19:15:54+00:00,0,,Il passaggio chiave di Machiavelli era questo ...,wagner


# **2. Define Sentiment Function**
**We define the get_sentiment(text) function to return a tuple (label, score), where:**

label indicates the sentiment category: "positive", "neutral", or "negative"

score is the compound sentiment score calculated using the VADER sentiment analyzer

In [8]:
# One analyzer instance is created here and reused by every get_sentiment call.
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text: str, threshold: float = 0.05) -> tuple:
    """
    Compute VADER sentiment for a given text.

    Args:
        text (str): the text to score.
        threshold (float): symmetric cut-off applied to the compound score.
            Scores >= +threshold are labelled 'positive', scores
            <= -threshold are labelled 'negative', anything in between is
            'neutral'. Expected to be a positive number; defaults to 0.05.
    Returns:
        sentiment_label (str): 'positive', 'neutral', or 'negative'
        compound_score (float): the VADER compound score
    """
    compound = analyzer.polarity_scores(text)['compound']
    if compound >= threshold:
        return 'positive', compound
    if compound <= -threshold:
        return 'negative', compound
    return 'neutral', compound

# **3. Clean Tweets Text**
**We define the clean_text(text) function to perform the following preprocessing steps:**

Convert text to lowercase

Remove URLs, mentions (e.g., @user), and hashtags (e.g., #tag)

Eliminate non-alphabetic characters

Tokenize the text and remove English stopwords

Reconstruct and return the cleaned sentence

In [9]:
# English stop words from NLTK, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def clean_text(text: str) -> str:
    """
    Clean tweet text by:
      - lowercasing
      - removing URLs, mentions, hashtags
      - removing non-letter characters
      - tokenizing and removing stopwords
      - rejoining cleaned tokens

    Args:
        text (str): the raw tweet. Non-string values (e.g. the NaN pandas
            uses for a missing cell) are treated as an empty tweet.
    Returns:
        str: the cleaned tokens joined by single spaces ('' if nothing is left).
    """
    # A missing tweet arrives from pandas as NaN (a float), which has no .lower().
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # The dot after "www" is escaped and anchored on a word boundary so that
    # ordinary text containing "www" (e.g. "awww so cute") is not mistaken
    # for a URL and deleted.
    text = re.sub(r'http\S+|\bwww\.\S+', '', text)
    # Drop @mentions and #hashtags, including the word attached to them.
    text = re.sub(r'@\w+|#\w+', '', text)
    # Keep only lowercase ASCII letters and whitespace.
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    return ' '.join(tok for tok in tokens if tok not in stop_words)

# Tokenizer data fetched before cleaning; presumably what word_tokenize
# (called inside clean_text) needs on newer NLTK releases — TODO confirm.
nltk.download('punkt_tab')
# Clean every tweet and keep the result next to the raw text.
df['cleaned_text'] = df['Tweets'].map(clean_text)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# **4. Apply Sentiment Function and Add Columns**
We apply the get_sentiment function to the cleaned_text column, unpack the resulting tuples, and store the outputs in two new columns: sentiment_label and sentiment_score.

In [10]:
# Score each cleaned tweet once; every element is a (label, score) tuple.
# NOTE(review): VADER is documented as using capitalization, punctuation and
# negation words, all of which clean_text strips — consider scoring the raw
# 'Tweets' column instead; confirm before changing.
sentiments = df['cleaned_text'].map(get_sentiment)
# Split the tuples into one column per component.
df['sentiment_label'] = sentiments.map(lambda pair: pair[0])
df['sentiment_score'] = sentiments.map(lambda pair: pair[1])

df.head()

Unnamed: 0.1,Unnamed: 0,Date Created,Number of Likes,Source of Tweet,Tweets,hashtag,cleaned_text,sentiment_label,sentiment_score
0,0,2023-06-25 19:16:20+00:00,0,,@jacksonhinklle #wagner with 6.2 billion dolla...,wagner,billion dollar,neutral,0.0
1,1,2023-06-25 19:16:18+00:00,0,,Pobrecito es discapacitado\n#Reddetuiterosdemo...,wagner,pobrecito es discapacitado,neutral,0.0
2,2,2023-06-25 19:16:07+00:00,0,,News from the EIR Daily Alert\n\n“#Putin Addre...,wagner,news eir daily alert addressed people armed ju...,positive,0.296
3,3,2023-06-25 19:15:56+00:00,0,,It's Messi day #Messi𓃵 #Messi36 #Russia #bigst...,wagner,messi day,neutral,0.0
4,4,2023-06-25 19:15:54+00:00,0,,Il passaggio chiave di Machiavelli era questo ...,wagner,il passaggio chiave di machiavelli era questo ...,neutral,0.0
