In [4]:
!pip install wordcloud
!pip install nltk
!pip install scikit-learn
!pip install matplotlib
!pip install seaborn
!pip install -U spacy
!pip install vaderSentiment
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m98.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# **1. Load libraries & data**

In [5]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import pipeline

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

df = pd.read_csv('tweets-data.csv')

df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Unnamed: 0.1,Unnamed: 0,Date Created,Number of Likes,Source of Tweet,Tweets,hashtag
0,0,2023-06-25 19:16:20+00:00,0,,@jacksonhinklle #wagner with 6.2 billion dolla...,wagner
1,1,2023-06-25 19:16:18+00:00,0,,Pobrecito es discapacitado\n#Reddetuiterosdemo...,wagner
2,2,2023-06-25 19:16:07+00:00,0,,News from the EIR Daily Alert\n\n“#Putin Addre...,wagner
3,3,2023-06-25 19:15:56+00:00,0,,It's Messi day #Messi𓃵 #Messi36 #Russia #bigst...,wagner
4,4,2023-06-25 19:15:54+00:00,0,,Il passaggio chiave di Machiavelli era questo ...,wagner


# **2. Clean the Tweets Text**
Perform preprocessing on the tweets by converting to lowercase, removing URLs, mentions, hashtags, non-alphabetic characters, and stopwords, then reconstructing the cleaned text.

In [6]:
stop_words = set(stopwords.words('english'))

def clean_text(text: str) -> str:
    """
    Clean tweet text by:
      - lowercasing
      - removing URLs, mentions, hashtags
      - removing non-letter characters
      - tokenizing and removing stopwords
      - rejoining cleaned tokens into a sentence
    """
    text = text.lower()
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'@\w+|#\w+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [tok for tok in tokens if tok not in stop_words and len(tok) > 1]
    return ' '.join(tokens)


df['cleaned_text'] = df['Tweets'].astype(str).apply(clean_text)

df[['Tweets', 'cleaned_text']].head()

Unnamed: 0,Tweets,cleaned_text
0,@jacksonhinklle #wagner with 6.2 billion dolla...,billion dollar
1,Pobrecito es discapacitado\n#Reddetuiterosdemo...,pobrecito es discapacitado
2,News from the EIR Daily Alert\n\n“#Putin Addre...,news eir daily alert addressed people armed ju...
3,It's Messi day #Messi𓃵 #Messi36 #Russia #bigst...,messi day
4,Il passaggio chiave di Machiavelli era questo ...,il passaggio chiave di machiavelli era questo ...


# **3. Sentiment Analysis Using a Transformers Pipeline**
We set up the sentiment-analysis pipeline utilizing a DistilBERT model pre-trained on the SST-2 dataset, and then process the cleaned tweets in batches to determine their sentiment.

In [7]:
from transformers import pipeline
classifier = pipeline('sentiment-analysis')

results = classifier(
    df['cleaned_text'].tolist(),
    batch_size=32,
    truncation=True,
    max_length=512
)

df['sentiment_label'] = [res['label'] for res in results]
df['sentiment_score'] = [res['score'] for res in results]

df.head()

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


Unnamed: 0.1,Unnamed: 0,Date Created,Number of Likes,Source of Tweet,Tweets,hashtag,cleaned_text,sentiment_label,sentiment_score
0,0,2023-06-25 19:16:20+00:00,0,,@jacksonhinklle #wagner with 6.2 billion dolla...,wagner,billion dollar,POSITIVE,0.999244
1,1,2023-06-25 19:16:18+00:00,0,,Pobrecito es discapacitado\n#Reddetuiterosdemo...,wagner,pobrecito es discapacitado,NEGATIVE,0.574673
2,2,2023-06-25 19:16:07+00:00,0,,News from the EIR Daily Alert\n\n“#Putin Addre...,wagner,news eir daily alert addressed people armed ju...,NEGATIVE,0.878521
3,3,2023-06-25 19:15:56+00:00,0,,It's Messi day #Messi𓃵 #Messi36 #Russia #bigst...,wagner,messi day,NEGATIVE,0.999102
4,4,2023-06-25 19:15:54+00:00,0,,Il passaggio chiave di Machiavelli era questo ...,wagner,il passaggio chiave di machiavelli era questo ...,POSITIVE,0.850299
