In [45]:
# General Imports
import re
from tqdm.notebook import tqdm
from emoji import demojize
from typing import Union, List

from transformers import pipeline
# Load the sentiment analysis pipeline
# zero shot classification
classifier = pipeline("zero-shot-classification")

# BERT-base-uncased-emotion
# classifier = pipeline("text-classification",model='bhadresh-savani/bert-base-uncased-emotion', return_all_scores=True)

# Data Analysis and visualizations
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

# Import Spacy
import spacy
from spacy.lang.en import English
nlp = spacy.load("en_core_web_md", disable=["parser", "ner"])

# Import NLTK
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download necessary NLTK data
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))
nltk.download('punkt', quiet=True)
nltk.download('sentiwordnet')
nltk.download('wordnet')

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.
[nltk_data] Downloading package sentiwordnet to /home/jon/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# Preprocessing functions

def preprocess_text(texts: Union[str, List[str], pd.Series], clean_emojis: bool = False) -> Union[str, List[str]]:
    cleaned_texts = []

    # Processing texts using Spacy pipeline
    for doc in tqdm(nlp.pipe(texts, batch_size=20), total=len(texts), desc="Cleaning Texts"):

        # Handle emojis: translate to text if not removing, else remove
        if clean_emojis:
            doc = re.sub(r':[^:]+:', '', demojize(doc.text))  # Remove emojis
        else:
            doc = demojize(doc.text)  # Convert emojis to text

        # Tokenization and preprocessing
        tokens = [token.text.lower() for token in nlp(doc) if token.text.isalpha()]

        # Removing stopwords and short tokens
        tokens = [token for token in tokens if token not in stop_words and len(token) > 1]

        cleaned_texts.append(' '.join(tokens))  # Rejoin tokens into a string

    return cleaned_texts


In [13]:
df = pd.read_csv('truth_seeker.csv')

  df = pd.read_csv('truth_seeker.csv')


In [15]:
# Fill all the NaN values in the body column with an empty string
df['tweet'] = df['tweet'].fillna('')

# Preview the loaded data 
display(df.head(10))


Unnamed: 0.1,Unnamed: 0,author,statement,target,BinaryNumTarget,manual_keywords,tweet,5_label_majority_answer,3_label_majority_answer,timestamp
0,0,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium",@POTUS Biden Blunders - 6 Month Update\n\nInfl...,Mostly Agree,Agree,Thu Sep 09 23:58:53 +0000 2021
1,1,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium",@S0SickRick @Stairmaster_ @6d6f636869 Not as m...,NO MAJORITY,Agree,Mon Aug 30 18:58:09 +0000 2021
2,2,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium",THE SUPREME COURT is siding with super rich pr...,Agree,Agree,Fri Aug 27 09:53:44 +0000 2021
3,3,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium",@POTUS Biden Blunders\n\nBroken campaign promi...,Mostly Agree,Agree,Tue Oct 05 20:37:14 +0000 2021
4,4,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium",@OhComfy I agree. The confluence of events rig...,Agree,Agree,Fri Aug 27 10:58:24 +0000 2021
5,5,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium","I've said this before, but it really is incred...",Agree,Agree,Fri Aug 27 14:00:41 +0000 2021
6,6,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium","As many face backlogged rent payments, America...",Mostly Agree,Agree,Sat Sep 18 01:50:18 +0000 2021
7,7,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium",@Thomas1774Paine @JoeBiden\n#DOJ@TheJusticeDep...,Mostly Agree,Agree,Tue Aug 10 05:28:26 +0000 2021
8,8,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium",@SocialismIsDone @TheeKHiveQueenB Its a win fo...,Agree,Agree,Tue Aug 24 01:11:52 +0000 2021
9,9,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium",@daysofarelives2 @Sen_JoeManchin There is not ...,NO MAJORITY,Agree,Wed Oct 06 05:41:01 +0000 2021


In [20]:

# Your text samples
texts = pd.DataFrame(df['tweet'], columns=['tweet'])

# Candidate labels
candidate_labels = ["happiness", "trust", "hope", "sadness", "anger", "fear"]

# Preprocess each title and track progress with tqdm
texts['processed_tweet'] = preprocess_text(texts['tweet'], clean_emojis=True)
# takes around 15 min


Cleaning Texts:   0%|          | 0/134203 [00:00<?, ?it/s]

In [21]:
display(texts.head(40))

Unnamed: 0,tweet,processed_tweet
0,@POTUS Biden Blunders - 6 Month Update\n\nInfl...,biden blunders month update inflation delta mi...
1,@S0SickRick @Stairmaster_ @6d6f636869 Not as m...,many people literally starving streets century...
2,THE SUPREME COURT is siding with super rich pr...,supreme court siding super rich property owner...
3,@POTUS Biden Blunders\n\nBroken campaign promi...,biden blunders broken campaign promises inflat...
4,@OhComfy I agree. The confluence of events rig...,agree confluence events right unprecedented af...
5,"I've said this before, but it really is incred...",said really incredibly way afghanistan complet...
6,"As many face backlogged rent payments, America...",many face backlogged rent payments americans o...
7,@Thomas1774Paine @JoeBiden\n#DOJ@TheJusticeDep...,instructing moratorium americans depend earnin...
8,@SocialismIsDone @TheeKHiveQueenB Its a win fo...,win americansim worried taking credit matter l...
9,@daysofarelives2 @Sen_JoeManchin There is not ...,never stimulus checks plan joey biden already ...


In [58]:
num_rows_to_keep = int(len(texts) * 0.30)

# Slice the DataFrame to keep only the first 30% of rows
sample_texts = texts.iloc[:num_rows_to_keep]

# Initialize lists to store the emotions and scores
emotions = []
scores = []

# Iterate through each row in the limited DataFrame, adding tqdm for progress tracking
for text in tqdm(sample_texts['processed_tweet'], desc="Analyzing Emotions"):
    # Get the classification results for this text
    result = classifier(text, candidate_labels)
    # Extract the most likely emotion and its corresponding score
    emotion = result['labels'][0]
    score = result['scores'][0]
    # Append the results to our lists
    emotions.append(emotion)
    scores.append(score)

# Add the lists as new columns in the limited DataFrame
texts['emotion'] = emotions
texts['score'] = scores

# Show the updated DataFrame
display(texts)

Analyzing Emotions:   0%|          | 0/40260 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [56]:
limited_texts

Unnamed: 0,tweet,processed_tweet
0,@POTUS Biden Blunders - 6 Month Update\n\nInfl...,biden blunders month update inflation delta mi...
1,@S0SickRick @Stairmaster_ @6d6f636869 Not as m...,many people literally starving streets century...
2,THE SUPREME COURT is siding with super rich pr...,supreme court siding super rich property owner...
3,@POTUS Biden Blunders\n\nBroken campaign promi...,biden blunders broken campaign promises inflat...
4,@OhComfy I agree. The confluence of events rig...,agree confluence events right unprecedented af...
...,...,...
995,So far the US has roughly spent 100 billion do...,far us roughly spent billion dollars reconstru...
996,@CBSNews Damaged Credibility: U.S. Military Ge...,damaged credibility military generals spending...
997,@toryboypierce @realDonaldTrump Hasn't started...,started wars israel situation monumental coole...
998,"In the end, over a trillion dollars spent in A...",end trillion dollars spent afghanistan iraq bi...
