# Unreliable tweet detection
### Authors
- Jonathan Gonzalez
- Xavier Lapointe
- Olivia Mirijello
- Tudor Nicolae Rosu
---
#### Imports

In [1]:
# General Python libraries and utilities
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from tqdm.notebook import tqdm
from collections import Counter
import pprint
from typing import Union, List

# NLP and text processing libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords, sentiwordnet as swn
from textblob import TextBlob
import spacy
from spacy.lang.en import English
from emoji import demojize
import gensim
from gensim import corpora
from gensim.models import LdaMulticore, callbacks
from gensim.models.phrases import ENGLISH_CONNECTOR_WORDS
import gensim.downloader as api
from nrclex import NRCLex

# Machine Learning and PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.tensorboard import SummaryWriter

# Visualization for NLP
import pyLDAvis
import pyLDAvis.gensim
from wordcloud import WordCloud
import matplotlib.colors as mcolors
from matplotlib.patches import Rectangle
from matplotlib.ticker import FuncFormatter

# Custom modules
import modules.preprocess as preprocess
from modules.utils import build_dataset, text_to_word2vec, evaluate
from modules.rnn_model import TextRNN
from modules.sentiment import get_sentiment
from modules.emotion import analyze_tweets_emotions, plot_emotion_distribution
# NLTK and spaCy setup
# Download each required NLTK resource without printing progress output.
for nltk_resource in ('punkt', 'stopwords', 'vader_lexicon', 'sentiwordnet'):
    nltk.download(nltk_resource, quiet=True)

# English stop words, minus the words listed in `except_words`.
except_words = {'through'}
stop_words = set(stopwords.words('english')) - except_words

# Load the medium English spaCy model with the parser and NER components disabled.
nlp = spacy.load("en_core_web_md", disable=["parser", "ner"])

# Set pprint options
# NOTE: this rebinds the name `pprint` from the module to a bound print function
# that uses a 4-space indent.
pprint = pprint.PrettyPrinter(indent=4).pprint

# Initialize VADER
sid = SentimentIntensityAnalyzer()


  from pandas.core import (
2024-04-03 22:28:54.491844: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-03 22:28:54.491868: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-03 22:28:54.492621: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-03 22:28:54.498304: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading p

#### Define file path and import raw data

In [2]:
# Relative path to the raw dataset spreadsheet that is read in the next cell.
file_path = 'archive/truth_seeker.xlsx'


In [3]:
# Read the spreadsheet, keep only rows that have a tweet, and expose the
# 'Unnamed: 0' column as 'key' (add index as key for joining df into main).
raw_data = (
    pd.read_excel(file_path)
    .dropna(subset=['tweet'])
    .rename(columns={'Unnamed: 0': 'key'})
)

In [4]:
# Inspect the loaded frame; pandas abbreviates the middle rows in the rendered output.
raw_data

Unnamed: 0,key,author,statement,target,BinaryNumTarget,manual_keywords,tweet,5_label_majority_answer,3_label_majority_answer,timestamp
0,0,D.L. Davis,End of eviction moratorium means millions of A...,1.0,1.0,"Americans, eviction moratorium",@POTUS Biden Blunders - 6 Month Update\n\nInfl...,Mostly Agree,Agree,Thu Sep 09 23:58:53 +0000 2021
1,1,D.L. Davis,End of eviction moratorium means millions of A...,1.0,1.0,"Americans, eviction moratorium",@S0SickRick @Stairmaster_ @6d6f636869 Not as m...,NO MAJORITY,Agree,Mon Aug 30 18:58:09 +0000 2021
2,2,D.L. Davis,End of eviction moratorium means millions of A...,1.0,1.0,"Americans, eviction moratorium",THE SUPREME COURT is siding with super rich pr...,Agree,Agree,Fri Aug 27 09:53:44 +0000 2021
3,3,D.L. Davis,End of eviction moratorium means millions of A...,1.0,1.0,"Americans, eviction moratorium",@POTUS Biden Blunders\n\nBroken campaign promi...,Mostly Agree,Agree,Tue Oct 05 20:37:14 +0000 2021
4,4,D.L. Davis,End of eviction moratorium means millions of A...,1.0,1.0,"Americans, eviction moratorium",@OhComfy I agree. The confluence of events rig...,Agree,Agree,Fri Aug 27 10:58:24 +0000 2021
...,...,...,...,...,...,...,...,...,...,...
134198,134193,Tom Kertscher,Joe Bidens great-grandfather Joseph J. Biden w...,0.0,0.0,"Biden, great grandfather, slave owner",Joe Biden's family owned African slaves....\n\...,Mostly Agree,Agree,Mon Jun 22 15:02:31 +0000 2020
134199,134194,Tom Kertscher,Joe Bidens great-grandfather Joseph J. Biden w...,0.0,0.0,"Biden, great grandfather, slave owner","Joe Bidens great, great grandfather was a slav...",Agree,Agree,Mon Oct 12 15:52:02 +0000 2020
134200,134195,Tom Kertscher,Joe Bidens great-grandfather Joseph J. Biden w...,0.0,0.0,"Biden, great grandfather, slave owner","@ChevyChaseToGo ""Joe Bidens great-grandfather ...",Mostly Agree,Agree,Fri Oct 16 21:02:49 +0000 2020
134201,134196,Tom Kertscher,Joe Bidens great-grandfather Joseph J. Biden w...,0.0,0.0,"Biden, great grandfather, slave owner",@JoeBiden Facts are Bidens VP Kamala Harris Gr...,NO MAJORITY,Agree,Thu Jun 17 20:30:22 +0000 2021


#### Sentiment analysis

In [5]:
# Build the sentiment feature frame with the project helper `get_sentiment`.
# The preview in the following cell shows it carries 'key' plus tone and polarity
# columns (TextBlob and VADER scores) — see modules/sentiment for the exact logic.
sentiment_df = get_sentiment(raw_data)

In [18]:
# Preview the first 40 rows of the sentiment features.
display(sentiment_df.head(40))

Unnamed: 0,key,Author,Tweet,Tone,Negative,Neutral,Positive,Polarity Score TB,Polarity Score Vader,Abs Polarity Vader,Score Difference,Target
0,0,D.L. Davis,@POTUS Biden Blunders - 6 Month Update\n\nInfl...,positive,0.316,0.631,0.053,0.5,-0.9169,0.9169,1.4169,1.0
1,1,D.L. Davis,@S0SickRick @Stairmaster_ @6d6f636869 Not as m...,positive,0.311,0.689,0.0,0.125,-0.9449,0.9449,1.0699,1.0
2,2,D.L. Davis,THE SUPREME COURT is siding with super rich pr...,positive,0.297,0.447,0.256,0.206667,-0.3147,0.3147,0.521367,1.0
3,3,D.L. Davis,@POTUS Biden Blunders\n\nBroken campaign promi...,negative,0.276,0.576,0.147,-0.4,-0.828,0.828,0.428,1.0
4,4,D.L. Davis,@OhComfy I agree. The confluence of events rig...,positive,0.241,0.701,0.058,0.442857,-0.8316,0.8316,1.274457,1.0
5,5,D.L. Davis,"I've said this before, but it really is incred...",positive,0.095,0.905,0.0,0.1875,-0.6124,0.6124,0.7999,1.0
6,6,D.L. Davis,"As many face backlogged rent payments, America...",positive,0.115,0.885,0.0,0.25,-0.6075,0.6075,0.8575,1.0
7,7,D.L. Davis,@Thomas1774Paine @JoeBiden\n#DOJ@TheJusticeDep...,neutral,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
8,8,D.L. Davis,@SocialismIsDone @TheeKHiveQueenB Its a win fo...,positive,0.163,0.533,0.304,1.0,0.5393,0.5393,0.4607,1.0
9,9,D.L. Davis,@daysofarelives2 @Sen_JoeManchin There is not ...,positive,0.11,0.784,0.106,0.8,-0.0516,0.0516,0.8516,1.0


#### Emotion Analysis

In [7]:
# List of (key, tweet) tuples; the key travels with each text so the results
# can be merged back into the main frame later.
texts = list(zip(raw_data['key'], raw_data['tweet']))

# Clean the texts with the project's parallel helper (it reports progress with a
# tqdm bar). The displayed result is not in input order, hence the explicit keys.
processed_tweets = preprocess.emotion_clean_text_parallel(texts, clean_emojis=True)

# NOTE: `texts` is a plain list, so results cannot be attached to it as a column;
# the cleaned tweets are joined back via 'key' instead.

Cleaning Texts:   0%|          | 0/134197 [00:00<?, ?it/s]

In [8]:
# Preview only the first few cleaned (key, text) pairs. Displaying the whole list
# would write every processed tweet into the notebook output and bloat the file.
display(processed_tweets[:5])


[(208,
  'strugglin americans getting taken care first especially eviction moratorium ending soon politicians show kind concern well'),
 (415,
  'sign failing administration knowingly unlawful actions taken political advantageas eviction moratorium biden told americans nt believe mandatory vaccinations dictator biden harris must removed office save america'),
 (180,
  'dems blkd fed feed schoolkids overturned eviction moratorium creating homeless blkd ui forcing people work unsafe cv conditions tried overturn aca voted arp aid americans whined abt afghan refugees ripped brown kids fams'),
 (464,
  'today america million working class americans thrown unemployment benefits eviction moratorium surging pandemic due biden administration mismanagement public health policy buddy joe got plan'),
 (0,
  'biden blunders month update inflation delta mismanagement covid kids abandoning americans afghanistan arming taliban border crisis breaking job growth abuse power many exec orders reconciliati

In [9]:
# Build the emotion feature frame from the cleaned (key, text) pairs using the
# project helper. The preview in the following cell shows 'key', 'tweet' and two
# emotion label/score column pairs ('emotion 1', 'emotion 2').
emotion_df = analyze_tweets_emotions(processed_tweets)

In [10]:
# Preview the first 40 rows of the emotion features.
display(emotion_df.head(40))

Unnamed: 0,key,tweet,emotion 1,emotion 1 score,emotion 2,emotion 2 score
0,208,strugglin americans getting taken care first e...,trust,2,anger,1
1,415,sign failing administration knowingly unlawful...,fear,4,anger,3
2,180,dems blkd fed feed schoolkids overturned evict...,fear,3,anger,2
3,464,today america million working class americans ...,anticipation,3,fear,2
4,0,biden blunders month update inflation delta mi...,fear,3,anger,2
5,419,may want use little caution common american se...,anger,2,anticipation,2
6,427,consider risking days lost income right evicti...,sadness,4,trust,2
7,202,rented former condo pandemic forced sale evict...,fear,3,sadness,3
8,417,americans panic state actor marcy celebrates b...,fear,2,anger,1
9,457,definitely tuition free native americans back ...,joy,2,trust,2


#### RNN classifier for unreliability

In [None]:
# NOTE(review): disabled preprocessing call for the RNN input, kept for reference.
# `text_edit` is not among the names imported in the import cell, so it must be
# imported (or called through its module) before re-enabling — TODO confirm where
# it is defined, or delete this cell if it is no longer needed.
# data_rnn = text_edit(raw_data,
#                     grp_num=False,
#                     rm_newline=True,
#                     rm_punctuation=True,
#                     rm_stop_words=False,
#                     lowercase=True,
#                     lemmatize=False,
#                     expand=False,
#                     html_=True,
#                     symb_to_text=False,
#                     convert_entities=False,
#                     reduce_mentions=False)

#### Main data frame

In [26]:

# Join sentiment and emotion features on the shared 'key'. The outer join keeps
# keys that appear in either frame. The emotion frame's cleaned 'tweet' column is
# dropped afterwards; the sentiment frame's 'Tweet' column remains.
main_df = (
    sentiment_df
    .merge(emotion_df, on='key', how='outer')
    .drop(columns=['tweet'])
)

In [27]:
# Preview the first 40 rows of the merged feature frame.
display(main_df.head(40))

Unnamed: 0,key,Author,Tweet,Tone,Negative,Neutral,Positive,Polarity Score TB,Polarity Score Vader,Abs Polarity Vader,Score Difference,Target,emotion 1,emotion 1 score,emotion 2,emotion 2 score
0,0,D.L. Davis,@POTUS Biden Blunders - 6 Month Update\n\nInfl...,positive,0.316,0.631,0.053,0.5,-0.9169,0.9169,1.4169,1.0,fear,3,anger,2
1,1,D.L. Davis,@S0SickRick @Stairmaster_ @6d6f636869 Not as m...,positive,0.311,0.689,0.0,0.125,-0.9449,0.9449,1.0699,1.0,anger,3,disgust,3
2,2,D.L. Davis,THE SUPREME COURT is siding with super rich pr...,positive,0.297,0.447,0.256,0.206667,-0.3147,0.3147,0.521367,1.0,fear,4,anger,3
3,3,D.L. Davis,@POTUS Biden Blunders\n\nBroken campaign promi...,negative,0.276,0.576,0.147,-0.4,-0.828,0.828,0.428,1.0,fear,6,anger,5
4,4,D.L. Davis,@OhComfy I agree. The confluence of events rig...,positive,0.241,0.701,0.058,0.442857,-0.8316,0.8316,1.274457,1.0,disgust,3,fear,3
5,5,D.L. Davis,"I've said this before, but it really is incred...",positive,0.095,0.905,0.0,0.1875,-0.6124,0.6124,0.7999,1.0,fear,2,anger,1
6,6,D.L. Davis,"As many face backlogged rent payments, America...",positive,0.115,0.885,0.0,0.25,-0.6075,0.6075,0.8575,1.0,anger,1,disgust,1
7,7,D.L. Davis,@Thomas1774Paine @JoeBiden\n#DOJ@TheJusticeDep...,neutral,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,anticipation,1,trust,1
8,8,D.L. Davis,@SocialismIsDone @TheeKHiveQueenB Its a win fo...,positive,0.163,0.533,0.304,1.0,0.5393,0.5393,0.4607,1.0,sadness,2,trust,1
9,9,D.L. Davis,@daysofarelives2 @Sen_JoeManchin There is not ...,positive,0.11,0.784,0.106,0.8,-0.0516,0.0516,0.8516,1.0,sadness,2,anticipation,1
