# Examine features in text dataset

Looking at the features we can find in a typical set of text messages

In [1]:
# import libraries, add some helper functions, set data filename
import json
import pandas as pd
from pandas.io.json import json_normalize
import clean_twitter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import spacy
from langdetect import detect
import nltk
from nltk import word_tokenize          
from nltk.stem.porter import PorterStemmer
from stop_words import get_stop_words
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import seaborn as sns


#pd.set_option('display.max_rows', 500)

# compare columns
def map_columns(df, rows='user', cols='ymdh', vals='tweet_url'):
    return pd.pivot_table(df, index=rows, columns=cols, values=vals, aggfunc='count').fillna('')

# word counts
def get_wordcounts(count_vect, word_counts): 
    ''' Ungodly hack to look at word counts in scikitlearn CountVectorizer '''
    df = pd.DataFrame(word_counts.A, columns=count_vect.get_feature_names()).transpose()
    df =  pd.DataFrame(df.transpose().sum()).sort_values(0, ascending=False)
    
    # cleaner way to do this - nb is unchecked, worried it returns different values...
    dfcv = pd.DataFrame.from_dict(count_vect.vocabulary_, orient='index', columns=['count'])
    dfcv.sort_values('count', ascending=False)
    return df

# languages
def detect_langs(df, col='Text', verbose=False):
    langarray = []
    for i in df[col].to_list():
        #print(i)
        try:
            # Language Detection
            lang = detect(i) 
        except:
            lang = 'cant_process'
        langarray += [lang]
        #print(lang)

    df['langarray'] = langarray
    return df

# Named entities
def get_ents(text):
    doc = nlp(text) 
    ents = {ent.text:ent.label_ for ent in doc.ents}
    print(ents)
    return ents

# word stems
# based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems
########

def get_subjectivity(text):
    return TextBlob(text).sentiment.subjectivity
def get_polarity(text):
    return TextBlob(text).sentiment.polarity



nlp = spacy.load('en_core_web_sm') 
nltk.download('punkt') #don't have to do this every time

fin = 'data/andypateltwitter/20211019172839_stopthesteal/20211019172839_stopthesteal_tweets.json'

[nltk_data] Downloading package punkt to /Users/sara/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load in datasets

In [2]:
# Use the json libary to look at the raw dataset
with open(fin, "r") as read_file:
    tweetjson = json.load(read_file)

tweetjson

{'https://twitter.com/Theon_Orbis/status/1450571068473626629': 'RT @Tam_Resist: US sanctioned Deripaska for *meddling in the 2016 election-Tfg lifted those sanctions as 2016 was the precursor of #StopThe…',
 'https://twitter.com/smimsitis/status/1450565418343559170': 'RT @ddanpereira: The #FBI has raided the home of Oleg Deripaska, Russian Billionaire &amp; #Putin ally, in DC! #PaulManafort owed a lot of mone…',
 'https://twitter.com/ElkeHollings/status/1450544622527156227': 'RT @ddanpereira: The #FBI has raided the home of Oleg Deripaska, Russian Billionaire &amp; #Putin ally, in DC! #PaulManafort owed a lot of mone…',
 'https://twitter.com/PumpkinBlend/status/1450543853208018952': '@kitsumitsu46 @ashleyloveslmg wow what happened to my vote #stopthesteal',
 'https://twitter.com/nola_chwica/status/1450543275937435649': 'RT @ZhiZhuWeb: "Republicans’ #StopTheSteal campaign ... is also a massive and devastatingly effective deployment of Russian-style informati…',
 'https://twitter.com/aur

In [3]:
# use clean_twitter library to extract information in the AndyPatel formatted dataset into a csv
dftweets = clean_twitter.tweet_json_to_csv(fin)
dftweets

data/andypateltwitter/20211019172839_stopthesteal/20211019172839_stopthesteal_tweets.json has 2098 tweets


Unnamed: 0,tweet_url,text,user,tweet_id,timestamp,datetime,date,hour,ymdh,retweet,retweeting
0,https://twitter.com/Theon_Orbis/status/1450571...,RT @Tam_Resist: US sanctioned Deripaska for *m...,Theon_Orbis,1450571068473626629,1634678067688,2021-10-19 21:14:27.688,2021-10-19,21,2021101921,True,@Tam_Resist
1,https://twitter.com/smimsitis/status/145056541...,RT @ddanpereira: The #FBI has raided the home ...,smimsitis,1450565418343559170,1634676720592,2021-10-19 20:52:00.592,2021-10-19,20,2021101920,True,@ddanpereira
2,https://twitter.com/ElkeHollings/status/145054...,RT @ddanpereira: The #FBI has raided the home ...,ElkeHollings,1450544622527156227,1634671762483,2021-10-19 19:29:22.483,2021-10-19,19,2021101919,True,@ddanpereira
3,https://twitter.com/PumpkinBlend/status/145054...,@kitsumitsu46 @ashleyloveslmg wow what happene...,PumpkinBlend,1450543853208018952,1634671579063,2021-10-19 19:26:19.063,2021-10-19,19,2021101919,False,
4,https://twitter.com/nola_chwica/status/1450543...,"RT @ZhiZhuWeb: ""Republicans’ #StopTheSteal cam...",nola_chwica,1450543275937435649,1634671441431,2021-10-19 19:24:01.431,2021-10-19,19,2021101919,True,@ZhiZhuWeb
...,...,...,...,...,...,...,...,...,...,...,...
2093,https://twitter.com/golfnutallways/status/1447...,RT @gbrough10: Demand a new election #StopTheS...,golfnutallways,1447385163332214785,1633918488711,2021-10-11 02:14:48.711,2021-10-11,2,2021101102,True,@gbrough10
2094,https://twitter.com/KW07038391/status/14473850...,RT @gbrough10: Demand a new election #StopTheS...,KW07038391,1447385078435352576,1633918468470,2021-10-11 02:14:28.470,2021-10-11,2,2021101102,True,@gbrough10
2095,https://twitter.com/Chi2soCal2LV/status/144738...,RT @gbrough10: Demand a new election #StopTheS...,Chi2soCal2LV,1447383774459666434,1633918157578,2021-10-11 02:09:17.578,2021-10-11,2,2021101102,True,@gbrough10
2096,https://twitter.com/Lola36405979/status/144738...,#StopTheSteal https://t.co/FfvNpjXKvA,Lola36405979,1447382501987659781,1633917854197,2021-10-11 02:04:14.197,2021-10-11,2,2021101102,False,


## Do simple analysis of raw text and non-text content

In [4]:
# Are the tweets unique? 
text_counts = dftweets['text'].value_counts().reset_index()
text_counts

Unnamed: 0,index,text
0,RT @gbrough10: Kimberly Guilfoyle asking for f...,742
1,RT @gbrough10: Demand a new election #StopTheS...,317
2,RT @RollingStone: Trump is laying the groundwo...,42
3,RT @Tam_Resist: CancunCruz plays to an audienc...,40
4,RT @BombshellDAILY: BIG LIE CREATED FOR BRUISE...,39
...,...,...
553,@patriottakes https://t.co/dGEAM4ma7w,1
554,It’s crazy when the very small amount of peopl...,1
555,Yeah dumb ass it’s called saving America from ...,1
556,#trumpwon #stopthesteal #azaudit #patriots wri...,1


In [5]:
# Are there any superspreaders in the top texts? e.g. how concentrated is this (how high's the Gini index?)
dftweets[dftweets['text'].isin(text_counts['index'].to_list()[:5])]['user'].value_counts()

amy_penkalski      2
DAVIDCA52134525    2
DesdeFlorida       2
melissadittohed    2
saangus            2
                  ..
MTexas0640         1
RetCorr            1
sickofinsanity     1
TethLL             1
clbrownjr          1
Name: user, Length: 1140, dtype: int64

In [6]:
# Related question: are the non-retweet tweets unique? 
text_counts[~text_counts['index'].str.startswith('RT ')]

Unnamed: 0,index,text
35,@Minecraft #STOPTHESTEAL,3
49,@Minecraft #stopthesteal,2
50,#StoptheSteal,2
55,@Minecraft #StopTheSteal,2
64,@EWErickson @JonahDispatch “It’s just a phase ...,1
...,...,...
553,@patriottakes https://t.co/dGEAM4ma7w,1
554,It’s crazy when the very small amount of peopl...,1
555,Yeah dumb ass it’s called saving America from ...,1
556,#trumpwon #stopthesteal #azaudit #patriots wri...,1


In [7]:
# Are there any particularly prolific accounts? 
user_counts = dftweets['user'].value_counts().reset_index()
user_counts

Unnamed: 0,index,user
0,ChemicalEyeGuy,19
1,BigBoa,15
2,Hig44,12
3,Tam_Resist,10
4,SqueezeTokenOG,8
...,...,...
1783,RockDad1990,1
1784,JamesJo67513483,1
1785,kida_hiroyuki,1
1786,davi88427104,1


In [8]:
# What are those accounts talking about? 
dftweets[dftweets['user'].isin(user_counts['index'].to_list()[:5])].sort_values(['user', 'date'])[['text', 'user']]

Unnamed: 0,text,user
448,Because normal people don't look at it as bein...,BigBoa
449,You always make a habit of shooting unarmed pe...,BigBoa
450,Of course they knew. And they knew they were g...,BigBoa
452,Why was this murderer given a gold medal by th...,BigBoa
453,"Right on. ANY OTHER such scenario, Byrd White,...",BigBoa
...,...,...
1623,What’s GovAbbott going doing next? Forbid priv...,Tam_Resist
1655,Did KY AG Cameron participate in GOP AG #StopT...,Tam_Resist
494,@marcia4justice Everything they don’t like is ...,Tam_Resist
143,CancunCruz plays to an audience of 1 &amp; pro...,Tam_Resist


## Analyse the text

In [9]:
# have a look at languages
dftweets = detect_langs(dftweets, 'text')
dftweets['langarray'].value_counts()

en              2059
fr                18
de                 6
cs                 6
cant_process       2
nl                 2
fi                 1
es                 1
af                 1
it                 1
da                 1
Name: langarray, dtype: int64

In [10]:
user_date = map_columns(dftweets, 'user', 'ymdh', 'tweet_url')
user_lang = map_columns(dftweets, 'user', 'langarray', 'tweet_url')
# ax = sns.heatmap(user_lang)
user_lang

langarray,af,cant_process,cs,da,de,en,es,fi,fr,it,nl
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
007Danger007,,,,,,1,,,,,
0_hank3,,,,,,1,,,,,
151_gene,,,,,,1,,,,,
1969bird,,,,,,1,,,,,
1984_christmas,,,,,,1,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
ypegasus7,,,,,,1,,,,,
zeus4ever,,,,,,1,,,,,
ziffal,,,,,,1,,,,,
zipthwung,,,,,,1,,,,,


In [11]:
# # Can plot that as a heatmap, but would have been easier to look at it a different way. 
# sums = dftweets.groupby(by=['user', 'langarray'], axis=0).count()
# sums.sort_values('tweet_url', ascending=False)
# Borked - fix later. 

In [12]:
dftweets[dftweets['langarray'] != 'en']

Unnamed: 0,tweet_url,text,user,tweet_id,timestamp,datetime,date,hour,ymdh,retweet,retweeting,langarray
154,https://twitter.com/HugoVlaenderen/status/1450...,"@er_schild NV-A is een piramidesysteem, iedere...",HugoVlaenderen,1450014901792124929,1634545467218,2021-10-18 08:24:27.218,2021-10-18,8,2021101808,False,,nl
168,https://twitter.com/RikkuGaetani90/status/1449...,@libertaddigital Este majete de izquierdas de ...,RikkuGaetani90,1449932206516936707,1634525751128,2021-10-18 02:55:51.128,2021-10-18,2,2021101802,False,,es
175,https://twitter.com/aegon555/status/1449910188...,https://t.co/fVNVgtsyEX,aegon555,1449910188857864197,1634520501709,2021-10-18 01:28:21.709,2021-10-18,1,2021101801,False,,cant_process
204,https://twitter.com/HugoPRMP/status/1449743704...,"RT @abandon2famille: Re- en sept, invitation p...",HugoPRMP,1449743704462438408,1634480808736,2021-10-17 14:26:48.736,2021-10-17,14,2021101714,True,@abandon2famille,fr
211,https://twitter.com/Project9Phoenix/status/144...,"RT @abandon2famille: Re- en sept, invitation p...",Project9Phoenix,1449683494943199234,1634466453668,2021-10-17 10:27:33.668,2021-10-17,10,2021101710,True,@abandon2famille,fr
251,https://twitter.com/abandon2famille/status/144...,"Re- en sept, invitation par le Danube Institut...",abandon2famille,1449456523734814730,1634412339515,2021-10-16 19:25:39.515,2021-10-16,19,2021101619,False,,fr
266,https://twitter.com/Lordstorm6/status/14494213...,@Minecraft #STOPTHESTEAL,Lordstorm6,1449421363165417479,1634403956582,2021-10-16 17:05:56.582,2021-10-16,17,2021101617,False,,de
269,https://twitter.com/chickeneater357/status/144...,@Minecraft #STOPTHESTEAL,chickeneater357,1449421031349891076,1634403877471,2021-10-16 17:04:37.471,2021-10-16,17,2021101617,False,,de
270,https://twitter.com/TMcdede/status/14494209334...,@Minecraft #STOPTHESTEAL,TMcdede,1449420933475803142,1634403854136,2021-10-16 17:04:14.136,2021-10-16,17,2021101617,False,,de
333,https://twitter.com/kennywolf9/status/14492150...,RT @VMadge: @ChristopherHahn @debbiez49 https:...,kennywolf9,1449215076808990730,1634354774082,2021-10-16 03:26:14.082,2021-10-16,3,2021101603,True,@VMadge,de


In [13]:
# Use scikit-learn's CountVectorizer to split the words in the text column
# this produces tuples: (message id, word id) with a count

count_vect = CountVectorizer(stop_words='english')
word_counts = count_vect.fit_transform(dftweets['text'].str.lower())
reversedict = dict((v, k) for k, v in count_vect.vocabulary_.items()) # not necessary but you might want to check words

print('{}'.format(word_counts))
print('{}'.format(count_vect.vocabulary_))

  (0, 2657)	1
  (0, 2983)	1
  (0, 2683)	1
  (0, 827)	1
  (0, 1960)	1
  (0, 28)	2
  (0, 959)	1
  (0, 3016)	1
  (0, 1814)	1
  (0, 2684)	1
  (0, 2375)	1
  (0, 2906)	1
  (1, 2657)	1
  (1, 827)	1
  (1, 773)	1
  (1, 1105)	1
  (1, 2486)	1
  (1, 1429)	1
  (1, 2192)	1
  (1, 2669)	1
  (1, 391)	1
  (1, 208)	1
  (1, 2451)	1
  (1, 187)	1
  (1, 772)	1
  :	:
  (2094, 2915)	1
  (2094, 1453)	1
  (2094, 2109)	1
  (2094, 1248)	1
  (2094, 798)	1
  (2094, 1734)	1
  (2095, 2657)	1
  (2095, 959)	1
  (2095, 2915)	1
  (2095, 1453)	1
  (2095, 2109)	1
  (2095, 1248)	1
  (2095, 798)	1
  (2095, 1734)	1
  (2096, 2915)	1
  (2096, 1453)	1
  (2096, 1117)	1
  (2097, 2657)	1
  (2097, 959)	1
  (2097, 2915)	1
  (2097, 1453)	1
  (2097, 2109)	1
  (2097, 1248)	1
  (2097, 798)	1
  (2097, 1734)	1
{'rt': 2657, 'tam_resist': 2983, 'sanctioned': 2683, 'deripaska': 827, 'meddling': 1960, '2016': 28, 'election': 959, 'tfg': 3016, 'lifted': 1814, 'sanctions': 2684, 'precursor': 2375, 'stopthe': 2906, 'ddanpereira': 773, 'fbi': 1105,

In [14]:
# Look at the top words in this dataset
get_wordcounts(count_vect, word_counts)[:20]

Unnamed: 0,0
rt,1659
election,1181
gbrough10,1059
stopthesteal,873
2020,791
states,750
50,749
forensic,746
national,746
presidential,746


In [15]:
# Use CountVectorizer to look at the most common *pairs* of words in this dataset
count_vectn = CountVectorizer(ngram_range =(2, 2), stop_words='english')
word_countsn = count_vectn.fit_transform(dftweets['text'])
get_wordcounts(count_vectn, word_countsn)[:20]

Unnamed: 0,0
rt gbrough10,1059
presidential election,746
50 states,744
guilfoyle asking,743
kimberly guilfoyle,743
asking forensic,743
2020 national,743
states 2020,743
audits 50,743
national presidential,743


In [16]:
# Adding (a bit of) word understanding

count_vect3 = CountVectorizer(tokenizer=tokenize, stop_words='english') 
word_counts3 = count_vect3.fit_transform(dftweets['text'])
get_wordcounts(count_vect3, word_counts3)[:20]



Unnamed: 0,0
#,3024
:,2456
@,2288
rt,1659
.,1509
elect,1189
",",1176
gbrough10,1059
stopthest,867
2020,789


In [17]:
# Apply named entity recognition
sentence = "Bill Gates is selling 5G Covid19 data to Microsoft"
doc = nlp(sentence) 
for ent in doc.ents: 
    print(ent.text, ent.label_) 
{ent.text:ent.label_ for ent in doc.ents}

Bill Gates PERSON
5 CARDINAL
Microsoft ORG


{'Bill Gates': 'PERSON', '5': 'CARDINAL', 'Microsoft': 'ORG'}

In [18]:
dftweets['ents'] = dftweets['text'].apply(get_ents)
dftweets

{'US': 'GPE', 'Deripaska': 'ORG', '2016': 'DATE', 'StopThe': 'ORG'}
{'RT @ddanpereira': 'PERSON', 'FBI': 'ORG', 'Oleg Deripaska': 'PERSON', 'Russian Billionaire &amp': 'ORG', 'Putin': 'PERSON', 'DC': 'GPE', 'PaulManafort': 'MONEY'}
{'RT @ddanpereira': 'PERSON', 'FBI': 'ORG', 'Oleg Deripaska': 'PERSON', 'Russian Billionaire &amp': 'ORG', 'Putin': 'PERSON', 'DC': 'GPE', 'PaulManafort': 'MONEY'}
{'@kitsumitsu46 @ashleyloveslmg': 'ORG', '#': 'CARDINAL'}
{'RT @ZhiZhuWeb': 'ORG', 'Republicans': 'NORP', 'Russian': 'NORP'}
{'Trump': 'ORG'}
{"Trump's": 'ORG'}
{"Trump's": 'ORG'}
{'RT @ddanpereira': 'PERSON', 'FBI': 'ORG', 'Oleg Deripaska': 'PERSON', 'Russian Billionaire &amp': 'ORG', 'Putin': 'PERSON', 'DC': 'GPE', 'PaulManafort': 'MONEY'}
{'US': 'GPE', 'Deripaska': 'ORG', '2016': 'DATE', 'StopThe': 'ORG'}
{'Albertastan': 'PRODUCT', 'Cowtown': 'GPE'}
{'RT @ddanpereira': 'PERSON', 'FBI': 'ORG', 'Oleg Deripaska': 'PERSON', 'Russian Billionaire &amp': 'ORG', 'Putin': 'PERSON', 'DC': 'GPE', 'PaulMan

{'CancunCruz': 'ORG', '1': 'CARDINAL', 'Dominionist': 'ORG'}
{'RT @ZhiZhuWeb': 'ORG', 'Republicans': 'NORP', 'Russian': 'NORP'}
{'CancunCruz': 'ORG', '1': 'CARDINAL', 'Dominionist': 'ORG'}
{'CancunCruz': 'ORG', '1': 'CARDINAL', 'Dominionist': 'ORG'}
{'CancunCruz': 'ORG', '1': 'CARDINAL', 'Dominionist': 'ORG'}
{'RT @DanCoxEsq': 'ORG', 'January 1, 2021': 'DATE', 'The United States': 'GPE'}
{'CancunCruz': 'ORG', '1': 'CARDINAL', 'Dominionist': 'ORG'}
{'CancunCruz': 'ORG', '1': 'CARDINAL', 'Dominionist': 'ORG'}
{'Loser #SeditiousTrump': 'PERSON', '#StopTheSteal #': 'MONEY'}
{'RT @ParkerMolloy': 'ORG'}
{'RT @ParkerMolloy': 'ORG'}
{'CancunCruz': 'ORG', '1': 'CARDINAL', 'Dominionist': 'ORG'}
{'RT @ParkerMolloy': 'ORG'}
{'RT @ParkerMolloy': 'ORG'}
{'RT @ParkerMolloy': 'ORG'}
{'RT @ParkerMolloy': 'ORG'}
{'CancunCruz': 'ORG', '1': 'CARDINAL', 'Dominionist': 'ORG'}
{}
{'SavingDemocracy': 'ORG', 'House &amp': 'ORG', 'Senate': 'ORG', 'years': 'DATE'}
{'SavingDemocracy': 'ORG', 'House &amp': 'ORG', 

{'Publix': 'NORP', 'almost $1 million': 'MONEY', 'Trump': 'ORG', 'GOP': 'ORG', '2019': 'DATE', '2020': 'DATE', '#Publix': 'ORG', 'Robocall': 'PERSON', '#StopTheSteal #': 'MONEY', 'Capitol #': 'ORG'}
{}
{'RT @Hig44': 'ORG', '#WaronRugs': 'MONEY', '#': 'CARDINAL'}
{'#StopTheSteal &': 'MONEY', 'https://t.co/eQNIbyE0rV': 'PERSON'}
{'CovidVaccine': 'ORG', 'Israel': 'GPE', 'USA': 'GPE', 'American': 'NORP', '🇸': 'PERSON'}
{'RT @MusicBombshell': 'ORG', '#treason #': 'MONEY', '#': 'CARDINAL'}
{'RT @MusicBombshell': 'ORG', '#treason #': 'MONEY', '#': 'CARDINAL'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Danube Institute': 'ORG', 'Ed Martin': 'PERSON', 'de l’Eagle Forum Education &amp': 'ORG', 'Legal Defense Fund &amp': 'ORG', 'membre': 'PERSON', 'du Council for National Policy': 'ORG', 'CNP': 'ORG'}
{'RT @Hig44': 'ORG', '#WaronRugs': 'MONEY'}
{'RT @ed_george': 'PERSON', 'Trump &amp': 'ORG', 'MitchMcConnell': 'ORG'}
{'RT @Hig44': 'ORG', '#WaronRugs': 'MONE

{'50': 'CARDINAL', 'Toronto': 'GPE', '#StopTheSteal #': 'MONEY'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'hundreds': 'CARDINAL', 'Georgia': 'GPE', '#StopTheSteal of': 'MONEY', 'American': 'NORP', '🇸': 'PERSON', 'BananaRepublicans': 'NORP'}
{'RT @ChemicalEyeGuy': 'ORG', '#': 'CARDINAL'}
{'America': 'GPE'}
{'RT @RollingStone': 'ORG', 'Michigan': 'GPE', '2024': 'DATE'}
{'#StopTheSteal https://t.co/7spWzt9oBR': 'MONEY'}
{'Dodgers': 'PERSON', '#': 'CARDINAL'}
{'Taliban': 'ORG'}
{}
{'#StopTheSteal https://t.co/3m5mUb3J6R': 'ORG'}
{'DODGERS STADIUM': 'FAC'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'RT @NetNewsBuzz: &gt;&gt;&gt': 'ORG', 'Trump Is Laying the Groundwork to Steal Michigan': 'ORG', '2024': 'DATE', 'audi': 'ORG'}
{'Kimberly Guilfoyle': 'PERS

{'@CharlieKirk11': 'PERSON', 'CNP': 'ORG', 'November #': 'DATE', 'Sto': 'MONEY'}
{'November': 'DATE', 'CMD': 'ORG', 'CNP': 'ORG', 'Ali Alexander': 'PERSON', 'Ali Akbar': 'PERSON', 'StoptheSte': 'GPE'}
{'Trump Is Laying the Groundwork to Steal Michigan': 'ORG', '2024': 'DATE'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Trump Is Laying the Groundwork to Steal Michigan': 'ORG', '2024': 'DATE'}
{'Trump Is Laying the Groundwork to Steal Michigan': 'ORG', '2024': 'DATE'}
{'Trump Is Laying the Groundwork to Steal Michigan': 'ORG', '2024': 'DATE'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'2': 'CARDINAL', 'Biden': 'PERSON', '#StopTheSteal #CessateTheInsantiy #': 'MONEY'}
{'Kim

{'RT @ChemicalEyeGuy': 'ORG', 'TinaPeters’': 'PRODUCT'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'TinaPeters’': 'PRODUCT', 'U.S.': 'GPE', '#StopTheSteal': 'MONEY', 'American': 'NORP', '🇸': 'PERSON', 'Colorado': 'GPE'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'1': 'CARDINAL', '4.7 Million': 'CARDINAL', 'Texas': 'GPE', 'Drop-Off Locations': 'ORG', '@forbes': 'ORG', 'NewJersey': 'PERSON', '#StoptheSteal #': 'MONEY', '#Republican #': 'MONEY', 'Ciatarelli': 'PERSON', '#Malls #': 'MONEY', 'Virginia': 'GPE', '#': 'CARDINAL', 'Pennsylvania': 'GPE'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Colorado': 'GPE', '\u2066@GOP\u2069': 'GPE'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}


{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{}
{}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'RT @RobHNY': 'PERSON'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'@Backedwithdata': 'ORG', 'IOWA': 'GPE', '#StopTheSteal #': 'MONEY', 'CyberNinjas': 'ORG'}
{'At least 10': 'CARDINAL', 'the Supreme Court': 'ORG', 'HangingChad': 'MONEY'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National President

{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERS

{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'RT @RollingStone': 'ORG', 'Michigan': 'GPE', '2024': 'DATE'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Republicans': 'NORP'}
{'RT @RollingStone': 'ORG', 'Michigan': 'GPE', '2024': 'DATE'}
{'@njdotcom': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'RT @RollingStone': 'ORG', 'Michigan': 'GPE', '2024': 'DATE'}
{'#': 'CARDINAL'}
{'RT @RollingStone': 'ORG', 'Michigan': 'GPE', '2024': 'DATE'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'RT @RollingStone': 'ORG', 'Michigan

{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERS

{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'RT @Hig44: PUMP &amp': 'ORG', 'DUMP WALLET ALERT': 'ORG', 'Wallet Watcher': 'PERSON', 'one': 'CARDINAL'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'

{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERSON', '2020': 'DATE', 'National Presidential': 'ORG'}
{'Kimberly Guilfoyle': 'PERS

{'Trump': 'ORG', 'America': 'GPE', 'MAGA': 'ORG', '#TrumpDesantis2024 #StopTheSteal': 'MONEY'}
{'Biden': 'PERSON', 'MAGA': 'ORG', '#StopTheSteal #TrumpDesantis2024': 'MONEY'}
{'Taliban': 'ORG'}
{'RT @mscohen19': 'PERSON', '@NJSpotlightNews': 'ORG', 'NJ': 'ORG'}
{'RT @mscohen19': 'PERSON', '@NJSpotlightNews': 'ORG', 'NJ': 'ORG'}
{'56%': 'PERCENT', 'WOW': 'ORG', '#AZAudit #': 'MONEY'}
{'#StopTheSteal #Trump2020 #': 'MONEY', '#ForensicAudit #': 'MONEY', 'MAGA': 'ORG'}
{'@PARISDENNARD': 'ORG', 'a year ago': 'DATE'}
{'RT @mscohen19': 'PERSON', '@NJSpotlightNews': 'ORG', 'NJ': 'ORG'}
{'#LockThemUp #StopTheSteal #': 'MONEY', 'MAGA': 'ORG', '#USA https://t.co/IAfPsYM9yL': 'ORG'}
{'Trump Never': 'PERSON', 'TrumpWon': 'ORG', '#StopTheSteal #': 'MONEY', '#Trump #USA': 'MONEY'}
{'Trump Never': 'PERSON', 'TrumpWon': 'ORG', '#StopTheSteal #': 'MONEY', '#Trump #USA': 'MONEY'}
{'the Trump Magats': 'ORG'}
{}
{'@My3Alexandra @NJSpotlightNews': 'PERSON'}
{}
{'GQP': 'ORG', 'GQPTraitorsToDemocracy': 'MONEY

{}
{}
{}
{'GQP': 'ORG', 'Democrats': 'NORP', '#gop #republicans #': 'MONEY', '#stopthesteal #': 'MONEY', '#trumplost #': 'MONEY', '#': 'CARDINAL'}
{'KY AG': 'ORG', 'GOP AG': 'ORG', 'Ken Paxton': 'PERSON'}
{'KY AG': 'ORG', 'GOP AG': 'ORG', 'Ken Paxton': 'PERSON'}
{'3': 'CARDINAL', '🇸': 'PERSON'}
{'KY AG': 'ORG', 'GOP AG': 'ORG', 'Ken Paxton': 'PERSON'}
{}
{'Taliban': 'ORG'}
{}
{'KY AG': 'ORG', 'GOP AG': 'ORG', 'Ken Paxton': 'PERSON'}
{'3': 'CARDINAL', '🇸': 'PERSON'}
{'KY AG': 'ORG', 'GOP AG': 'ORG', 'Ken Paxton': 'PERSON'}
{'KY AG': 'ORG', 'GOP AG': 'ORG', 'Ken Paxton': 'PERSON'}
{'KY AG': 'ORG', 'GOP AG': 'ORG', 'Ken Paxton': 'PERSON'}
{'KY AG': 'ORG', 'GOP AG': 'ORG', 'Ken Paxton': 'PERSON'}
{'KY AG': 'ORG', 'GOP AG': 'ORG', 'Ken Paxton': 'PERSON'}
{'KY AG': 'ORG', 'GOP AG': 'ORG', 'Ken Paxton': 'PERSON', 'Paxton': 'PERSON', 'AbortionBan': 'MONEY', 'ONEV1': 'MONEY'}
{'RT @NetworksManager': 'PERSON'}
{'LauraLoomer': 'MONEY', '#': 'CARDINAL'}
{'the American League Cheaters Series': 'ORG

{}
{}
{}
{}
{}
{}
{'Jan6SelectCommittee': 'MONEY'}
{}
{}
{}
{}
{}
{}
{}
{}
{'Taliban': 'ORG'}
{'RT @ActusMondial': 'ORG', 'Jean Le Taxé #STOPtheSTEAL': 'PERSON', '@jeanletaxe': 'GPE', 'La France': 'PERSON', 'Europe': 'LOC'}
{}
{'@nickclegg': 'ORG'}
{}
{'Taliban': 'ORG'}
{'Taliban': 'ORG'}
{}
{'RT @ActusMondial': 'ORG', 'Jean Le Taxé #STOPtheSTEAL': 'PERSON', '@jeanletaxe': 'GPE', 'La France': 'PERSON', 'Europe': 'LOC'}
{}
{}
{'RT @ActusMondial': 'ORG', 'Jean Le Taxé #STOPtheSTEAL': 'PERSON', '@jeanletaxe': 'GPE', 'La France': 'PERSON', 'Europe': 'LOC'}
{'@nickclegg': 'ORG'}
{'RT @ActusMondial': 'ORG', 'Jean Le Taxé #STOPtheSTEAL': 'PERSON', '@jeanletaxe': 'GPE', 'La France': 'PERSON', 'Europe': 'LOC'}
{}
{'Ali Alexander': 'PERSON', 'Nick Fuente': 'PERSON', '#StopTheSteal BS': 'MONEY'}
{}
{'RT @ActusMondial': 'ORG', 'Jean Le Taxé #STOPtheSTEAL': 'PERSON', '@jeanletaxe': 'GPE', 'La France': 'PERSON', 'Europe': 'LOC'}
{'RT @ActusMondial': 'ORG', 'Jean Le Taxé #STOPtheSTEAL': 'PERSON', '@j

Unnamed: 0,tweet_url,text,user,tweet_id,timestamp,datetime,date,hour,ymdh,retweet,retweeting,langarray,ents
0,https://twitter.com/Theon_Orbis/status/1450571...,RT @Tam_Resist: US sanctioned Deripaska for *m...,Theon_Orbis,1450571068473626629,1634678067688,2021-10-19 21:14:27.688,2021-10-19,21,2021101921,True,@Tam_Resist,en,"{'US': 'GPE', 'Deripaska': 'ORG', '2016': 'DAT..."
1,https://twitter.com/smimsitis/status/145056541...,RT @ddanpereira: The #FBI has raided the home ...,smimsitis,1450565418343559170,1634676720592,2021-10-19 20:52:00.592,2021-10-19,20,2021101920,True,@ddanpereira,en,"{'RT @ddanpereira': 'PERSON', 'FBI': 'ORG', 'O..."
2,https://twitter.com/ElkeHollings/status/145054...,RT @ddanpereira: The #FBI has raided the home ...,ElkeHollings,1450544622527156227,1634671762483,2021-10-19 19:29:22.483,2021-10-19,19,2021101919,True,@ddanpereira,en,"{'RT @ddanpereira': 'PERSON', 'FBI': 'ORG', 'O..."
3,https://twitter.com/PumpkinBlend/status/145054...,@kitsumitsu46 @ashleyloveslmg wow what happene...,PumpkinBlend,1450543853208018952,1634671579063,2021-10-19 19:26:19.063,2021-10-19,19,2021101919,False,,en,"{'@kitsumitsu46 @ashleyloveslmg': 'ORG', '#': ..."
4,https://twitter.com/nola_chwica/status/1450543...,"RT @ZhiZhuWeb: ""Republicans’ #StopTheSteal cam...",nola_chwica,1450543275937435649,1634671441431,2021-10-19 19:24:01.431,2021-10-19,19,2021101919,True,@ZhiZhuWeb,en,"{'RT @ZhiZhuWeb': 'ORG', 'Republicans': 'NORP'..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2093,https://twitter.com/golfnutallways/status/1447...,RT @gbrough10: Demand a new election #StopTheS...,golfnutallways,1447385163332214785,1633918488711,2021-10-11 02:14:48.711,2021-10-11,2,2021101102,True,@gbrough10,en,{}
2094,https://twitter.com/KW07038391/status/14473850...,RT @gbrough10: Demand a new election #StopTheS...,KW07038391,1447385078435352576,1633918468470,2021-10-11 02:14:28.470,2021-10-11,2,2021101102,True,@gbrough10,en,{}
2095,https://twitter.com/Chi2soCal2LV/status/144738...,RT @gbrough10: Demand a new election #StopTheS...,Chi2soCal2LV,1447383774459666434,1633918157578,2021-10-11 02:09:17.578,2021-10-11,2,2021101102,True,@gbrough10,en,{}
2096,https://twitter.com/Lola36405979/status/144738...,#StopTheSteal https://t.co/FfvNpjXKvA,Lola36405979,1447382501987659781,1633917854197,2021-10-11 02:04:14.197,2021-10-11,2,2021101102,False,,en,{'#StopTheSteal https://t.co/FfvNpjXKvA': 'MON...


### Sentiment

* Simple version: count up word scores, e.g. https://www.kaggle.com/hamishisham/sentiment-analysis-using-sentiwordnet
* Many other ways to do this, e.g. https://aclanthology.org/S13-2054.pdf

Below methods are quick and dirty. You'll probably wind up building your own sentiment classifiers for disinformation. 

In [19]:
testtext = 'I hate cookies'
print(TextBlob(testtext).sentiment.subjectivity)
print(TextBlob(testtext).sentiment.polarity)
dftweets['subjectivity'] = dftweets['text'].apply(get_subjectivity)
dftweets['polarity'] = dftweets['text'].apply(get_subjectivity)

analyzer = SentimentIntensityAnalyzer()
print(analyzer.polarity_scores(testtext))

0.9
-0.8
{'neg': 0.649, 'neu': 0.351, 'pos': 0.0, 'compound': -0.5719}


In [20]:
dftweets

Unnamed: 0,tweet_url,text,user,tweet_id,timestamp,datetime,date,hour,ymdh,retweet,retweeting,langarray,ents,subjectivity,polarity
0,https://twitter.com/Theon_Orbis/status/1450571...,RT @Tam_Resist: US sanctioned Deripaska for *m...,Theon_Orbis,1450571068473626629,1634678067688,2021-10-19 21:14:27.688,2021-10-19,21,2021101921,True,@Tam_Resist,en,"{'US': 'GPE', 'Deripaska': 'ORG', '2016': 'DAT...",0.000000,0.000000
1,https://twitter.com/smimsitis/status/145056541...,RT @ddanpereira: The #FBI has raided the home ...,smimsitis,1450565418343559170,1634676720592,2021-10-19 20:52:00.592,2021-10-19,20,2021101920,True,@ddanpereira,en,"{'RT @ddanpereira': 'PERSON', 'FBI': 'ORG', 'O...",0.000000,0.000000
2,https://twitter.com/ElkeHollings/status/145054...,RT @ddanpereira: The #FBI has raided the home ...,ElkeHollings,1450544622527156227,1634671762483,2021-10-19 19:29:22.483,2021-10-19,19,2021101919,True,@ddanpereira,en,"{'RT @ddanpereira': 'PERSON', 'FBI': 'ORG', 'O...",0.000000,0.000000
3,https://twitter.com/PumpkinBlend/status/145054...,@kitsumitsu46 @ashleyloveslmg wow what happene...,PumpkinBlend,1450543853208018952,1634671579063,2021-10-19 19:26:19.063,2021-10-19,19,2021101919,False,,en,"{'@kitsumitsu46 @ashleyloveslmg': 'ORG', '#': ...",1.000000,1.000000
4,https://twitter.com/nola_chwica/status/1450543...,"RT @ZhiZhuWeb: ""Republicans’ #StopTheSteal cam...",nola_chwica,1450543275937435649,1634671441431,2021-10-19 19:24:01.431,2021-10-19,19,2021101919,True,@ZhiZhuWeb,en,"{'RT @ZhiZhuWeb': 'ORG', 'Republicans': 'NORP'...",0.900000,0.900000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2093,https://twitter.com/golfnutallways/status/1447...,RT @gbrough10: Demand a new election #StopTheS...,golfnutallways,1447385163332214785,1633918488711,2021-10-11 02:14:48.711,2021-10-11,2,2021101102,True,@gbrough10,en,{},0.454545,0.454545
2094,https://twitter.com/KW07038391/status/14473850...,RT @gbrough10: Demand a new election #StopTheS...,KW07038391,1447385078435352576,1633918468470,2021-10-11 02:14:28.470,2021-10-11,2,2021101102,True,@gbrough10,en,{},0.454545,0.454545
2095,https://twitter.com/Chi2soCal2LV/status/144738...,RT @gbrough10: Demand a new election #StopTheS...,Chi2soCal2LV,1447383774459666434,1633918157578,2021-10-11 02:09:17.578,2021-10-11,2,2021101102,True,@gbrough10,en,{},0.454545,0.454545
2096,https://twitter.com/Lola36405979/status/144738...,#StopTheSteal https://t.co/FfvNpjXKvA,Lola36405979,1447382501987659781,1633917854197,2021-10-11 02:04:14.197,2021-10-11,2,2021101102,False,,en,{'#StopTheSteal https://t.co/FfvNpjXKvA': 'MON...,0.000000,0.000000


## Clean up the text

Decisions made:

* Work on lowercase text
* Remove RT
* Remove URLs
* 

## Run some algorithms over this dataset

In [21]:
# Convert input text into TF-IDF format
tf_transformer = TfidfTransformer(use_idf=False)
tf_features = tf_transformer.fit_transform(word_counts)
print('{}'.format(tf_features))

  (0, 28)	0.5163977794943222
  (0, 827)	0.2581988897471611
  (0, 959)	0.2581988897471611
  (0, 1814)	0.2581988897471611
  (0, 1960)	0.2581988897471611
  (0, 2375)	0.2581988897471611
  (0, 2657)	0.2581988897471611
  (0, 2683)	0.2581988897471611
  (0, 2684)	0.2581988897471611
  (0, 2906)	0.2581988897471611
  (0, 2983)	0.2581988897471611
  (0, 3016)	0.2581988897471611
  (1, 187)	0.24253562503633297
  (1, 208)	0.24253562503633297
  (1, 391)	0.24253562503633297
  (1, 772)	0.24253562503633297
  (1, 773)	0.24253562503633297
  (1, 827)	0.24253562503633297
  (1, 1105)	0.24253562503633297
  (1, 1429)	0.24253562503633297
  (1, 1864)	0.24253562503633297
  (1, 2030)	0.24253562503633297
  (1, 2192)	0.24253562503633297
  (1, 2226)	0.24253562503633297
  (1, 2274)	0.24253562503633297
  :	:
  (2094, 1248)	0.35355339059327373
  (2094, 1453)	0.35355339059327373
  (2094, 1734)	0.35355339059327373
  (2094, 2109)	0.35355339059327373
  (2094, 2657)	0.35355339059327373
  (2094, 2915)	0.35355339059327373
  (209

In [22]:
x = 1117
print()
dftweets[dftweets['text'].str.lower().str.contains(reversedict[x])]




Unnamed: 0,tweet_url,text,user,tweet_id,timestamp,datetime,date,hour,ymdh,retweet,retweeting,langarray,ents,subjectivity,polarity
2096,https://twitter.com/Lola36405979/status/144738...,#StopTheSteal https://t.co/FfvNpjXKvA,Lola36405979,1447382501987659781,1633917854197,2021-10-11 02:04:14.197,2021-10-11,2,2021101102,False,,en,{'#StopTheSteal https://t.co/FfvNpjXKvA': 'MON...,0.0,0.0


### Clustering

In [23]:
# You can extend the stop words list you're using
stop_words = get_stop_words('en')
stop_words += ['trump', 'stopthesteal', 'rt']

In [24]:
# code from https://blog.mlreview.com/topic-modeling-with-scikit-learn-e80d33668730

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:{}".format(topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_features = 1000
dataset = dftweets['text']

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words=stop_words)
tfidf = tfidf_vectorizer.fit_transform(dataset)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words=stop_words)
tf = tf_vectorizer.fit_transform(dataset)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 7
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tfidf)

no_top_words = 10
display_topics(lda, tf_feature_names, no_top_words)



Topic %d:0
maga big lie truth rally behind ego created loss kag
Topic %d:1
audits kimberly asking guilfoyle full forensic presidential national stoptheste 50
Topic %d:2
kx8kux0atk new demand https co gbrough10 election will ag gop
Topic %d:3
re la pro life tam_resist next going employees providing health
Topic %d:4
2024 michigan steal groundwork laying rollingstone https ou9isby9w0 co amp
Topic %d:5
tfg wait taliban normalizes negotiating carry exclusively seditionists negotiated msm
Topic %d:6
minecraft still jan6th live founder participants direct account letsgogarland locktrumpup
