#Data Loading and Cleaning

## Downloads

In [None]:
# imports
import os
import itertools
import collections
from collections import Counter
import re
import numpy as np
import config

import tweepy as tw
import pandas as pd
import string

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

import nltk 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import LancasterStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# download stopwords, punkt and wordnet packages
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# API key
api_key = config.api_key
secret = config.secret

In [None]:
auth = tw.AppAuthHandler(api_key, secret)
api = tw.API(auth, wait_on_rate_limit=True)

## Define the Key Search Terms 

In [None]:
# define hashtag search terms
# NB: they are case sensitive
search_coronavirus = '#coronavirus'
search_covid = '#covid'
search_covid19 = '#covid19'
search_virus = '#virus'
search_vaccine = '#vaccine'
search_pandemic = '#pandemic'

# # Other Options to search
# search_vaccine = '#vaccine'
# search_vaccinated = '#vaccinated'
# search_vaccination = '#vaccination'
# search_jab = '#jab'
# search_pfizer = '#pfizer'
# search_astrazeneca = '#astrazeneca'
# search_moderna = '#moderna'

## Functions

In [None]:
# REGEX function
def regex_clean(txt, regex):
    """Replace any text matching the regex

    Parameters
    ----------
    txt : string
        A text string that you want to parse and remove matches
    regex : string 
        A text string of the regex pattern you want to match

    Returns
    -------
    The same txt string with the matches removes
    """

    return " ".join(re.sub(regex, "", txt).split())

In [None]:
## lemmatize tweet
lemmatizer = WordNetLemmatizer()
stpwrd = nltk.corpus.stopwords.words('english')
stpwrd.extend(string.punctuation)

def lemmatize_data(n):
    """Cleans up a review!
    
    Parameters
    ------------
    review: string
      A text string that you want to parse and remove matches

    Returns
    ----------
    Cleaned up string - tokenise and stemmed!
    """

    ## Pre Token Cleaning - Stuff that applies to a string.
    
    n = n.lower() 
    n = regex_clean(n, r'\s\d+\s')

    ## Go-Go-Token-Rangers
    n = word_tokenize(n)

    ## Post Token Cleaning - Stuff that applies to a list 
        
    n = [lemmatizer.lemmatize(word) for word in n]
    n = [lemmatizer.lemmatize(word) for word in n if word not in stpwrd]
    
    return n

In [None]:
# stem tweet
p_stemmer = PorterStemmer()

def stem_data(n):
    """Cleans up a review!
    
    Parameters
    ------------
    review: string
      A text string that you want to parse and remove matches

    Returns
    ----------
    Cleaned up string - tokenise and stemmed!
    """

    ## Pre Token Cleaning - Stuff that applies to a string.
    
    n = n.lower() 
    n = regex_clean(n, r'\s\d+\s')

    ## Go-Go-Token-Rangers
    n = word_tokenize(n)

    ## Post Token Cleaning - Stuff that applies to a list 
        
    n = [p_stemmer.stem(word) for word in n]
    n = [p_stemmer.stem(word) for word in n if word not in stpwrd]
    
    return n

In [None]:
# lancaster stem tweet
def lanc_stemmed(df):
    """Cleans up answers!
    
    Parameters
    ------------
    review: string
      A text string that you want to parse and remove matches

    Returns
    ----------
    Cleaned up string - split and stemmed!
    """

    df['split'] = df['cleaned'].apply(lambda x: str(x).lower())
    
    df['split'] = df.split
    df['split'].replace('^[0-9]+$','',inplace=True)
    
    for punc in string.punctuation:
        df['split'] = df.split.str.replace(punc,'')
        
        
    df['split'] = df['split'].apply(lambda x: [item for item in str(x).split() if item not in stopwords.words('english')])
    df['split'] = df['split'].apply(lambda x: [LancasterStemmer().stem(a) for a in x])

    return df

In [None]:
# remove emojis from the text
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F" # emoticons
                           u"\U0001F300-\U0001F5FF" # symbols & pictographs
                           u"\U0001F680-\U0001F6FF" # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF" # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

# Dataframes Cleaned and Saved

## Coronavirus Dataframe Cleaned and Saved

In [None]:
## call a cursor item in the twitter library
# use api search
# this item generator collects 1000 items and if we loop through it we can collect all 1000 of our tweets in a dataframe
######### SORT DATES
tweets_coronavirus = tw.Cursor(api.search,
                   q = search_coronavirus,
                   lang ='en',
                   fromDate = '202002010000',
                   toDate = '202118060000',
                   tweet_mode = 'extended',
                   min_retweets = 50,
                   result_type = 'mixed').items(1000)

tweets_coronavirus

<tweepy.cursor.ItemIterator at 0x7fc1868905d0>

In [None]:
# create dataframe
tweet_coronavirus = pd.DataFrame(data=[[tweet.full_text, tweet.user.screen_name, tweet.user.location, tweet.created_at, tweet.id, tweet.source, tweet.favorite_count, tweet.retweet_count, len(tweet.full_text)]for tweet in tweets_coronavirus])

In [None]:
# rename dataframe columns
tweet_coronavirus.columns = ['tweet', 'user', 'location', 'date', 'id', 'source', 'favourites', 'retweets', 'tweet_length']

In [None]:
# find all hashtags
tweet_coronavirus['hashtag'] = tweet_coronavirus['tweet'].apply(lambda x: re.findall(r"#(\w+)", x))

In [None]:
# clean tweet
tweet_coronavirus['cleaned'] = tweet_coronavirus['tweet'].apply(str.lower)

regex_to_clean = ['(@.+?)\s',
                  '\s\d+\s']

for reg in regex_to_clean:
  tweet_coronavirus['cleaned'] = tweet_coronavirus['cleaned'].apply(regex_clean, regex=reg)
  tweet_coronavirus['cleaned'] = tweet_coronavirus['cleaned'].apply(remove_emoji)

In [None]:
# tokenize tweet
tweet_coronavirus['tokens'] = tweet_coronavirus['cleaned'].apply(word_tokenize)

In [None]:
# lemmatize tweet
tweet_coronavirus['lemmatized'] = tweet_coronavirus['cleaned'].apply(lemmatize_data)

In [None]:
# stem tweet
tweet_coronavirus['stemmed'] = tweet_coronavirus['cleaned'].apply(lemmatize_data)

In [None]:
# lancaster stem tweet
tweet_coronavirus = lanc_stemmed(tweet_coronavirus)

In [None]:
#explore datatypes
tweet_coronavirus.dtypes

tweet                   object
user                    object
location                object
date            datetime64[ns]
id                       int64
source                  object
favourites               int64
retweets                 int64
tweet_length             int64
hashtag                 object
cleaned                 object
tokens                  object
lemmatized              object
stemmed                 object
split                   object
dtype: object

In [None]:
tweet_coronavirus

Unnamed: 0,tweet,user,location,date,id,source,favourites,retweets,tweet_length,hashtag,cleaned,tokens,lemmatized,stemmed,split
0,New daily cases &amp; deaths of #coronavirus i...,Omar_Gaza,Palestine,2021-06-22 16:54:21,1407381456305938443,Twitter for Android,709,398,101,"[coronavirus, Gaza]",new daily cases &amp; deaths of #coronavirus i...,"[new, daily, cases, &, amp, ;, deaths, of, #, ...","[new, daily, case, amp, death, coronavirus, ga...","[new, daily, case, amp, death, coronavirus, ga...","[new, dai, cas, amp, death, coronavir, gaz, ev..."
1,#coronavirus I’m feeling emotionally very fr...,JohnBoweActor,,2021-06-23 18:19:07,1407765175168647176,Twitter for iPhone,598,180,263,[coronavirus],#coronavirus i’m feeling emotionally very frai...,"[#, coronavirus, i, ’, m, feeling, emotionally...","[coronavirus, ’, feeling, emotionally, frail, ...","[coronavirus, ’, feeling, emotionally, frail, ...","[coronavir, i’m, feel, emot, frail, breakthrou..."
2,Communist China produced a very inferior vacci...,SenJohnKennedy,Louisiana,2021-06-23 13:51:13,1407697757838663684,Twitter Media Studio,444,238,215,[coronavirus],communist china produced a very inferior vacci...,"[communist, china, produced, a, very, inferior...","[communist, china, produced, inferior, vaccine...","[communist, china, produced, inferior, vaccine...","[commun, chin, produc, infery, vaccin, distrib..."
3,RT @CoronaRecovery: 303044 people recovered fr...,viralvideovlogs,കേരളം,2021-06-23 21:27:22,1407812550369198082,Naattuvartha,0,1,138,[],rtpeople recovered from corona today. total co...,"[rtpeople, recovered, from, corona, today, ., ...","[rtpeople, recovered, corona, today, total, co...","[rtpeople, recovered, corona, today, total, co...","[rtpeople, recov, coron, today, tot, coron, vi..."
4,303044 people recovered from Corona today. Tot...,CoronaRecovery,,2021-06-23 21:27:20,1407812540025946117,corona-recoveries,0,1,151,"[COVID19, coronavirus, StaySafe]",303044 people recovered from corona today. tot...,"[303044, people, recovered, from, corona, toda...","[303044, people, recovered, corona, today, tot...","[303044, people, recovered, corona, today, tot...","[303044, peopl, recov, coron, today, tot, coro..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Motto of the story: Don't trust #China #corona...,MsLove2Blog,Florida,2021-06-22 15:02:46,1407353372634763264,Twitter Web App,1,0,75,"[China, coronavirus]",motto of the story: don't trust #china #corona...,"[motto, of, the, story, :, do, n't, trust, #, ...","[motto, story, n't, trust, china, coronavirus,...","[motto, story, n't, trust, china, coronavirus,...","[motto, story, dont, trust, chin, coronavir, h..."
996,🚨 27 people died of #COVID19 in the #UK in the...,NicBoothby,#LONDON,2021-06-22 15:02:26,1407353289772089365,Twitter for Android,1,1,301,"[COVID19, UK, Coronavirus, Covid19UK]",people died of #covid19 in the #uk in the past...,"[people, died, of, #, covid19, in, the, #, uk,...","[people, died, covid19, uk, pasthrs, bringing,...","[people, died, covid19, uk, pasthrs, bringing,...","[peopl, died, covid19, uk, pasthr, bring, tot,..."
997,Get the #vaccine dumbass;\n\n A #coronavirus o...,RJMrim,,2021-06-22 15:02:14,1407353238299553792,Hootsuite Inc.,0,0,182,"[vaccine, coronavirus, health]",get the #vaccine dumbass; a #coronavirus outbr...,"[get, the, #, vaccine, dumbass, ;, a, #, coron...","[get, vaccine, dumbass, coronavirus, outbreak,...","[get, vaccine, dumbass, coronavirus, outbreak,...","[get, vaccin, dumbass, coronavir, outbreak, hi..."
998,RT @IndiaToday: Experts have said that the #De...,Mr_madhavendra,"Jaipur, India",2021-06-22 15:02:11,1407353228660932613,Twitter for Android,0,162,140,"[DeltaPlusVariant, DeltaVariant, coronavirus, c]",rt experts have said that the #deltaplusvarian...,"[rt, experts, have, said, that, the, #, deltap...","[rt, expert, said, deltaplusvariant, could, ev...","[rt, expert, said, deltaplusvariant, could, ev...","[rt, expert, said, deltaplusv, could, evad, va..."


In [None]:
#save dataframe to csv
from google.colab import files
tweet_coronavirus.to_csv('tweet_coronavirus.csv')
files.download("tweet_coronavirus.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Covid Dataframe Cleaned and Saved

In [None]:
tweets_covid = tw.Cursor(api.search,
                   q = search_covid,
                   lang ='en',
                   fromDate = '2020/02/01',
                   toDate = '2020/12/31',
                   tweet_mode = 'extended',
                   min_retweets = 50,
                   result_type = 'mixed').items(1000)

tweets_covid

<tweepy.cursor.ItemIterator at 0x7fc180a0a790>

In [None]:
tweet_covid = pd.DataFrame(data=[[tweet.full_text, tweet.user.screen_name, tweet.user.location, tweet.created_at, tweet.id, tweet.source, tweet.favorite_count, tweet.retweet_count, len(tweet.full_text)]for tweet in tweets_covid])

In [None]:
tweet_covid.columns = ['tweet', 'user', 'location', 'date', 'id', 'source', 'favourites', 'retweets', 'tweet_length']

In [None]:
def tweet_cleaning(df):
    df['hashtag'] = df['tweet'].apply(lambda x: re.findall(r"#(\w+)", x))
    df['cleaned'] = df['tweet'].apply(str.lower)
  
    regex_to_clean = ['(@.+?)\s',
                      '\s\d+\s']
    for reg in regex_to_clean:
        df['cleaned'] = df['cleaned'].apply(regex_clean, regex=reg)
        df['cleaned'] = df['cleaned'].apply(remove_emoji)
    
    df['tokens'] = df['cleaned'].apply(word_tokenize)
    df['lemmatized'] = df['cleaned'].apply(lemmatize_data)
    df['stemmed'] = df['cleaned'].apply(lemmatize_data)
    df = lanc_stemmed(df)

    return df

In [None]:
tweet_covid = tweet_cleaning(tweet_covid)
tweet_covid

Unnamed: 0,tweet,user,location,date,id,source,favourites,retweets,tweet_length,hashtag,cleaned,tokens,lemmatized,stemmed,split
0,I’m re-reading Jacques Ellul’s classic “Propag...,21WIRE,USA,2021-06-22 17:15:13,1407386706358280193,Twitter for iPhone,615,295,277,"[Covid, lockdowns]",i’m re-reading jacques ellul’s classic “propag...,"[i, ’, m, re-reading, jacques, ellul, ’, s, cl...","[’, re-reading, jacques, ellul, ’, classic, “,...","[’, re-reading, jacques, ellul, ’, classic, “,...","[i’m, reread, jacqu, ellul’s, class, “propagan..."
1,Who rewrote their #Covid vaccine recommendatio...,AlexBerenson,New York,2021-06-22 21:57:32,1407457752725823488,Twitter for iPhone,1269,385,302,[Covid],who rewrote their #covid vaccine recommendatio...,"[who, rewrote, their, #, covid, vaccine, recom...","[rewrote, covid, vaccine, recommendation, gett...","[rewrote, covid, vaccine, recommendation, gett...","[rewrot, covid, vaccin, recommend, get, caught..."
2,70% of @KingCountyWA residents 12+ have now co...,KCPubHealth,"Seattle & King County, WA",2021-06-22 19:43:36,1407424048833859584,Twitter Web App,650,171,298,"[COVID, vaccination]",70% of residents 12+ have now completed their ...,"[70, %, of, residents, 12+, have, now, complet...","[70, resident, 12+, completed, covid, vaccinat...","[70, resident, 12+, completed, covid, vaccinat...","[70, resid, 12, complet, covid, vaccin, series..."
3,RT @HackneyCarers: ➡️ Have you had your first ...,okwithmydecay,London,2021-06-23 21:28:01,1407812713888223232,Twitter for Android,0,5,140,[],rt have you had your first dose of astrazeneca...,"[rt, have, you, had, your, first, dose, of, as...","[rt, first, dose, astrazeneca, covid-19, vacci...","[rt, first, dose, astrazeneca, covid-19, vacci...","[rt, first, dos, astrazenec, covid19, vaccin, ..."
4,RT @pravenovice: Totally safe bro #covid #vacc...,TinkaraNoc1,Slovenija,2021-06-23 21:27:54,1407812685299920906,Twitter for Android,0,1,73,"[covid, vaccine]",rt totally safe bro #covid #vaccine https://t....,"[rt, totally, safe, bro, #, covid, #, vaccine,...","[rt, totally, safe, bro, covid, vaccine, http,...","[rt, totally, safe, bro, covid, vaccine, http,...","[rt, tot, saf, bro, covid, vaccin, httpstconby..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,RT @DavidHarrisAJC: When I wonder how long we’...,heyman_von,,2021-06-22 16:09:43,1407370223691304962,Twitter for iPad,0,13447,139,[COVID],rt when i wonder how long we’ll be in #covid l...,"[rt, when, i, wonder, how, long, we, ’, ll, be...","[rt, wonder, long, ’, covid, lockdown, think, ...","[rt, wonder, long, ’, covid, lockdown, think, ...","[rt, wond, long, we’ll, covid, lockdown, think..."
996,RT @AlexBerenson: THREAD\n\n19-year-old Simone...,re_dewine,USA,2021-06-22 16:09:37,1407370198747865096,Twitter for iPhone,0,5572,140,[Covid],rt thread 19-year-old simone scott was excited...,"[rt, thread, 19-year-old, simone, scott, was, ...","[rt, thread, 19-year-old, simone, scott, wa, e...","[rt, thread, 19-year-old, simone, scott, wa, e...","[rt, thread, 19yearold, simon, scot, excit, ge..."
997,RT @chiefsforchange: CFC member @SchwinnTeach ...,GaleMorrisonEd,"Philadelphia, PA",2021-06-22 16:09:32,1407370174995566597,Twitter Web App,0,8,140,[Covid],rt cfc member says #covid recovery work is gro...,"[rt, cfc, member, says, #, covid, recovery, wo...","[rt, cfc, member, say, covid, recovery, work, ...","[rt, cfc, member, say, covid, recovery, work, ...","[rt, cfc, memb, say, covid, recovery, work, gr..."
998,RT @_lokeshsharma: #Rajasthan #CoronaUpdate\n\...,premsinghgaur11,अजमेर राजस्थान,2021-06-22 16:09:31,1407370173800017923,Twitter for Android,0,255,139,"[Rajasthan, CoronaUpdate, COVID, Jaipur, Recov...",rt #rajasthan #coronaupdate#covid cases report...,"[rt, #, rajasthan, #, coronaupdate, #, covid, ...","[rt, rajasthan, coronaupdate, covid, case, rep...","[rt, rajasthan, coronaupdate, covid, case, rep...","[rt, rajasth, coronaupdatecovid, cas, report, ..."


In [None]:
# save to new csv
from google.colab import files
tweet_covid.to_csv('tweet_covid.csv')
files.download("tweet_covid.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Covid19 Dataframe Cleaned and Saved

In [None]:
tweets_covid19 = tw.Cursor(api.search,
                   q = search_covid19,
                   lang ='en',
                   fromDate = '2020/02/01',
                   toDate = '2020/12/31',
                   tweet_mode = 'extended',
                   min_retweets = 50,
                   result_type = 'mixed').items(1000)

tweets_covid19

<tweepy.cursor.ItemIterator at 0x7fc17f1033d0>

In [None]:
tweet_covid19 = pd.DataFrame(data=[[tweet.full_text, tweet.user.screen_name, tweet.user.location, tweet.created_at, tweet.id, tweet.source, tweet.favorite_count, tweet.retweet_count, len(tweet.full_text)]for tweet in tweets_covid19])

In [None]:
tweet_covid19.columns = ['tweet', 'user', 'location', 'date', 'id', 'source', 'favourites', 'retweets', 'tweet_length']

In [None]:
tweet_covid19 = tweet_cleaning(tweet_covid19)
tweet_covid19

Unnamed: 0,tweet,user,location,date,id,source,favourites,retweets,tweet_length,hashtag,cleaned,tokens,lemmatized,stemmed,split
0,Terkini. 23 Jun. Kes baharu hari ini 5️⃣2️⃣4️⃣...,KKMPutrajaya,Putrajaya,2021-06-23 06:09:03,1407581446626177025,Twitter for iPhone,5468,5004,61,[COVID19],terkini.jun. kes baharu hari ini 5⃣2⃣4⃣4⃣ #cov...,"[terkini.jun, ., kes, baharu, hari, ini, 5⃣2⃣4...","[terkini.jun, kes, baharu, hari, ini, 5⃣2⃣4⃣4⃣...","[terkini.jun, kes, baharu, hari, ini, 5⃣2⃣4⃣4⃣...","[terkinijun, kes, baharu, har, in, 5⃣2⃣4⃣4⃣, c..."
1,The idea behind our White Paper report on #COV...,RahulGandhi,"12, Tughlak Lane, New Delhi",2021-06-22 06:33:56,1407225321200435201,Twitter Web App,33747,8692,229,[COVID19],the idea behind our white paper report on #cov...,"[the, idea, behind, our, white, paper, report,...","[idea, behind, white, paper, report, covid19, ...","[idea, behind, white, paper, report, covid19, ...","[ide, behind, whit, pap, report, covid19, prov..."
2,Terkini. 22 Jun. Kes baharu hari ini 4️⃣7️⃣4️⃣...,KKMPutrajaya,Putrajaya,2021-06-22 06:24:50,1407223030418350083,Twitter for iPhone,6950,5573,75,"[COVID19, MenangBersama]",terkini.jun. kes baharu hari ini 4⃣7⃣4⃣3⃣ #cov...,"[terkini.jun, ., kes, baharu, hari, ini, 4⃣7⃣4...","[terkini.jun, kes, baharu, hari, ini, 4⃣7⃣4⃣3⃣...","[terkini.jun, kes, baharu, hari, ini, 4⃣7⃣4⃣3⃣...","[terkinijun, kes, baharu, har, in, 4⃣7⃣4⃣3⃣, c..."
3,"RT @kawataru_j: Tokyo reported 619 / 6,773 (Av...",the_south_side,Japan,2021-06-23 21:28:31,1407812840765804546,Twitter Web App,0,3,139,[COVID19],"rt tokyo reported/ 6,773 (average number of pc...","[rt, tokyo, reported/, 6,773, (, average, numb...","[rt, tokyo, reported/, 6,773, average, number,...","[rt, tokyo, reported/, 6,773, average, number,...","[rt, tokyo, report, 6773, av, numb, pcr, test,..."
4,RT @KrishnanAmrish: I have been waiting for a ...,GreyKaulotu,Fiji,2021-06-23 21:28:31,1407812839209766917,Twitter for Android,0,8,140,[COVID19],rt i have been waiting for a journalist to ask...,"[rt, i, have, been, waiting, for, a, journalis...","[rt, waiting, journalist, ask, type, question,...","[rt, waiting, journalist, ask, type, question,...","[rt, wait, journ, ask, typ, quest, covid19, fi..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Reports suggest that the Republic of #Congo ha...,DevReimagined,Beijing,2021-06-22 06:13:03,1407220065745858566,Twitter Web App,3,5,273,"[Congo, debt, China, COVID19, africaunconstrai...",reports suggest that the republic of #congo ha...,"[reports, suggest, that, the, republic, of, #,...","[report, suggest, republic, congo, ha, success...","[report, suggest, republic, congo, ha, success...","[report, suggest, republ, congo, success, requ..."
996,RT @WorldBank: Why is vaccination critical for...,pmrnabawan,"Nabawan, Sabah",2021-06-22 06:13:02,1407220064219123715,Twitter Web App,0,24,140,[],rt why is vaccination critical for countries' ...,"[rt, why, is, vaccination, critical, for, coun...","[rt, vaccination, critical, country, economic,...","[rt, vaccination, critical, country, economic,...","[rt, vaccin, crit, country, econom, recovery, ..."
997,RT @COVIDNewsByMIB: #IndiaFightsCorona:\n\n📍𝑴𝒐...,ROB_Patna,"Patna, India",2021-06-22 06:13:02,1407220063728504832,Twitter for Android,0,60,140,[IndiaFightsCorona],"rt #indiafightscorona: (86,16,373) . together ...","[rt, #, indiafightscorona, :, (, 86,16,373, ),...","[rt, indiafightscorona, 86,16,373, together, c...","[rt, indiafightscorona, 86,16,373, together, c...","[rt, indiafightscoron, 8616373, togeth, can…]"
998,RT @earth_yug: Scientists have found that 414 ...,SustainTrends,,2021-06-22 06:13:02,1407220062411362307,sustaintrends,0,3,140,[],rt scientists have found thatmillion plastic w...,"[rt, scientists, have, found, thatmillion, pla...","[rt, scientist, found, thatmillion, plastic, w...","[rt, scientist, found, thatmillion, plastic, w...","[rt, sci, found, thatmil, plast, wast, item, w..."


In [None]:
# save to new csv
from google.colab import files
tweet_covid19.to_csv('tweet_covid19.csv')
files.download("tweet_covid19.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Virus Dataframe Cleaned and Saved

In [None]:
tweets_virus = tw.Cursor(api.search,
                   q = search_virus,
                   lang ='en',
                   fromDate = '2020/02/01',
                   toDate = '2020/12/31',
                   tweet_mode = 'extended',
                   min_retweets = 50,
                   result_type = 'mixed').items(1000)

tweets_virus

<tweepy.cursor.ItemIterator at 0x7fc17d9a1850>

In [None]:
tweet_virus = pd.DataFrame(data=[[tweet.full_text, tweet.user.screen_name, tweet.user.location, tweet.created_at, tweet.id, tweet.source, tweet.favorite_count, tweet.retweet_count, len(tweet.full_text)]for tweet in tweets_virus])

In [None]:
tweet_virus.columns = ['tweet', 'user', 'location', 'date', 'id', 'source', 'favourites', 'retweets', 'tweet_length']

In [None]:
tweet_virus = tweet_cleaning(tweet_virus)
tweet_virus

Unnamed: 0,tweet,user,location,date,id,source,favourites,retweets,tweet_length,hashtag,cleaned,tokens,lemmatized,stemmed,split
0,🦠 The #COVID19 pandemic continues to worry pub...,France24_en,"Paris, France",2021-06-23 11:00:01,1407654670806315013,TweetDeck,22,15,299,"[COVID19, WorldHealthOrganization, virus, Euro...",🦠 the #covid19 pandemic continues to worry pub...,"[🦠, the, #, covid19, pandemic, continues, to, ...","[🦠, covid19, pandemic, continues, worry, publi...","[🦠, covid19, pandemic, continues, worry, publi...","[🦠, covid19, pandem, continu, worry, publ, hea..."
1,#Mosquitoes infected with the West Nile Fever ...,Jerusalem_Post,Israel,2021-06-23 10:27:18,1407646438729859074,Twitter Web App,12,4,114,"[Mosquitoes, virus, Israel]",#mosquitoes infected with the west nile fever ...,"[#, mosquitoes, infected, with, the, west, nil...","[mosquito, infected, west, nile, fever, virus,...","[mosquito, infected, west, nile, fever, virus,...","[mosquito, infect, west, nil, fev, vir, found,..."
2,RT @DrSusanNasif: #vQuiz\nNAME THE #VIRUS🤓\n \...,DrSusanNasif,"Basilicata, 🇮🇹",2021-06-23 21:28:35,1407812855789932548,Twitter for Android,0,2,140,"[vQuiz, VIRUS]",rt #vquiz name the #virus🤓 -it establishes a l...,"[rt, #, vquiz, name, the, #, virus🤓, -it, esta...","[rt, vquiz, name, virus🤓, -it, establishes, li...","[rt, vquiz, name, virus🤓, -it, establishes, li...","[rt, vqu, nam, virus🤓, est, lifelong, lat, asy..."
3,Drs. Tom &amp; Andy EXPOSE the PSEUDOSCIENCE o...,Regenetics,,2021-06-23 21:24:22,1407811795717341188,Twitter Web App,0,0,290,"[VIRUS, MAGNETISM, NANOTECH, SPIKEPROTEIN, ger...",drs. tom &amp; andy expose the pseudoscience o...,"[drs, ., tom, &, amp, ;, andy, expose, the, ps...","[drs, tom, amp, andy, expose, pseudoscience, v...","[drs, tom, amp, andy, expose, pseudoscience, v...","[drs, tom, amp, andy, expos, pseudoscy, vir, m..."
4,"RT @TerryTyler4: July 26th, 2024: the day the ...",jasonwrite,Saturn,2021-06-23 21:24:01,1407811705250455556,Twitter for Android,0,8,140,[PostApocalyptic],"rt july 26th, 2024: the day the world changed....","[rt, july, 26th, ,, 2024, :, the, day, the, wo...","[rt, july, 26th, 2024, day, world, changed, fo...","[rt, july, 26th, 2024, day, world, changed, fo...","[rt, july, 26th, 2024, day, world, chang, fore..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Played #Virus 6 times on 2021-06-20 https://...,G_Petiso,Bahia pirata de Las Rozas,2021-06-21 06:37:48,1406863906556059649,BoardGameGeek,2,0,70,"[Virus, bggplay]",played #virustimes on 2021-06-20 https://t.co/...,"[played, #, virustimes, on, 2021-06-20, https,...","[played, virustimes, 2021-06-20, http, //t.co/...","[played, virustimes, 2021-06-20, http, //t.co/...","[play, virustim, 20210620, httpstcoxfl35tqyl3,..."
996,RT @PanguF: Section II \n（2/5）attack between t...,Sally35697970,,2021-06-21 06:37:21,1406863796073762816,Twitter for iPhone,0,52,140,"[plots, Taiwan]",rt section ii 2/5attack between the end of thi...,"[rt, section, ii, 2/5attack, between, the, end...","[rt, section, ii, 2/5attack, end, summer, begi...","[rt, section, ii, 2/5attack, end, summer, begi...","[rt, sect, ii, 25attack, end, sum, begin, fal,..."
997,UK Scientists Warn of ‘Miserable Winter’ Due t...,latestly,"Mumbai, India",2021-06-21 06:34:48,1406863151686066177,TweetDeck,4,3,149,"[UK, scientist, Experts, Virus, Lockdown, COVI...",uk scientists warn of ‘miserable winter’ due t...,"[uk, scientists, warn, of, ‘, miserable, winte...","[uk, scientist, warn, ‘, miserable, winter, ’,...","[uk, scientist, warn, ‘, miserable, winter, ’,...","[uk, sci, warn, ‘miserable, winter’, due, new,..."
998,RT @sunitanar: In the deadly race between the ...,Dr_Aqsa_Shaikh,"Delhi, India",2021-06-21 06:28:12,1406861492633313281,Twitter Web App,0,6,140,"[virus, vaccine]","rt in the deadly race between the #virus, its ...","[rt, in, the, deadly, race, between, the, #, v...","[rt, deadly, race, virus, variant, vaccine, pr...","[rt, deadly, race, virus, variant, vaccine, pr...","[rt, dead, rac, vir, vary, vaccin, pric, deter..."


In [None]:
# save to new csv
from google.colab import files
tweet_virus.to_csv('tweet_virus.csv')
files.download("tweet_virus.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Vaccine Dataframe Cleaned and Saved

In [None]:
tweets_vaccine = tw.Cursor(api.search,
                   q = search_vaccine,
                   lang ='en',
                   fromDate = '2020/02/01',
                   toDate = '2020/12/31',
                   tweet_mode = 'extended',
                   min_retweets = 50,
                   result_type = 'mixed').items(1000)

tweets_vaccine

<tweepy.cursor.ItemIterator at 0x7fc17c1e5790>

In [None]:
tweet_vaccine = pd.DataFrame(data=[[tweet.full_text, tweet.user.screen_name, tweet.user.location, tweet.created_at, tweet.id, tweet.source, tweet.favorite_count, tweet.retweet_count, len(tweet.full_text)]for tweet in tweets_vaccine])

In [None]:
tweet_vaccine.columns = ['tweet', 'user', 'location', 'date', 'id', 'source', 'favourites', 'retweets', 'tweet_length']

In [None]:
tweet_vaccine = tweet_cleaning(tweet_vaccine)
tweet_vaccine

Unnamed: 0,tweet,user,location,date,id,source,favourites,retweets,tweet_length,hashtag,cleaned,tokens,lemmatized,stemmed,split
0,#Twitter locked the account of conservative-le...,EpochTimes,"New York, USA",2021-06-22 20:40:01,1407438246221684746,Buffer,182,117,263,"[Twitter, Vaccine, COVID19]",#twitter locked the account of conservative-le...,"[#, twitter, locked, the, account, of, conserv...","[twitter, locked, account, conservative-leanin...","[twitter, locked, account, conservative-leanin...","[twit, lock, account, conservativel, new, webs..."
1,Another big #vaccine day. Over 5.2 Million dos...,samirsaran,New Delhi,2021-06-22 14:37:44,1407347074073600006,Twitter for iPhone,606,74,147,"[vaccine, Vaccinate, MaskUp, COVID19, India]",another big #vaccine day. over 5.2 million dos...,"[another, big, #, vaccine, day, ., over, 5.2, ...","[another, big, vaccine, day, 5.2, million, do,...","[another, big, vaccine, day, 5.2, million, do,...","[anoth, big, vaccin, day, 52, mil, dos, atpm, ..."
2,These are the only #vaccine numbers we should ...,menakadoshi,,2021-06-22 11:32:46,1407300527554105350,Twitter for Android,705,114,262,[vaccine],these are the only #vaccine numbers we should ...,"[these, are, the, only, #, vaccine, numbers, w...","[vaccine, number, focused, vaccination, start,...","[vaccine, number, focused, vaccination, start,...","[vaccin, numb, focus, vaccin, start, dat, jant..."
3,#VladimirPutin may come in from the cold.\n\nA...,AgentOrangeDDR,DDR - NYC - DC (ja & да),2021-06-23 21:29:09,1407812999742726144,Twitter for Android,0,0,129,"[VladimirPutin, COVID19, vaccine]",#vladimirputin may come in from the cold. afri...,"[#, vladimirputin, may, come, in, from, the, c...","[vladimirputin, may, come, cold, africa, face,...","[vladimirputin, may, come, cold, africa, face,...","[vladimirputin, may, com, cold, afric, fac, th..."
4,RT @PHE_uk: The #COVID19 Dashboard has been up...,AceHealthNews,United Kingdom,2021-06-23 21:28:49,1407812913818222596,Twitter for iPad,0,47,140,[COVID19],rt the #covid19 dashboard has been updated: ht...,"[rt, the, #, covid19, dashboard, has, been, up...","[rt, covid19, dashboard, ha, updated, http, //...","[rt, covid19, dashboard, ha, updated, http, //...","[rt, covid19, dashboard, upd, httpstcoxhspoytg..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,RT @ArmonaiteA: #Lithuania will share 20K Astr...,LichardYang,台灣,2021-06-22 09:47:42,1407274086347153408,Twitter Web App,0,598,140,"[Lithuania, vaccine, Taiwan]",rt #lithuania will share 20k astrazeneca #vacc...,"[rt, #, lithuania, will, share, 20k, astrazene...","[rt, lithuania, share, 20k, astrazeneca, vacci...","[rt, lithuania, share, 20k, astrazeneca, vacci...","[rt, lithuan, shar, 20k, astrazenec, vaccin, d..."
996,Christian living today is absorbed from the #v...,syntagm,Stellenbosch,2021-06-22 09:47:36,1407274060569010180,Twitter for Android,1,0,268,"[virus, vaccine, futureGrace, coronavirus, fai...",christian living today is absorbed from the #v...,"[christian, living, today, is, absorbed, from,...","[christian, living, today, absorbed, virus, va...","[christian, living, today, absorbed, virus, va...","[christian, liv, today, absorb, vir, vaccin, p..."
997,RT @21WIRE: The myth of the 'asymptomatic spre...,fialottameknuff,,2021-06-22 09:47:29,1407274029610844160,Twitter Web App,0,1335,143,[],rt the myth of the 'asymptomatic spread' &amp;...,"[rt, the, myth, of, the, 'asymptomatic, spread...","[rt, myth, 'asymptomatic, spread, amp, ``, asy...","[rt, myth, 'asymptomatic, spread, amp, ``, asy...","[rt, myth, asymptom, spread, amp, asymptom, su..."
998,RT @ArmonaiteA: #Lithuania will share 20K Astr...,winter0109,Taiwan,2021-06-22 09:47:28,1407274026108526595,Twitter for Android,0,598,140,"[Lithuania, vaccine, Taiwan]",rt #lithuania will share 20k astrazeneca #vacc...,"[rt, #, lithuania, will, share, 20k, astrazene...","[rt, lithuania, share, 20k, astrazeneca, vacci...","[rt, lithuania, share, 20k, astrazeneca, vacci...","[rt, lithuan, shar, 20k, astrazenec, vaccin, d..."


In [None]:
# save to new csv
from google.colab import files
tweet_vaccine.to_csv('tweet_vaccine.csv')
files.download("tweet_vaccine.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Pandemic Dataframe Cleaned and Saved

In [None]:
tweets_pandemic = tw.Cursor(api.search,
                   q = search_pandemic,
                   lang ='en',
                   fromDate = '2020/02/01',
                   toDate = '2020/12/31',
                   tweet_mode = 'extended',
                   min_retweets = 50,
                   result_type = 'mixed').items(1000)

tweets_pandemic

<tweepy.cursor.ItemIterator at 0x7fc17acaaad0>

In [None]:
tweet_pandemic = pd.DataFrame(data=[[tweet.full_text, tweet.user.screen_name, tweet.user.location, tweet.created_at, tweet.id, tweet.source, tweet.favorite_count, tweet.retweet_count, len(tweet.full_text)]for tweet in tweets_pandemic])

In [None]:
tweet_pandemic.columns = ['tweet', 'user', 'location', 'date', 'id', 'source', 'favourites', 'retweets', 'tweet_length']

In [None]:
tweet_pandemic = tweet_cleaning(tweet_pandemic)
tweet_pandemic

Unnamed: 0,tweet,user,location,date,id,source,favourites,retweets,tweet_length,hashtag,cleaned,tokens,lemmatized,stemmed,split
0,"On #TravelDayOfAction, I will join the industr...",HuwMerriman,Bexhill and Battle,2021-06-23 09:02:49,1407625178507624448,Twitter Web App,913,373,209,"[TravelDayOfAction, SpeakUpForTravel, pandemic]","on #traveldayofaction, i will join the industr...","[on, #, traveldayofaction, ,, i, will, join, t...","[traveldayofaction, join, industry, speakupfor...","[traveldayofaction, join, industry, speakupfor...","[traveldayofact, join, industry, speakupfortra..."
1,Govt led by Hon'ble PM Sh @narendramodi Ji is ...,drharshvardhan,Delhi,2021-06-22 12:59:37,1407322384076468229,Twitter for iPhone,392,80,287,"[pandemic, COVID19]",govt led by hon'ble pm sh ji is more than rece...,"[govt, led, by, hon'ble, pm, sh, ji, is, more,...","[govt, led, hon'ble, pm, sh, ji, receptive, co...","[govt, led, hon'ble, pm, sh, ji, receptive, co...","[govt, led, honbl, pm, sh, ji, receiv, constru..."
2,The 🇫🇷🇮🇳🇦🇺 trilateral dialogue now expands to ...,FranceinIndia,New Delhi,2021-06-23 08:57:51,1407623929032679429,Twitter Web App,742,123,305,"[G20, digital, ClimateAction, pandemic]",the trilateral dialogue now expands to g20 coo...,"[the, trilateral, dialogue, now, expands, to, ...","[trilateral, dialogue, expands, g20, coordinat...","[trilateral, dialogue, expands, g20, coordinat...","[tril, dialog, expand, g20, coordin, today, se..."
3,@crypto A First Wave-The Emergence. Pandemic R...,Lisa69620724,United States,2021-06-23 21:29:19,1407813042138660864,Twitter for iPhone,0,0,312,"[afirstwave, pandemic, lisamariemeadows]",a first wave-the emergence. pandemic romance. ...,"[a, first, wave-the, emergence, ., pandemic, r...","[first, wave-the, emergence, pandemic, romance...","[first, wave-the, emergence, pandemic, romance...","[first, waveth, emerg, pandem, rom, shelby, si..."
4,RT @ResearchSleep: A study of twins led by Was...,LuebeckPni,,2021-06-23 21:28:57,1407812948085641217,Twitter for iPhone,0,1,140,"[anxiety, depression]",rt a study of twins led by washington state un...,"[rt, a, study, of, twins, led, by, washington,...","[rt, study, twin, led, washington, state, univ...","[rt, study, twin, led, washington, state, univ...","[rt, study, twin, led, washington, stat, unive..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,"RT @frontline_india: Before the #pandemic, int...",SustainTrends,,2021-06-22 09:13:45,1407265543334240262,sustaintrends,0,2,140,[pandemic],"rt before the #pandemic, inter- and intra-coun...","[rt, before, the, #, pandemic, ,, inter-, and,...","[rt, pandemic, inter-, intra-country, inequali...","[rt, pandemic, inter-, intra-country, inequali...","[rt, pandem, int, intracountry, ineq, intol, d..."
996,"RT @frontline_india: Before the #pandemic, int...",hariom_bhai14,"New Delhi, India",2021-06-22 09:13:40,1407265519116357636,Twitter for Android,0,2,140,[pandemic],"rt before the #pandemic, inter- and intra-coun...","[rt, before, the, #, pandemic, ,, inter-, and,...","[rt, pandemic, inter-, intra-country, inequali...","[rt, pandemic, inter-, intra-country, inequali...","[rt, pandem, int, intracountry, ineq, intol, d..."
997,RT @Robyn_Grote: Shocking apparently #legal #...,Laine67492718,"Sydney, New South Wales",2021-06-22 09:13:01,1407265356553486337,Twitter for iPad,0,3,139,"[legal, age, discrimination, NDIS, polio, surv...",rt shocking apparently #legal #age #discrimina...,"[rt, shocking, apparently, #, legal, #, age, #...","[rt, shocking, apparently, legal, age, discrim...","[rt, shocking, apparently, legal, age, discrim...","[rt, shock, app, leg, ag, discrimin, ndi, also..."
998,RT @ThinkCREA: The #pandemic expedited CREA’s ...,yomegh,La la Land,2021-06-22 09:12:55,1407265332599787526,Twitter for Android,0,2,140,[pandemic],rt the #pandemic expedited crea’s ongoing tran...,"[rt, the, #, pandemic, expedited, crea, ’, s, ...","[rt, pandemic, expedited, crea, ’, ongoing, tr...","[rt, pandemic, expedited, crea, ’, ongoing, tr...","[rt, pandem, expedit, crea’s, ongo, transform,..."


In [None]:
# save to new csv
from google.colab import files
tweet_pandemic.to_csv('tweet_pandemic.csv')
files.download("tweet_pandemic.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## TO DO

To DO:
1. Load Seperate Datasets: coronavirus, covid, covid19, pandemic, virus
2. Save these Datasets to individual files
3. Clean the data: - remove stopwords, remove punctuation, make new column for hashtags, 
4. Explore all retweets: date and location
5. Identify hashtags that are indicative of conspiracy theories: make word cloud of these hashtags e.g. anti semitism, anti-chinese rhetoric, bat soup, general cover up, 5G
6. Isolate the tweets that have these hashtags and explore
7. conspiracies, hate speech and incitement to violence
8. group according to 5g, anti-chinese rhetoric, anti-semitic etc.. and have time series graph

#Hashtag EDA

## Hashtag EDA Coronvairus

In [None]:
# Create List with all hashtags in
def hashtags(df):

    hashtag_list = []

    for item in df:
        for value in item:
            hashtag_list.append(value)

    return hashtag_list

In [None]:
hashtags_list = hashtags(tweet_coronavirus.hashtag)

In [None]:
# convert all hashtags to lowercase to help filter duplicates
def lowercase(list):
    for i in range(len(list)):
        list[i] = list[i].lower()

    return list

In [None]:
hashtags_list = lowercase(hashtags_list)
hashtags_list

['coronavirus',
 'gaza',
 'coronavirus',
 'coronavirus',
 'covid19',
 'coronavirus',
 'staysafe',
 'covid19',
 'covidvaccine',
 'deltavariant',
 'vaccineswork',
 'vaccinated',
 'healthcare',
 'coronavirus',
 'coronavirus',
 'immunology',
 'breakingne',
 'coronavirus',
 'sandrasturm',
 'fu',
 'coronavirus',
 'fun',
 'love',
 'b1010',
 'dvd',
 'v',
 'coronavirus',
 'coronavirusupdate',
 'covid19',
 'coronaviruspandemic',
 'covid19',
 'sierraleone',
 'pandemic',
 'positive',
 'covid',
 'coronavirus',
 'covid19',
 'covid_19',
 'covid19',
 'coronavirus',
 'breaking',
 'covid19',
 'jeffzients',
 'coronavirus',
 'coronavirus',
 'oxygenexpress',
 'train',
 'jamnagar',
 'oxygen',
 'rahulgandhi',
 'covid19',
 'asia',
 'covid19',
 'coronavirus',
 'corona',
 'covid19nsw',
 'covidvaccination',
 'vaccine',
 'vaccineforall',
 'coronavirusupdates',
 'ravishkumar',
 'godimedia',
 'coronavirus',
 'britain',
 'brexit',
 'borishasfailedbritain',
 'borisjohnson',
 'loc',
 'massachusetts',
 'coronavirus',
 

In [None]:
hashtags_list_count = dict(zip(list(hashtags_list),[list(hashtags_list).count(i) for i in list(hashtags_list)]))
print(hashtags_list_count)

{'coronavirus': 647, 'gaza': 1, 'covid19': 354, 'staysafe': 8, 'covidvaccine': 7, 'deltavariant': 32, 'vaccineswork': 5, 'vaccinated': 9, 'healthcare': 6, 'immunology': 1, 'breakingne': 1, 'sandrasturm': 1, 'fu': 1, 'fun': 1, 'love': 4, 'b1010': 1, 'dvd': 1, 'v': 1, 'coronavirusupdate': 22, 'coronaviruspandemic': 22, 'sierraleone': 1, 'pandemic': 25, 'positive': 1, 'covid': 67, 'covid_19': 27, 'breaking': 11, 'jeffzients': 6, 'oxygenexpress': 6, 'train': 6, 'jamnagar': 6, 'oxygen': 6, 'rahulgandhi': 2, 'asia': 2, 'corona': 44, 'covid19nsw': 2, 'covidvaccination': 15, 'vaccine': 39, 'vaccineforall': 8, 'coronavirusupdates': 18, 'ravishkumar': 4, 'godimedia': 1, 'britain': 4, 'brexit': 4, 'borishasfailedbritain': 2, 'borisjohnson': 5, 'loc': 1, 'massachusetts': 2, 'amazon': 2, 'primeday': 2, 'numerator': 1, 'stockmarketnews': 1, 'shoppingevent': 1, 'adobe': 1, 'endthelockdown': 1, 'lockdownsmustend': 1, 'iran': 14, 'madhyapradesh': 4, '22jun2021': 1, 'mpfightscorona': 1, 'businesses': 2,

In [None]:
c = collections.OrderedDict(sorted(hashtags_list_count.items()))

In [None]:
c

OrderedDict([('100daysofcode', 3),
             ('100freeshares', 1),
             ('1shotnews', 1),
             ('2019ncov', 3),
             ('22jun2021', 1),
             ('77wabcradio', 1),
             ('a', 2),
             ('abc15', 1),
             ('abdala', 3),
             ('abdalavaccine', 2),
             ('acci_india', 1),
             ('ad', 1),
             ('adobe', 1),
             ('advertisng', 1),
             ('advice', 1),
             ('advisers', 1),
             ('africa', 4),
             ('afrique', 2),
             ('ageism', 1),
             ('ahmedabad', 1),
             ('ai', 1),
             ('aiadmk', 1),
             ('airborne', 1),
             ('airtransport', 1),
             ('albertbourla', 1),
             ('algeria', 1),
             ('algerie', 1),
             ('algorithmicbiologics', 1),
             ('alshorts', 1),
             ('amazon', 2),
             ('amygoodman', 1),
             ('analyse', 1),
             ('anantnag', 1),
    

In [None]:
collections.OrderedDict(sorted(hashtags_list_count.items(), key=lambda item: item[1]))

OrderedDict([('gaza', 1),
             ('immunology', 1),
             ('breakingne', 1),
             ('sandrasturm', 1),
             ('fu', 1),
             ('fun', 1),
             ('b1010', 1),
             ('dvd', 1),
             ('v', 1),
             ('sierraleone', 1),
             ('positive', 1),
             ('godimedia', 1),
             ('loc', 1),
             ('numerator', 1),
             ('stockmarketnews', 1),
             ('shoppingevent', 1),
             ('adobe', 1),
             ('endthelockdown', 1),
             ('lockdownsmustend', 1),
             ('22jun2021', 1),
             ('mpfightscorona', 1),
             ('level5', 1),
             ('voteofnoconfidence', 1),
             ('staythefhome', 1),
             ('antivaccinepas', 1),
             ('endgainoffunctionfunding', 1),
             ('tmc', 1),
             ('recoveryloanscheme', 1),
             ('rls', 1),
             ('wpro', 1),
             ('countydurham', 1),
             ('kbkinfographic

In [None]:
coronavirus_hashtags_list = ['borishasfailedbritain', 'boristheliar', 'bitcoin', 'bill', 'cancer', 'capitolriots', 'china', 'chinaliedpeopledied', 'chinesevirus', 'chinesevirus', 'communism', 'crimesagainsthumanity', 'crypto', 'cryptocurrency', 'doomsdaytwt', 'fakenews', 'federalgovernment', 'fda', 'firefauci', 'firefaucilegislation', 'fauci', 'fungalinfections', 'futureindanger', 'g7', 'g72021leaders', 'g7summit2021', 'government', 'grenfell', 'hackingduringthecoronaviruspandemic', 'hiv', 'lableaktheory','lableak', 'magneticpower', 'moronavirus', 'trumpvirus', 'tyranny', 'notocoronavirusvaccines', 'robots', 'russia', 'trumpisguilty', 'tyranny', 'whitesupremacy', 'wuhan', 'wuhanlab', 'xijinping']

In [None]:
#count of words in tweets
words = tweet_coronavirus.stack().str.split("[^\w+]").explode().tolist()

In [None]:
count_words = dict(zip(list(words),[list(words).count(i) for i in list(words)]))
print(count_words)

KeyboardInterrupt: ignored

## Hashtag EDA Covid

In [None]:
def get_hashtags_list(df):
    h_list = hashtags(df.hashtag)
    h_list_lower = lowercase(h_list)
    h_list_count = dict(zip(list(h_list_lower),[list(h_list_lower).count(i) for i in list(h_list_lower)]))
    h = h_list_count
    return h

In [None]:
covid_hashtags = get_hashtags_list(tweet_covid)

In [None]:
covid_hashtags

{'1a': 1,
 '2a': 1,
 '3x3annual': 1,
 '50swomen': 1,
 'a24': 1,
 'a24news': 1,
 'absolutejoke': 1,
 'ad': 1,
 'afirstwave': 34,
 'africafirst': 1,
 'africanews': 1,
 'agility': 1,
 'ahrq': 1,
 'ai': 2,
 'aircare02': 1,
 'airevolution': 1,
 'airport': 1,
 'america': 1,
 'american': 1,
 'americanjournal': 1,
 'amethi': 1,
 'analytics': 1,
 'andhra': 1,
 'antibodies': 1,
 'antiviral': 1,
 'art': 1,
 'artificialintelligence': 1,
 'arunachalcovidupdate': 2,
 'asakhe': 1,
 'asia': 1,
 'asmicrobe': 1,
 'assnchat': 1,
 'associationlife': 1,
 'astrazeneca': 2,
 'astrazenica': 7,
 'asymptomatic': 1,
 'atlanta': 1,
 'august': 1,
 'australia': 1,
 'awesome': 1,
 'backstabber': 1,
 'basemetals': 1,
 'bbcr4today': 1,
 'beatcovid': 1,
 'behibagh': 1,
 'belairviv': 1,
 'bengal': 1,
 'biden': 3,
 'bidenbordercrisis': 1,
 'bidensamerica': 1,
 'bigpharma': 5,
 'bigtech': 1,
 'billgates': 2,
 'billionaireboom': 4,
 'billionaires': 1,
 'billy': 1,
 'bitcoin': 1,
 'bitechnology': 1,
 'blockchain': 2,
 'bomb

In [None]:
covid_hashtags

{'1a': 1,
 '2a': 1,
 '3x3annual': 1,
 '50swomen': 1,
 'a24': 1,
 'a24news': 1,
 'absolutejoke': 1,
 'ad': 1,
 'afirstwave': 34,
 'africafirst': 1,
 'africanews': 1,
 'agility': 1,
 'ahrq': 1,
 'ai': 2,
 'aircare02': 1,
 'airevolution': 1,
 'airport': 1,
 'america': 1,
 'american': 1,
 'americanjournal': 1,
 'amethi': 1,
 'analytics': 1,
 'andhra': 1,
 'antibodies': 1,
 'antiviral': 1,
 'art': 1,
 'artificialintelligence': 1,
 'arunachalcovidupdate': 2,
 'asakhe': 1,
 'asia': 1,
 'asmicrobe': 1,
 'assnchat': 1,
 'associationlife': 1,
 'astrazeneca': 2,
 'astrazenica': 7,
 'asymptomatic': 1,
 'atlanta': 1,
 'august': 1,
 'australia': 1,
 'awesome': 1,
 'backstabber': 1,
 'basemetals': 1,
 'bbcr4today': 1,
 'beatcovid': 1,
 'behibagh': 1,
 'belairviv': 1,
 'bengal': 1,
 'biden': 3,
 'bidenbordercrisis': 1,
 'bidensamerica': 1,
 'bigpharma': 5,
 'bigtech': 1,
 'billgates': 2,
 'billionaireboom': 4,
 'billionaires': 1,
 'billy': 1,
 'bitcoin': 1,
 'bitechnology': 1,
 'blockchain': 2,
 'bomb

## Hashtag EDA Covid19

In [None]:
covid19_hashtags = get_hashtags_list(tweet_covid19)
covid19_hashtags

{'30seconds': 1,
 '3rdwave': 1,
 '7dayavg': 2,
 '7yearsofseva': 1,
 'ab_healthandwellnesscentres': 1,
 'abpoli': 1,
 'accountabilityand': 2,
 'actforequal': 1,
 'afghan': 1,
 'africa': 2,
 'africancsossurvey': 1,
 'africaunconstrained': 1,
 'ai': 1,
 'aif2021': 2,
 'albertaforall': 1,
 'allah': 1,
 'amarnathyatra': 4,
 'amazingproduct': 2,
 'amazon': 1,
 'america': 1,
 'americans': 1,
 'amerix': 1,
 'amtz': 2,
 'antimask': 1,
 'anxious': 1,
 'apocalypse': 1,
 'armys': 1,
 'artist': 1,
 'arunachalpradesh': 1,
 'asmano': 1,
 'assam': 2,
 'asıoldum': 1,
 'auspol': 2,
 'auspol2021': 1,
 'autophagy': 1,
 'awam': 1,
 'awareness': 1,
 'ayurvedic': 1,
 'baby': 1,
 'backtoschool': 1,
 'bakerylife': 1,
 'bakeryproducts': 1,
 'bame': 1,
 'banff': 1,
 'bangkok': 1,
 'bangladesh': 1,
 'bbc': 1,
 'bbcbreakfast': 2,
 'bbmp': 1,
 'bct': 1,
 'beautiful': 1,
 'before3rdwave': 21,
 'beita': 1,
 'bengalassemblyelections': 1,
 'bengaluru': 2,
 'berlin': 1,
 'biden': 1,
 'bihar': 2,
 'biontech': 1,
 'biowea

## Hashtag EDA Virus

In [None]:
virus_hashtags = get_hashtags_list(tweet_virus)
virus_hashtags

{'covid19': 231,
 'worldhealthorganization': 2,
 'virus': 724,
 'euro2021': 2,
 'mosquitoes': 1,
 'israel': 1,
 'vquiz': 1,
 'magnetism': 1,
 'nanotech': 1,
 'spikeprotein': 3,
 'germtheory': 1,
 'virology': 4,
 'pcr': 2,
 'postapocalyptic': 6,
 'corona': 41,
 'covid2019': 5,
 'coronavirus': 110,
 'canada': 4,
 'covid19canada': 6,
 'coronaviruscanada': 4,
 'coronavirusfinland': 4,
 'covid19fi': 4,
 'canadacoronavirus': 5,
 'covidー19': 5,
 'covid19finland': 5,
 'epidemic': 10,
 'covid__19': 4,
 'finland': 4,
 'covid2019finland': 3,
 'finlandcoronavirus': 3,
 'koronafi': 3,
 'wuhan': 67,
 'meateatersvirus': 2,
 'pandemics': 3,
 'force': 2,
 'health': 29,
 'pentagon': 2,
 'conflict': 1,
 'disease': 11,
 'emergency': 1,
 'employees': 1,
 'military': 1,
 'pandemic': 63,
 'personalprotectiveequipment': 1,
 'publichealth': 9,
 'secretary': 1,
 'vaccination': 23,
 'vegan': 1,
 'phage': 2,
 'bioinformatics': 1,
 'microbiology': 2,
 'uv': 3,
 'bacteria': 5,
 'vir': 1,
 'india': 71,
 'deltavarian

## Hashtag EDA Vaccine

In [None]:
vaccine_hashtags = get_hashtags_list(tweet_vaccine)
vaccine_hashtags

{'100daysofcode': 1,
 '324': 1,
 '5g': 11,
 'abdala': 2,
 'abhousingrealtypvtltd': 1,
 'abroad': 1,
 'actress': 1,
 'adjuvants': 1,
 'advertising': 1,
 'africa': 4,
 'africaresistance': 1,
 'ai': 1,
 'alberta': 1,
 'amrishpuri': 1,
 'antivaxxer': 1,
 'antivirals': 1,
 'apple': 1,
 'ardhinternational': 1,
 'army': 1,
 'arunachalpradesh': 1,
 'ashecon2021': 1,
 'asheville': 6,
 'asiapacific': 2,
 'assam': 1,
 'astrazeneca': 24,
 'astrazenecavaccine': 2,
 'aussies': 1,
 'australia': 2,
 'australians': 1,
 'baani': 1,
 'baanigirlsethnicwear': 1,
 'bangalore': 1,
 'bbc': 1,
 'beast': 3,
 'bharatbiotec': 3,
 'bharatbiotech': 5,
 'biden': 1,
 'bigpharma': 1,
 'billgates': 1,
 'biomarkers': 1,
 'bioreagents': 1,
 'biotech': 7,
 'biotechnology': 1,
 'bjp': 2,
 'blockchain': 1,
 'blog': 1,
 'blogging': 1,
 'blooddonorday': 1,
 'bolsonaro': 1,
 'bonds': 1,
 'brazil': 1,
 'breaking': 3,
 'breakingnews': 2,
 'bringsomefun': 1,
 'buffer': 1,
 'canada': 2,
 'cancer': 1,
 'cantwait': 1,
 'cdc': 1,
 'c

## Hashtag EDA Pandemic

In [None]:
pandemic_hashtags = get_hashtags_list(tweet_pandemic)
pandemic_hashtags

{'traveldayofaction': 1,
 'speakupfortravel': 1,
 'pandemic': 653,
 'covid19': 234,
 'g20': 1,
 'digital': 3,
 'climateaction': 1,
 'afirstwave': 4,
 'lisamariemeadows': 4,
 'anxiety': 5,
 'depression': 3,
 'charity': 2,
 'help': 1,
 'crisis': 4,
 'smallbusiness': 5,
 'health': 21,
 'wellbeing': 1,
 'cdp4recovery': 1,
 'ivychamp': 1,
 'suriya': 1,
 'jyothika': 1,
 'covidvaccination': 9,
 'vaccine': 16,
 'tuesdayvibes': 1,
 'tuesdaythoughts': 1,
 'career': 3,
 'jobs': 4,
 'work': 2,
 'workingfromhome': 4,
 'workout': 1,
 'business': 5,
 'humanresources': 1,
 'coronavirus': 70,
 'corona': 11,
 'pandemia': 3,
 'diversity': 1,
 'khou11': 6,
 'fauciemails': 1,
 'fauci': 2,
 'wuhanlableak': 1,
 'usa': 6,
 'wuhan': 2,
 'nstnation': 13,
 'recoveryplan': 1,
 'muhyiddin': 1,
 'labor': 1,
 'household': 1,
 'italy': 1,
 'lableaktheory': 1,
 'coronavirusorigin': 1,
 'coronaviruslableak': 1,
 'china': 19,
 'covid': 55,
 'fortune500': 1,
 'remoteworklife': 1,
 'remotework': 1,
 'diverceety': 1,
 'cli

## Total Chosen Hashtags

In [None]:
complete_hashtags_list = ['chinaliedpeopledied', 'chinesevirus', 'fauci', 'lableaktheory', 'lableak', 'magneticpower', 'notocoronavirusvaccines', 'wuhanlab', 'ccp_is_terrorist', 'ccpbiowarfare', 'fakenewsalert', 'fbi', 'kungflu', 'plandemic', 'sideeffects', 'statemedia', 'surveillancestate', 'terrorism', 'terrorists', 'theliberalmediaistheenemy', 'vaccinesideeffects', 'covidbraindamage', 'disinformation', '5g', 'aliens', 'antivax', 'billgates', 'biologicalweapon', 'bioweapon', 'bioweapons', 'fakenews' 'capitalismisthevirus', 'ccp_is_terrorist', 'ccpliedpeopledied', 'chinamustpay', 'chinazi', 'deathsentenceforfauci', 'democratsarecorrupt', 'democratsareevil', 'fauciliedpeopledied', 'gatesfoundation', 'usaexposed', 'ccpvirus', 'hoax', 'bigpharma', 'chinaagenda', 'cyberattack', 'cybersecurity', 'cyberthreats', 'faucigate', 'communismisthevirus', 'fakenews', 'fakedemic', 'chinavirus', 'coverup', 'father_of_corona', 'fatherofcorona', 'moronavirus', 'plague', 'report_china', 'report_china_father_of_corona', 'reportchina', 'hydroxychloroquine', '5gmast', 'magnetic', 'batsoup', 'deepstate', 'covid19hoax', 'trumpvirus']

In [None]:
complete_hashtags_list

['chinaliedpeopledied',
 'chinesevirus',
 'fauci',
 'lableaktheory',
 'lableak',
 'magneticpower',
 'notocoronavirusvaccines',
 'wuhanlab',
 'ccp_is_terrorist',
 'ccpbiowarfare',
 'fakenewsalert',
 'fbi',
 'kungflu',
 'plandemic',
 'sideeffects',
 'statemedia',
 'surveillancestate',
 'terrorism',
 'terrorists',
 'theliberalmediaistheenemy',
 'vaccinesideeffects',
 'covidbraindamage',
 'disinformation',
 '5g',
 'aliens',
 'antivax',
 'billgates',
 'biologicalweapon',
 'bioweapon',
 'bioweapons',
 'fakenewscapitalismisthevirus',
 'ccp_is_terrorist',
 'ccpliedpeopledied',
 'chinamustpay',
 'chinazi',
 'deathsentenceforfauci',
 'democratsarecorrupt',
 'democratsareevil',
 'fauciliedpeopledied',
 'gatesfoundation',
 'usaexposed',
 'ccpvirus',
 'hoax',
 'bigpharma',
 'chinaagenda',
 'cyberattack',
 'cybersecurity',
 'cyberthreats',
 'faucigate',
 'communismisthevirus',
 'fakenews',
 'fakedemic',
 'chinavirus',
 'coverup',
 'father_of_corona',
 'fatherofcorona',
 'moronavirus',
 'plague',
 'r

I now want to join the following dataframes together, and make a word cloud looking at the count of the complete_hashtag_lists

In [None]:
coronavirus = pd.read_csv("/content/tweet_coronavirus.csv")
covid = pd.read_csv("/content/tweet_covid.csv")
covid19 = pd.read_csv("/content/tweet_covid19.csv")
pandemic = pd.read_csv("/content/tweet_pandemic.csv")
vaccine = pd.read_csv("/content/tweet_vaccine.csv")
virus = pd.read_csv("/content/tweet_virus.csv")

In [None]:
df1 = coronavirus.append(covid, ignore_index = True)
df2 = df1.append(covid19, ignore_index = True)
df3 = df2.append(pandemic, ignore_index = True)
df4 = df3.append(vaccine, ignore_index = True)
df = df4.append(virus, ignore_index = True)

In [None]:
df.shape

(6000, 17)

In [None]:
df.to_csv('covid.csv')

In [None]:
import ast

def conspiracy(x):
    consp_list = np.array([])

    x = ast.literal_eval(x)

    for i in x:
        if i.lower() in complete_hashtags_list:
            consp_list = np.append(consp_list, [i])
            
    print(consp_list)
    return consp_list

In [None]:
#create hashtag2, which only includes hashtags that are in complete_hashtags_list
df['hashtag2'] = df.hashtag.apply(conspiracy)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['bigpharma' 'fauci']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['WuhanLab']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['BillGates']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['WuhanLab']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['disi

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,user,location,date,id,source,favourites,retweets,tweet_length,hashtag,cleaned,tokens,lemmatized,stemmed,split,hashtag2
0,0,New daily cases &amp; deaths of #coronavirus i...,Omar_Gaza,Palestine,2021-06-22 16:54:21,1407381456305938443,Twitter for Android,709,398,101,"['coronavirus', 'Gaza']",new daily cases &amp; deaths of #coronavirus i...,"['new', 'daily', 'cases', '&', 'amp', ';', 'de...","['new', 'daily', 'case', 'amp', 'death', 'coro...","['new', 'daily', 'case', 'amp', 'death', 'coro...","['new', 'dai', 'cas', 'amp', 'death', 'coronav...",[]
1,1,#coronavirus I’m feeling emotionally very fr...,JohnBoweActor,,2021-06-23 18:19:07,1407765175168647176,Twitter for iPhone,598,180,263,['coronavirus'],#coronavirus i’m feeling emotionally very frai...,"['#', 'coronavirus', 'i', '’', 'm', 'feeling',...","['coronavirus', '’', 'feeling', 'emotionally',...","['coronavirus', '’', 'feeling', 'emotionally',...","['coronavir', 'i’m', 'feel', 'emot', 'frail', ...",[]
2,2,Communist China produced a very inferior vacci...,SenJohnKennedy,Louisiana,2021-06-23 13:51:13,1407697757838663684,Twitter Media Studio,444,238,215,['coronavirus'],communist china produced a very inferior vacci...,"['communist', 'china', 'produced', 'a', 'very'...","['communist', 'china', 'produced', 'inferior',...","['communist', 'china', 'produced', 'inferior',...","['commun', 'chin', 'produc', 'infery', 'vaccin...",[]
3,3,RT @CoronaRecovery: 303044 people recovered fr...,viralvideovlogs,കേരളം,2021-06-23 21:27:22,1407812550369198082,Naattuvartha,0,1,138,[],rtpeople recovered from corona today. total co...,"['rtpeople', 'recovered', 'from', 'corona', 't...","['rtpeople', 'recovered', 'corona', 'today', '...","['rtpeople', 'recovered', 'corona', 'today', '...","['rtpeople', 'recov', 'coron', 'today', 'tot',...",[]
4,4,303044 people recovered from Corona today. Tot...,CoronaRecovery,,2021-06-23 21:27:20,1407812540025946117,corona-recoveries,0,1,151,"['COVID19', 'coronavirus', 'StaySafe']",303044 people recovered from corona today. tot...,"['303044', 'people', 'recovered', 'from', 'cor...","['303044', 'people', 'recovered', 'corona', 't...","['303044', 'people', 'recovered', 'corona', 't...","['303044', 'peopl', 'recov', 'coron', 'today',...",[]
