In [1]:
import json
import pandas as pd
import preprocessor as p
import nltk
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

import spacy

ModuleNotFoundError: No module named 'preprocessor'

In [None]:
# Load the English spaCy pipeline used by the tokenizing helpers below.
# The "en" name is tried first; if spaCy raises OSError for it, fall back to the
# explicit "en_core_web_sm" package name.
# NOTE(review): "en" is presumably a shortcut link that only exists on older
# spaCy installs -- confirm against the spaCy version in use.
try:
    nlp = spacy.load("en")
except OSError:
    nlp = spacy.load("en_core_web_sm")

In [None]:
# Read the line-delimited JSON export (one tweet object per line) into a DataFrame.
# The context manager guarantees the file handle is closed, and blank lines
# (e.g. a trailing newline at the end of the file) are skipped instead of
# crashing json.loads.
with open('data/cdc_twitter_covid.json', 'r', encoding='utf-8') as tweets_file:
    tweets = [json.loads(line) for line in tweets_file if line.strip()]

df = pd.DataFrame(tweets)

In [None]:
# List the columns available in the raw tweet frame.
df.columns

In [None]:
# Narrow the raw frame down to the tweet fields of interest.
df = df[
    [
        'id', 'date', 'time', 'username', 'tweet',
        'mentions', 'urls', 'photos', 'hashtags', 'link', 'quote_url',
    ]
]

In [None]:
# Work on an independent copy: later cells assign values and new columns into
# tweets_df, which on a plain slice of df raises SettingWithCopyWarning and
# leaves it ambiguous whether the write lands on df or on a temporary.
tweets_df = df[['id', 'date', 'time', 'username', 'tweet', 'hashtags']].copy()
tweets_df

In [None]:
# Quick structural overview of the working frame.
print('--- Print the Basic Info of the data ----')
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
tweets_df.info()
print(tweets_df.shape)

print('--- Print the Head/Tail of the data -----')
print(tweets_df.head())
print('------------------------')
print(tweets_df.tail())

In [None]:
# Remove URLs, emojis, smileys, mentions, hashtags, and reserved words.
# Options offered by the preprocessor module: p.OPT.URL, p.OPT.MENTION,
# p.OPT.HASHTAG, p.OPT.RESERVED, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER.
# The options are configured once (they do not change per tweet), and the
# cleaning is applied label-aligned over the column instead of writing through
# enumerate() positions, which only matched the index while it was 0..n-1.
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.MENTION, p.OPT.HASHTAG, p.OPT.RESERVED)
tweets_df['tweet'] = tweets_df['tweet'].apply(p.clean)

In [None]:
# Spot-check the text of the tweet at index label 0.
tweets_df.loc[0, "tweet"]

In [None]:
# Lower-case the text, replace every non-word / non-space character with a
# space, then collapse runs of two or more whitespace characters into one space.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats
# the pattern as a literal string by default, which would silently leave the
# punctuation in place. Raw strings avoid invalid-escape warnings for \w and \s.
tweets_df['tweet'] = (
    tweets_df['tweet']
    .str.lower()
    .str.replace(r'[^\w\s]', ' ', regex=True)
    .str.replace(r'\s\s+', ' ', regex=True)
)
tweets_df.loc[0, "tweet"]

In [None]:
# spaCy-based tokenizer.
# NOTE: this definition shadows the word_tokenize imported from nltk at the top
# of the notebook.
def word_tokenize(word_list, model=nlp, MAX_LEN=1500000):
    """Split text into tokens with a spaCy pipeline, skipping punctuation.

    Parameters
    ----------
    word_list : str or list
        Text to tokenize. A one-element list is unwrapped to its single
        element; if the value is (still) a list, its elements are converted
        with ``str`` and joined with single spaces.
    model : spaCy pipeline, default ``nlp``
        Called on the text with the parser, tagger and ner components disabled.
    MAX_LEN : int
        Assigned to ``model.max_length`` before the text is processed; the
        assignment stays on the model after the call.

    Returns
    -------
    list of str
        ``token.text`` for every token that is not punctuation and is not
        empty after stripping whitespace.
    """
    text = word_list
    if type(text) is list and len(text) == 1:
        text = text[0]
    if type(text) is list:
        text = ' '.join(str(item) for item in text)

    # Only tokenization is needed, so the heavier pipeline components are
    # disabled and the maximum accepted text length is raised.
    model.max_length = MAX_LEN
    parsed = model(text, disable=["parser", "tagger", "ner"])

    return [
        tok.text
        for tok in parsed
        if not tok.is_punct and len(tok.text.strip()) > 0
    ]

In [None]:
def normalizeTokens(word_list, extra_stop=[], model=nlp, lemma=True, MAX_LEN=1500000):
    """Lower-case text and return its tokens without stop words, punctuation or numbers.

    Parameters
    ----------
    word_list : str or list
        Text to normalize. A one-element list is unwrapped to its single
        element; if the value is (still) a list, its elements are converted
        with ``str`` and joined with single spaces.
    extra_stop : iterable of str
        Additional words to flag as stop words. They are registered on
        ``model.vocab`` and stay flagged after the call. The default list is
        never mutated here.
    model : spaCy pipeline, default ``nlp``
        Called on the lower-cased text with parser, tagger and ner disabled.
    lemma : bool
        If True, collect ``token.lemma_``; otherwise the stripped ``token.text``.
    MAX_LEN : int
        Assigned to ``model.max_length`` before the text is processed; the
        assignment stays on the model after the call.

    Returns
    -------
    list of str
        One entry per token that is not a newline, stop word, punctuation mark
        or number-like token, and is not empty after stripping whitespace.

    NOTE(review): lemmas are produced with the tagger disabled; how good they
    are presumably depends on the spaCy version -- confirm.
    """
    if type(word_list) == list and len(word_list) == 1:
        word_list = word_list[0]

    if type(word_list) == list:
        word_list = ' '.join([str(elem) for elem in word_list])

    # Only normalization is needed, so the heavier pipeline components are
    # disabled and the maximum accepted text length is raised.
    model.max_length = MAX_LEN
    doc = model(word_list.lower(), disable=["parser", "tagger", "ner"])

    # Flag the extra stop words on the vocabulary of the pipeline that actually
    # produced `doc` (previously the global `nlp` was used even when a
    # different `model` was passed in, so the flags never reached the tokens).
    for stopword in extra_stop:
        model.vocab[stopword].is_stop = True

    normalized = []
    for w in doc:
        # Skip newlines, stop words, punctuation, number-like and blank tokens.
        if w.text == '\n' or w.is_stop or w.is_punct or w.like_num or len(w.text.strip()) == 0:
            continue
        # Keep either the lemma or the surface form, as requested.
        normalized.append(str(w.lemma_) if lemma else str(w.text.strip()))

    return normalized

In [None]:
# Tokenize every cleaned tweet, then derive the normalized token list from the
# tokens (both helpers are called with their default arguments).
tweets_df['tokenized_tweet'] = tweets_df['tweet'].apply(word_tokenize)
tweets_df['normalized_tokens'] = tweets_df['tokenized_tweet'].apply(normalizeTokens)

In [None]:
# Display the frame, now including the tokenized_tweet / normalized_tokens columns.
tweets_df

In [None]:
# save tweets as txt file
#tweets_df["tweet"].to_csv("tweet" + '.txt', index=False)

In [None]:
# Split each date string on '-' once (vectorized) and fan the parts out into
# year / month / day columns; rows with a missing date become NaN instead of
# raising. Assumes dates look like 'YYYY-MM-DD' (later cells compare against
# values such as '2020' and '02').
# NOTE: 'date' is overwritten with the day-of-month, so this cell must not be
# re-run without reloading the data first.
date_parts = tweets_df['date'].str.split('-', expand=True)
tweets_df['year'] = date_parts[0]
tweets_df['month'] = date_parts[1]
tweets_df['date'] = date_parts[2]

In [None]:
# Display the frame after the date has been split into year / month / date columns.
tweets_df

In [None]:
%store tweets_df

In [None]:
# Save the cleaned tweets to "cleaned_tweets.txt" in the working directory.
# The content is written by DataFrame.to_csv (so it is CSV-formatted despite
# the .txt extension) and the row index is left out.
tweets_df.to_csv("cleaned_tweets" + '.txt', index=False)

In [None]:
# Rows of tweets_df from February or March 2020, re-indexed from 0.
tweets01 = tweets_df[
    tweets_df["month"].isin(['02', '03']) & (tweets_df["year"] == '2020')
].reset_index(drop=True)
tweets01

In [None]:
# Rows of tweets_df from April through June 2020, re-indexed from 0.
tweets02 = tweets_df[
    tweets_df["month"].isin(['04', '05', '06']) & (tweets_df["year"] == '2020')
].reset_index(drop=True)
tweets02

In [None]:
# Rows of tweets_df from July through September 2020, re-indexed from 0.
tweets03 = tweets_df[
    tweets_df["month"].isin(['07', '08', '09']) & (tweets_df["year"] == '2020')
].reset_index(drop=True)
tweets03

In [None]:
# Rows of tweets_df from October through December 2020, re-indexed from 0.
tweets04 = tweets_df[
    tweets_df["month"].isin(['10', '11', '12']) & (tweets_df["year"] == '2020')
].reset_index(drop=True)
tweets04

In [None]:
# Every row of tweets_df whose year is 2021, re-indexed from 0.
tweets0121 = tweets_df.loc[tweets_df["year"] == '2021'].reset_index(drop=True)
tweets0121

In [None]:
%store tweets0121

In [None]:
%store tweets01

In [None]:
%store tweets02

In [None]:
%store tweets03

In [None]:
%store tweets04

In [None]:
# Read the second line-delimited JSON export (one tweet object per line).
# The context manager closes the file handle, and blank lines are skipped
# instead of crashing json.loads.
with open('data/cdc_twitter_since_2020.json', 'r', encoding='utf-8') as cdc_file:
    cdc_tweets = [json.loads(line) for line in cdc_file if line.strip()]

cdc_df = pd.DataFrame(cdc_tweets)
# .copy() makes the column subset independent, so the assignments below do not
# write into a slice (SettingWithCopyWarning).
cdc_df = cdc_df[['id', 'date', 'time', 'tweet']].copy()

# Split each date string on '-' once and fan the parts out into columns.
# Assumes dates look like 'YYYY-MM-DD' (later cells compare against values such
# as '2020' and '02'). 'date' ends up holding only the day-of-month.
cdc_date_parts = cdc_df['date'].str.split('-', expand=True)
cdc_df['year'] = cdc_date_parts[0]
cdc_df['month'] = cdc_date_parts[1]
cdc_df['date'] = cdc_date_parts[2]
cdc_df

In [None]:
# Rows of cdc_df from February or March 2020, re-indexed from 0.
all01 = cdc_df[
    cdc_df["month"].isin(['02', '03']) & (cdc_df["year"] == '2020')
].reset_index(drop=True)

In [None]:
# Trim the first period so it starts on 25 February 2020, matching the
# '2020 02.25-03' label used in the plot. Only February rows before the 25th
# are removed; comparing the day alone (as before) also discarded 1-24 March.
# The day values are zero-padded strings, so the string comparison is ordered
# correctly.
all01 = all01[~((all01['month'] == '02') & (all01['date'] < "25"))].reset_index(drop=True)
all01

In [None]:
# Rows of cdc_df from April through June 2020, re-indexed from 0.
all02 = cdc_df[
    cdc_df["month"].isin(['04', '05', '06']) & (cdc_df["year"] == '2020')
].reset_index(drop=True)
all02

In [None]:
# Rows of cdc_df from July through September 2020, re-indexed from 0.
all03 = cdc_df[
    cdc_df["month"].isin(['07', '08', '09']) & (cdc_df["year"] == '2020')
].reset_index(drop=True)
all03

In [None]:
# Rows of cdc_df from October through December 2020, re-indexed from 0.
all04 = cdc_df[
    cdc_df["month"].isin(['10', '11', '12']) & (cdc_df["year"] == '2020')
].reset_index(drop=True)
all04

In [None]:
# Every row of cdc_df whose year is 2021, re-indexed from 0.
all0121 = cdc_df.loc[cdc_df["year"] == '2021'].reset_index(drop=True)
all0121

# plots

In [None]:
import matplotlib.pyplot as plt
import matplotlib

# Stacked bar chart per period: the lower segment is the number of rows in the
# tweetsXX frames, the upper segment is the remainder up to the number of rows
# in the matching allXX frame.
matplotlib.rcParams.update({'font.family' : 'Times New Roman', 'font.size': 23})
fig = plt.figure(figsize = (12,8))
label_list = ['2020 02.25-03', '2020 04-06', '2020 07-09', '2020 10-12', '2021 01-02.22']
num_list1 = [len(tweets01), len(tweets02), len(tweets03), len(tweets04), len(tweets0121)]
num_list2 = [len(all01)-len(tweets01), len(all02)-len(tweets02), len(all03)-len(tweets03), 
             len(all04)-len(tweets04), len(all0121)-len(tweets0121)]
num_list3 = [len(all01), len(all02), len(all03), len(all04), len(all0121)]
x = range(len(num_list1))
rects1 = plt.bar(x=x, height=num_list1, width=0.45, alpha=0.8, label="COVID-related")
rects2 = plt.bar(x=x, height=num_list2, width=0.45, label="Total", bottom=num_list1)

# Annotate each bar. plt.text arguments: x position, height, content, centered.
for pos, (covid_count, total_count) in enumerate(zip(num_list1, num_list3)):
    # Percentage label just above the lower segment; a period without any
    # tweets is shown as 0.00% instead of raising ZeroDivisionError.
    share = covid_count / total_count * 100 if total_count else 0.0
    plt.text(pos, covid_count + 0.5, '{:.2f}'.format(share) + '%', ha='center')
    # Absolute total above the whole stacked bar.
    plt.text(pos, total_count + 20, '{}'.format(total_count), ha='center')

plt.ylim(0, 900)
plt.ylabel("Number")
plt.xticks(x, label_list)
plt.xlabel("Time")
plt.title("Number of CDC Tweets by Quarters")
plt.legend()
plt.show()