In [350]:
import json
import pandas as pd
import preprocessor as p
import nltk
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

In [351]:
tweets = []
for line in open('data/cdc_twitter_covid.json', 'r', encoding='utf-8'):
    tweets.append(json.loads(line))
    
df = pd.DataFrame(tweets)

In [352]:
df.columns

Index(['id', 'conversation_id', 'created_at', 'date', 'time', 'timezone',
       'user_id', 'username', 'name', 'place', 'tweet', 'language', 'mentions',
       'urls', 'photos', 'replies_count', 'retweets_count', 'likes_count',
       'hashtags', 'cashtags', 'link', 'retweet', 'quote_url', 'video',
       'thumbnail', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
       'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
       'trans_dest'],
      dtype='object')

In [353]:
df = df[['id', 'date', 'time', 'username', 'tweet', 'mentions','urls', 'photos', 'hashtags', 'link', 'quote_url']]

In [354]:
tweets_df = df[['id', 'date', 'time', 'username', 'tweet', 'hashtags']]
tweets_df

Unnamed: 0,id,date,time,username,tweet,hashtags
0,1363921515922796547,2021-02-22,13:40:04,cdcgov,#DYK? COVID-19 and flu can both cause fever an...,"[dyk, covid19]"
1,1363885801755533313,2021-02-22,11:18:09,cdcgov,A new @CDCMMWR looks at #COVID19 in Georgia el...,[covid19]
2,1362865816765661190,2021-02-19,15:45:06,cdcgov,Officials are concerned about 3 #COVID19 varia...,[covid19]
3,1362849192130080772,2021-02-19,14:39:02,cdcgov,Getting a #COVID19 vaccine is important for ad...,[covid19]
4,1362510972162867206,2021-02-18,16:15:04,cdcgov,#DYK? You need two #COVID19 mRNA vaccine doses...,"[dyk, covid19]"
...,...,...,...,...,...,...
426,1233891883195211780,2020-02-29,18:08:44,cdcgov,Reports of community spread of #COVID19 in Cal...,[covid19]
427,1233891113628557313,2020-02-29,18:05:40,cdcgov,"CDC, @WADeptHealth &amp; @KCPubHealth also rep...",[covid]
428,1233891108654108672,2020-02-29,18:05:39,cdcgov,"Today, @WADeptHealth and CDC reported the deat...","[covid, coronavirus]"
429,1233486735578976257,2020-02-28,15:18:49,cdcgov,CDC does not currently recommend the general p...,"[covid19, ppe]"


In [355]:
print('--- Print the Basic Info of the data ----')
print(tweets_df.info())
print(tweets_df.shape)

print('--- Print the Head/Tail of the data -----')
print(tweets_df.head())
print('------------------------')
print(tweets_df.tail())

--- Print the Basic Info of the data ----
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 431 entries, 0 to 430
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        431 non-null    int64 
 1   date      431 non-null    object
 2   time      431 non-null    object
 3   username  431 non-null    object
 4   tweet     431 non-null    object
 5   hashtags  431 non-null    object
dtypes: int64(1), object(5)
memory usage: 20.3+ KB
None
(431, 6)
--- Print the Head/Tail of the data -----
                    id        date      time username  \
0  1363921515922796547  2021-02-22  13:40:04   cdcgov   
1  1363885801755533313  2021-02-22  11:18:09   cdcgov   
2  1362865816765661190  2021-02-19  15:45:06   cdcgov   
3  1362849192130080772  2021-02-19  14:39:02   cdcgov   
4  1362510972162867206  2021-02-18  16:15:04   cdcgov   

                                               tweet        hashtags  
0  #DYK? COVID-19 and flu can

In [356]:
# remove URLs, emojis, smileys, mentions, hashtags, and reserved words
for i,v in enumerate(tweets_df['tweet']):
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.MENTION, p.OPT.HASHTAG, p.OPT.RESERVED) # options: p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG,
                                                                                                #p.OPT.RESERVED, p.OPT.EMOJI, p.OPT.SMILEY,
                                                                                                #p.OPT.NUMBER
    tweets_df.loc[i, 'tweet'] = p.clean(v)
    tweets_df.loc[i, 'tweet'] = tweets_df.loc[i, "tweet"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [357]:
tweets_df.loc[0, "tweet"]

'? COVID-19 and flu can both cause fever and cough, but shortness of breath is more common with than flu. Use this chart to learn more about the similarities and differences of COVID-19 and flu. Learn more: .'

In [358]:
# Remove extra white spaces, punctuation and apply lower casing
tweets_df['tweet'] = tweets_df['tweet'].str.lower().str.replace('[^\w\s]',' ').str.replace('\s\s+', ' ')
tweets_df.loc[0, "tweet"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_df['tweet'] = tweets_df['tweet'].str.lower().str.replace('[^\w\s]',' ').str.replace('\s\s+', ' ')


' covid 19 and flu can both cause fever and cough but shortness of breath is more common with than flu use this chart to learn more about the similarities and differences of covid 19 and flu learn more '

In [359]:
# lemmatize and tokenize
lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer =  TweetTokenizer()

def lemmatize_text(text):
    return [(lemmatizer.lemmatize(w)) for w in w_tokenizer.tokenize((text))]

words = tweets_df['tweet'].apply(lemmatize_text)
tweets_df['tokenized_tweet'] = pd.DataFrame(words)

# remove stopwords
stop_words = set(stopwords.words('english'))
tweets_df['removed_stopwords'] = tweets_df['tokenized_tweet'].apply(lambda x: [item for item in x if item not in stop_words])
tweets_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_df['tokenized_tweet'] = pd.DataFrame(words)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_df['removed_stopwords'] = tweets_df['tokenized_tweet'].apply(lambda x: [item for item in x if item not in stop_words])


Unnamed: 0,id,date,time,username,tweet,hashtags,tokenized_tweet,removed_stopwords
0,1363921515922796547,2021-02-22,13:40:04,cdcgov,covid 19 and flu can both cause fever and cou...,"[dyk, covid19]","[covid, 19, and, flu, can, both, cause, fever,...","[covid, 19, flu, cause, fever, cough, shortnes..."
1,1363885801755533313,2021-02-22,11:18:09,cdcgov,a new looks at in georgia elementary schools t...,[covid19],"[a, new, look, at, in, georgia, elementary, sc...","[new, look, georgia, elementary, school, study..."
2,1362865816765661190,2021-02-19,15:45:06,cdcgov,officials are concerned about 3 variants detec...,[covid19],"[official, are, concerned, about, 3, variant, ...","[official, concerned, 3, variant, detected, u,..."
3,1362849192130080772,2021-02-19,14:39:02,cdcgov,getting a vaccine is important for adults with...,[covid19],"[getting, a, vaccine, is, important, for, adul...","[getting, vaccine, important, adult, certain, ..."
4,1362510972162867206,2021-02-18,16:15:04,cdcgov,you need two mrna vaccine doses to get the mo...,"[dyk, covid19]","[you, need, two, mrna, vaccine, dos, to, get, ...","[need, two, mrna, vaccine, dos, get, protectio..."
...,...,...,...,...,...,...,...,...
426,1233891883195211780,2020-02-29,18:08:44,cdcgov,reports of community spread of in california o...,[covid19],"[report, of, community, spread, of, in, califo...","[report, community, spread, california, oregon..."
427,1233891113628557313,2020-02-29,18:05:40,cdcgov,cdc amp also reported 2 ppl have tested positi...,[covid],"[cdc, amp, also, reported, 2, ppl, have, teste...","[cdc, amp, also, reported, 2, ppl, tested, pos..."
428,1233891108654108672,2020-02-29,18:05:39,cdcgov,today and cdc reported the death of a person w...,"[covid, coronavirus]","[today, and, cdc, reported, the, death, of, a,...","[today, cdc, reported, death, person, hospital..."
429,1233486735578976257,2020-02-28,15:18:49,cdcgov,cdc does not currently recommend the general p...,"[covid19, ppe]","[cdc, doe, not, currently, recommend, the, gen...","[cdc, doe, currently, recommend, general, publ..."


In [360]:
# save tweets as txt file
#tweets_df["tweet"].to_csv("tweet" + '.txt', index=False)