In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import re
import nltk
import tqdm


from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm.notebook import trange, tqdm

In [3]:
# Load the tweet CSV into a DataFrame; the path is relative to the notebook's
# working directory. Later cells read the 'username' and 'tweet' columns from it.
tweets_df = pd.read_csv('../input/health_tweets.csv')

In [36]:
# Keep only the account name ('username') and the tweet text ('tweet') so the
# raw tweet formatting is easy to inspect.
simple_df  = tweets_df[['username', 'tweet']]
# Preview the first rows as the cell output.
simple_df.head()

Unnamed: 0,username,tweet
0,bbchealth,How the UK’s coronavirus epidemic compares to ...
1,bbchealth,Health workers on frontline to be tested in En...
2,bbchealth,Coronavirus: Protective gear guidance 'to be u...
3,bbchealth,Coronavirus: What are ventilators and why are ...
4,bbchealth,Coronavirus: 'Act early to save more than 30 m...


## Tweets before cleaning

In [60]:
# Draw 10 rows for a before/after comparison; random_state is fixed so the same
# rows are picked on every run.
sample = simple_df.sample(n=10, random_state = 0)

In [61]:
# Render the sampled raw tweets with a wider 'tweet' column so the text is readable.
sample.style.set_properties(subset=['tweet'], width='500px')

Unnamed: 0,username,tweet
72711,goodhealth,"Cauliflower fried rice with veggies is a colorful, delicious meal to whip up anytime: https://trib.al/nYpZHZc pic.twitter.com/a2DSuRttYu"
140746,NYTHealth,Tiger Woods' bionic spine is something of a medical miracle. https://nyti.ms/2HrW5b2
1946,bbchealth,Mother's Asperger's poem strikes a chord on social media http://bbc.in/2BTRD3u
113570,NBCNewsHealth,What causes healthy young mothers to have a heart attack? http://nbcnews.to/2id6xoL
20827,cnnhealth,U.S. schools: Still separate and unequal  http://cnn.it/1UDZ2oT pic.twitter.com/OGsWHKL0Kq
95010,KHNews,Suicide rates in the U.S. have risen nearly 30 percent since 1999. http://ow.ly/6G1l30konQC via @LizSzabo
108908,NBCNewsHealth,"When it comes to vaccines, celebrities often call the shots https://nbcnews.to/2Jldpis"
99210,KHNews,Single-Payer Health Care On Colorado Ballot In 2016 http://khne.ws/39hOFb
102640,latimeshealth,New guidelines advise more women to consider testing for breast cancer gene risk https://lat.ms/2Helcio
33072,EverydayHealth,Be aware! People taking these medicines and drugs are at a 30 percent increased risk of dementia: https://trib.al/B9Z3G2L pic.twitter.com/8uCvFTeSiW


In [14]:
# functions for removing links and user info
def remove_links(tweet):
    '''Remove web links and link placeholders from a tweet.

    Removes, in order:
      1. anything starting with "http" up to the next whitespace,
      2. scheme-less "bit.ly/..." short links,
      3. literal "[link]" placeholders (anywhere in the text),
      4. "pic.twitter..." media links.

    Parameters
    ----------
    tweet : str
        Tweet text.

    Returns
    -------
    str
        The text with the links removed. Whitespace around the removed
        spans is left as-is.
    '''
    tweet = re.sub(r'http\S+', '', tweet)        # http:// and https:// links
    tweet = re.sub(r'bit\.ly/\S+', '', tweet)    # bitly links; dot escaped so it only matches "."
    # str.strip('[link]') treats its argument as a *set of characters* and would
    # eat any leading/trailing "[", "l", "i", "n", "k", "]" (e.g. "brain" -> "bra").
    # Remove the literal placeholder instead.
    tweet = tweet.replace('[link]', '')
    tweet = re.sub(r'pic\.twitter\S+', '', tweet)  # embedded picture links
    return tweet

def remove_users(tweet):
    '''Remove retweet markers and @user mentions from a tweet.

    A retweet marker is the standalone word "RT", one whitespace character
    and an @handle. A handle is "@" followed by one or more letters, digits,
    underscores or hyphens, so short handles ("@a") and handles that start
    with a digit or underscore are removed as well.

    Parameters
    ----------
    tweet : str
        Tweet text.

    Returns
    -------
    str
        The text with retweet markers and mentions removed. Whitespace
        around the removed spans is left as-is.
    '''
    # Raw strings keep "\s" / "\b" as regex escapes rather than (invalid) string escapes.
    # \b stops the pattern from eating the tail of a longer word such as "SMART @x".
    tweet = re.sub(r'\bRT\s@[A-Za-z0-9_-]+', '', tweet)  # remove retweet marker
    tweet = re.sub(r'@[A-Za-z0-9_-]+', '', tweet)        # remove remaining mentions
    return tweet


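### Using Stemming for data cleaning

*Note: the stemmer call in the cell below is currently commented out, so this pass only removes users, links, punctuation, numbers and stopwords.*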

In [28]:
# Fetch the NLTK 'stopwords' corpus, which the next cell reads via
# nltk.corpus.stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bicaj\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [54]:
# English stopword list used by clean_tweet below.
# NOTE: this rebinds the name `stopwords`, which the import cell bound to the
# nltk.corpus.stopwords module; from here on `stopwords` is a list of words.
stopwords = nltk.corpus.stopwords.words('english')
# Stemmer is currently disabled:
# word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
# Characters replaced by a space in clean_tweet. '#' is not listed, so it is
# not stripped by the punctuation step.
punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'         # define a string of punctuation symbols

# main function to clean tweet
def clean_tweet(tweet, bigrams=False):
    '''Master function to clean a tweet (stopword-removal variant).

    Steps: drop @users and links, lower-case, replace punctuation with
    spaces, delete digits, split on whitespace and drop stopwords.
    Relies on the module-level `stopwords` and `punctuation` values and on
    `remove_users` / `remove_links`.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    bigrams : bool, default False
        If True, append "w1_w2" tokens for every pair of adjacent kept words.

    Returns
    -------
    str
        The kept tokens joined by single spaces (no empty tokens, no
        leading/trailing or doubled spaces).
    '''
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = tweet.lower()  # lower case
    # re.escape makes every listed symbol literal inside the character class;
    # unescaped, the "\]" pair is read as an escaped "]" and the backslash
    # itself is never stripped.
    tweet = re.sub('[' + re.escape(punctuation) + ']+', ' ', tweet)  # strip punctuation
    tweet = re.sub(r'[0-9]+', '', tweet)  # remove numbers
    # split() with no argument collapses whitespace runs and ignores leading /
    # trailing blanks, so removed numbers or links never leave empty tokens
    # behind (which would otherwise show up as double spaces and "_word" bigrams).
    tweet_token_list = [word for word in tweet.split()
                        if word not in stopwords]  # remove stopwords
#     tweet_token_list = [word_rooter(word) if '#' not in word else word
#                         for word in tweet_token_list] # apply word rooter
    if bigrams:
        tweet_token_list = tweet_token_list + [
            first + '_' + second
            for first, second in zip(tweet_token_list, tweet_token_list[1:])
        ]
    return ' '.join(tweet_token_list)


In [81]:
# Clean every tweet and keep the result as a one-column DataFrame.
# NOTE: clean_tweet is defined twice in this notebook; this call uses whichever
# definition was executed most recently in the kernel.
simple_clean_df_stem = simple_df['tweet'].apply(clean_tweet).to_frame()

#### Tweets after cleaning (stemming)

In [85]:
# Sample with the same n and random_state as the raw `sample` cell so the
# cleaned rows can be compared against the raw ones.
sample_clean_stem = simple_clean_df_stem.sample(n=10, random_state=0)

In [86]:
# Render the cleaned sample with a wider 'tweet' column.
sample_clean_stem.style.set_properties(subset=['tweet'], width='500px')

Unnamed: 0,tweet
72711,cauliflower fry rice veggies colorful delicious meal whip anytime
140746,tiger woods bionic spine medical miracle
1946,mother asperger poem strike chord social media
113570,cause healthy young mother heart attack
20827,school separate unequal
95010,suicide rat rise nearly percent
108908,come vaccines celebrities shots
99210,single payer health care colorado ballot
102640,guidelines advise women consider test breast cancer gene risk
33072,aware people take medicine drug percent increase risk dementia


### Using Lemmatization for data cleaning

In [76]:
# Fetch the NLTK 'wordnet' corpus before using WordNetLemmatizer below.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bicaj\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [83]:
# NOTE(review): this import lives outside the notebook's top import cell.
import gensim
# Same stopword list and punctuation string as the earlier cleaning cell.
# The lemmatize() function below filters with gensim's STOPWORDS, not this list.
stopwords = nltk.corpus.stopwords.words('english')
punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'         # define a string of punctuation symbols

## functions leveraged from https://towardsdatascience.com/nlp-extracting-the-main-topics-from-your-dataset-using-lda-in-minutes-21486f5aa925

def lemmatize_stemming(tweet):
    '''Return the WordNet lemma of `tweet`, treated as a verb (pos='v').

    Despite the name, no stemmer is applied here, and the argument is a
    single token when called from lemmatize().
    '''
    lemmatizer = WordNetLemmatizer()
    lemma = lemmatizer.lemmatize(tweet, pos='v')
    return lemma

# tokenize and lemmatize
def lemmatize(tweet):
    '''Tokenize `tweet` with gensim and return the lemmatized tokens as a list.

    Tokens that are in gensim's STOPWORDS, or that have 3 or fewer
    characters, are dropped before lemmatization.
    '''
    excluded = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(tok)
        for tok in gensim.utils.simple_preprocess(tweet)
        if len(tok) > 3 and tok not in excluded
    ]

def clean_tweet(tweet, bigrams=False):
    '''Master function to clean a tweet (lemmatization variant).

    Removes @users and links, lower-cases, replaces punctuation with spaces,
    collapses whitespace, deletes digits, then tokenizes and lemmatizes via
    lemmatize(). With bigrams=True, "w1_w2" tokens for each adjacent pair of
    lemmas are appended. Returns the tokens joined by single spaces.

    NOTE: this reuses the name of the clean_tweet defined in the earlier
    cleaning cell and replaces it once this cell has run.
    '''
    text = remove_links(remove_users(tweet)).lower()

    # (pattern, replacement) pairs applied in order
    substitutions = (
        ('[' + punctuation + ']+', ' '),   # strip punctuation
        (r'\s+', ' '),                     # remove double spacing
        (r'([0-9]+)', ''),                 # remove numbers
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    tokens = lemmatize(text)  # tokenization + lemmatization

    if bigrams:
        pairs = ['_'.join(pair) for pair in zip(tokens, tokens[1:])]
        tokens = tokens + pairs
    return ' '.join(tokens)




In [84]:
# Apply clean_tweet to every tweet and keep the result as a one-column DataFrame.
simple_clean_df_lemma = simple_df['tweet'].apply(clean_tweet).to_frame()

#### Tweets after cleaning (lemmatization)

In [87]:
# Sample with the same n and random_state as the raw `sample` cell so the
# cleaned rows can be compared against the raw ones.
sample_clean_lemma = simple_clean_df_lemma.sample(n=10, random_state=0)

In [88]:
# Render the lemmatized sample with a wider 'tweet' column.
sample_clean_lemma.style.set_properties(subset=['tweet'], width='500px')

Unnamed: 0,tweet
72711,cauliflower fry rice veggies colorful delicious meal whip anytime
140746,tiger woods bionic spine medical miracle
1946,mother asperger poem strike chord social media
113570,cause healthy young mother heart attack
20827,school separate unequal
95010,suicide rat rise nearly percent
108908,come vaccines celebrities shots
99210,single payer health care colorado ballot
102640,guidelines advise women consider test breast cancer gene risk
33072,aware people take medicine drug percent increase risk dementia


In [89]:
# ## alternate approach to cleaning tweet data
# ## taken from https://towardsdatascience.com/topic-modeling-of-2019-hr-tech-conference-twitter-d16cf75895b6

# from nltk.stem import WordNetLemmatizer
# from nltk.corpus import stopwords, wordnet
# import nltk

# def get_wordnet_pos(word):
#     """
#     Map POS tag to first character lemmatize() accepts
#     """
#     tag = nltk.pos_tag([word])[0][1][0].upper()
#     tag_dict = {"J": wordnet.ADJ,
#                 "N": wordnet.NOUN,
#                 "V": wordnet.VERB,
#                 "R": wordnet.ADV}

#     return tag_dict.get(tag, wordnet.NOUN)

# text_clean = text.lower()
# # Remove non-alphabet
# text_clean = re.sub(r'[^a-zA-Z]|(\w+:\/\/\S+)',' ', text_clean).split()    
# # Remove short words (length < 3)
# text_clean = [w for w in text_clean if len(w)>2]
# # Lemmatize text with the appropriate POS tag
# lemmatizer = WordNetLemmatizer()
# text_clean = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in text_clean]
# # Filter out stop words in English 
# stops = set(stopwords.words('english')).union(additional_stop_words)
# text_clean = [w for w in text_clean if w not in stops]