In [84]:
import pandas as pd 
import numpy as np
import nltk
import re
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


tweets = pd.read_csv('data/harvey_tweets.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [89]:
f = tweets['text'].isnull()
tweets = tweets[~f]

## Tweet Cleaning Function
### Cleans all the noise in the tweets 

In [90]:
def lemmatize(x):
    lemmatizer = WordNetLemmatizer()

    return lemmatizer.lemmatize(lemmatizer.lemmatize(x, pos='v'))

def tweet_clean(text):
    '''function to clean each tweet
    - Remove www nad http patterns
    - Remove numbers and symbols
    - Negation handling
    - Lowercase
    - Word stemming
    - Remove words with less than 2 characters
    '''
    stop_words = set(stopwords.words('english'))
    tok = WordPunctTokenizer()
    
#    remove invalid symbol ("\ufffd")
    try:
        bom_removed = text.decode("utf-8-sig").replace(u"\ufffd", "")
    except:
        bom_removed = text
    
#    remove mentioned username and links(https and www) patterns
    user_pat = r'@[A-Za-z0-9_]+'
    http_pat1 = r'http://[^ ]+'
    http_pat2 = r'https://[^ ]+'
    www_pat = r'www.[^ ]+'
    combined_pat = r'|'.join((user_pat, http_pat1, http_pat2, www_pat)) 
    pat_removed = re.sub(combined_pat, '', bom_removed)
    lower_text = pat_removed.lower()
    
#    handle negation patterns
    negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", 
                     "weren't":"were not","haven't":"have not","hasn't":"has not",
                     "hadn't":"had not","won't":"will not","wouldn't":"would not", 
                     "don't":"do not", "doesn't":"does not","didn't":"did not",
                     "can't":"can not","couldn't":"could not","shouldn't":"should not",
                     "mightn't":"might not", "mustn't":"must not"}
    neg_pat = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')
    neg_handled = neg_pat.sub(lambda x: negations_dic[x.group()], lower_text)
    
#    remove non-letter characters
    letters_only = re.sub("[^a-zA-Z0-9]", " ", neg_handled)
    
#    token and lemmatize words
    words = [lemmatize(x) for x in tok.tokenize(letters_only)]
#    filter out tokens with less than 1 characters
    words = [x for x in words if len(x) > 1]
#    filter out tokens of stopping words and meaningless words
    meaningless_words = []
#    stop_words = set(stopwords.words('english'))
    words = [x for x in words if not x in meaningless_words]
    
    return (" ".join(words)).strip()

In [93]:
# tweets['text'] = tweets['text'].apply(tweet_clean)
tweets['text'] = tweets['text'].apply(tweet_clean)

In [123]:
tweets.drop_duplicates(subset ="text", 
                     keep = False, inplace = True)
f = tweets['text'].str.contains(r'\b(hurricane|harvey|tornado|storm|texasstrong)\b')
tweets[f]


Unnamed: 0,index,user_id,created_at,text,state,county
5426,14,{'$numberLong': '2307660186'},8/20/2017,my dollface take the world by storm do what yo...,Texas,Waller County
7431,800,30782296,8/20/2017,it be great to meet the next senator from the ...,Texas,Brazos County
7697,292,350474324,8/20/2017,abram tx sun aug 20th pm forecast tonight most...,Texas,Hidalgo County
7698,293,492491698,8/20/2017,adam garden tx sun aug 20th pm forecast tonigh...,Texas,Cameron County
7699,294,352837528,8/20/2017,addicks barker tx sun aug 20th pm forecast ton...,Texas,Harris County
...,...,...,...,...,...,...
436546,824,85232875,9/20/2017,prayer for all of the people that have be effe...,Texas,Gillespie County
436650,101,{'$numberLong': '2863330414'},9/20/2017,harvey benefit concert,Texas,Galveston County
437895,881,190833439,9/20/2017,super rad storm come through stoke to almost b...,Texas,Harris County
437922,31,{'$numberLong': '808703232012910593'},9/20/2017,bellydance event with fusiondance fundraiser t...,Texas,Fort Bend County


In [120]:
def top_topics(tweets, meaningless_words, top_k):

    tok = WordPunctTokenizer()
    texts = tweets.text
    meaningless_words = meaningless_words + ['hurricane', 'harvey', 'houstonstrong', 'due',
                                             'hurricaneharvey', 'amp', 'houston', 'traffic',
                                             'texas', 'tornado', 'storm', 'cdt', 'texasstrong']
    stop_words = list(set(stopwords.words('english')))
    meaningfull_list = []
    for text in texts:
        text_words = [x for x in tok.tokenize(text) if not x in stop_words + meaningless_words]
        # for tweet with at least one meaningfull word
        if len(text_words) > 0:
            if len(text_words) == 1:
                # remove word with less than 2 characters
                if len(text_words[0]) > 2:
                    meaningfull_list.append(text_words[0])
            else:
                text_words = [x for x in text_words if len(x) > 2]
                meaningfull_list.append((" ".join(text_words)).strip())
    
    if len(meaningfull_list) >= 1:
        cvec = CountVectorizer()
        X = cvec.fit_transform(meaningfull_list)
        tn = pd.Series(cvec.get_feature_names())
        tf = np.sum(X, axis=0)
        tf = np.squeeze(np.asarray(tf))
        idx = np.argsort(-tf)
        if len(tn) > top_k:
            top_words = pd.Series.tolist(tn[idx[:top_k]])
        else:
            top_words = pd.Series.tolist(tn[idx])
        top_words = [top_words[ii] + '_' + str(tf[idx[ii]]) for ii in range(len(top_words))]
    else:
        top_words = np.nan
    
    return top_words

In [133]:
# for i,row in tweets.iterrows():
#     print(row['text'])

tweets.tail

<bound method NDFrame.tail of        index                                user_id created_at  \
5426      14          {'$numberLong': '2307660186'}  8/20/2017   
7431     800                               30782296  8/20/2017   
7697     292                              350474324  8/20/2017   
7698     293                              492491698  8/20/2017   
7699     294                              352837528  8/20/2017   
...      ...                                    ...        ...   
436546   824                               85232875  9/20/2017   
436650   101          {'$numberLong': '2863330414'}  9/20/2017   
437895   881                              190833439  9/20/2017   
437922    31  {'$numberLong': '808703232012910593'}  9/20/2017   
438270   509  {'$numberLong': '805790223812743172'}  9/20/2017   

                                                     text  state  \
5426    my dollface take the world by storm do what yo...  Texas   
7431    it be great to meet the next sena