## Imports

In [1]:
import pandas as pd
import numpy as np
import csv
from collections import defaultdict
import pickle
import preprocessor as p 
import spacy

## Get Data

df_pge = pge blackout trending

df_wf = wildfire trending

In [2]:
df_pge = pd.read_csv('data/pge_shutdown.csv', delimiter = ',', quotechar = '|', 
                 names = ['date', 'tweet', 'users', 'followers', 'location'])

In [3]:
df_pge.head()

Unnamed: 0,date,tweet,users,followers,location
0,2019-11-19 02:34:26,b'Im gonna miss those very distant lights soon...,girloneonegirl,725,Yer moms
1,2019-11-19 02:34:13,b'I have empathy for people affected by the we...,mr_goblins948,1812,"California, USA"
2,2019-11-19 02:26:24,"b'at least my house wont be affected, right? #...",Xxepicgamer_xX,216,North Siberian Gulag
3,2019-11-19 02:14:47,b'Here we go again. Every damn time I go to co...,Quesozitto,33,I'm lost too.
4,2019-11-19 02:10:15,b'@lakatzzz @RedwoodGirl @yellowsnpdragon Ridi...,Cheryl4SaveCali,2117,"Sacramento, CA"


In [2]:
df_wf = pd.read_csv('data/wildfire_recent.csv', delimiter = ',', quotechar = '|', 
                 names = ['date', 'tweet', 'users', 'followers', 'location'])

In [3]:
df_wf.head()

Unnamed: 0,date,tweet,users,followers,location
0,2019-11-19 18:15:35,Sad. Because of this Summer's wildfire and sub...,RyanBernhart_Wx,36,"Maricopa, AZ"
1,2019-11-19 18:15:16,Huge Flow Country wildfire 'doubled #Scotland'...,CWL_BeGreen,1043,
2,2019-11-19 18:14:42,Suburban sprawl and climate change complicate ...,babday,2,
3,2019-11-19 18:13:39,"In 2018, over 8 million acres were burned by w...",FavaFinancialGr,188,"Totowa, New Jersey"
4,2019-11-19 18:13:37,"In 2018, over 8 million acres were burned by w...",ElkAgencyIns,12,"Elk River, Minnesota"


In [4]:
df_wf.shape

(9662, 5)

In [6]:
# lowercase
#df_pge['tweet'] = [x.lower() for x in df_pge['tweet']]
df_wf['tweet'] = [x.lower() for x in df_wf['tweet']]

In [7]:
# remove 'like wildfire' 
for i,tweet in enumerate(df_wf['tweet']):
    if 'like wildfire' in tweet:
        df_wf = df_wf.drop(i)

In [8]:
df_wf.shape

(8970, 5)

## Tokenize using preprocessor

In [25]:
tokens = [p.tokenize(tweet) for tweet in df_wf['tweet']]
parsed_tweet = [p.parse(tweet) for tweet in df_wf['tweet']]
clean = [p.clean(tweet) for tweet in df_wf['tweet']]

In [11]:
# drop duplicates?
df_wf = df_wf.drop_duplicates(subset = 'tweet')

In [14]:
df_wf = df_wf.reset_index(drop = True)

In [18]:
df_wf['emojis'] = [tweet.emojis for tweet in parsed_tweet]

In [19]:
df_wf['hashtags'] = [tweet.hashtags for tweet in parsed_tweet]

In [20]:
df_wf.location.value_counts()

United States          120
California, USA         89
London, England         89
New York, NY            88
Los Angeles, CA         86
                      ... 
in your dreams...        1
Marble Falls, TX         1
Bangladesh 🇧🇩            1
Hamilton, ON CANADA      1
Walt Disney Land         1
Name: location, Length: 3028, dtype: int64

In [21]:
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import string
# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

In [22]:
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]
    
    # Remove mentions, hashtags, urls
    mytokens = [word for word in mytokens if '$' not in word]
    
    # Remove '...'
    mytokens = [word for word in mytokens if '…' not in word]

    # return preprocessed list of tokens
    return mytokens

In [27]:
df_wf['clean_tweet'] = [' '.join(spacy_tokenizer(tweet)) for tweet in tokens]

In [30]:
df_wf['tokens'] = [tweet.split(' ') for tweet in df_wf['clean_tweet']]

In [31]:
df_wf.head()

Unnamed: 0,date,tweet,users,followers,location,emojis,hashtags,clean_tweet,tokens
0,2019-11-19 18:15:35,sad. because of this summer's wildfire and sub...,RyanBernhart_Wx,36,"Maricopa, AZ",,,sad summer wildfire subsequent flash flooding ...,"[sad, summer, wildfire, subsequent, flash, flo..."
1,2019-11-19 18:15:16,huge flow country wildfire 'doubled #scotland'...,CWL_BeGreen,1043,,,[(36:45) => #scotland],huge flow country wildfire doubled emissions m...,"[huge, flow, country, wildfire, doubled, emiss..."
2,2019-11-19 18:14:42,suburban sprawl and climate change complicate ...,babday,2,,,,suburban sprawl climate change complicate wild...,"[suburban, sprawl, climate, change, complicate..."
3,2019-11-19 18:13:39,"in 2018, over 8 million acres were burned by w...",FavaFinancialGr,188,"Totowa, New Jersey",,,2018 8 million acres burned wildfire working a...,"[2018, 8, million, acres, burned, wildfire, wo..."
4,2019-11-19 18:13:37,"in 2018, over 8 million acres were burned by w...",ElkAgencyIns,12,"Elk River, Minnesota",,,2018 8 million acres burned wildfire working a...,"[2018, 8, million, acres, burned, wildfire, wo..."


In [33]:
for i,tweet in enumerate(df_wf['clean_tweet']):
    if 'michael' in tweet:
        df_wf = df_wf.drop(i)
    if 'youngadultfemalevocalistoftheyear' in tweet:
        df_wf = df_wf.drop(i)

In [34]:
df_wf = df_wf.reset_index(drop = True)

In [36]:
df_wf.shape

(8792, 9)

In [37]:
df_wf.head()

Unnamed: 0,date,tweet,users,followers,location,emojis,hashtags,clean_tweet,tokens
0,2019-11-19 18:15:35,sad. because of this summer's wildfire and sub...,RyanBernhart_Wx,36,"Maricopa, AZ",,,sad summer wildfire subsequent flash flooding ...,"[sad, summer, wildfire, subsequent, flash, flo..."
1,2019-11-19 18:15:16,huge flow country wildfire 'doubled #scotland'...,CWL_BeGreen,1043,,,[(36:45) => #scotland],huge flow country wildfire doubled emissions m...,"[huge, flow, country, wildfire, doubled, emiss..."
2,2019-11-19 18:14:42,suburban sprawl and climate change complicate ...,babday,2,,,,suburban sprawl climate change complicate wild...,"[suburban, sprawl, climate, change, complicate..."
3,2019-11-19 18:13:39,"in 2018, over 8 million acres were burned by w...",FavaFinancialGr,188,"Totowa, New Jersey",,,2018 8 million acres burned wildfire working a...,"[2018, 8, million, acres, burned, wildfire, wo..."
4,2019-11-19 18:13:37,"in 2018, over 8 million acres were burned by w...",ElkAgencyIns,12,"Elk River, Minnesota",,,2018 8 million acres burned wildfire working a...,"[2018, 8, million, acres, burned, wildfire, wo..."


In [38]:
df_wf = df_wf.drop_duplicates(subset = 'clean_tweet')

In [40]:
df_wf = df_wf.reset_index(drop = True)

In [41]:
with open('data/df_recent.pickle', 'wb') as to_write:
    pickle.dump(df_wf, to_write)

## Get Topics

In [35]:
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD

In [182]:
def tfidf(documents):
    my_additional_stop_words = ['wildfire', 'like', 'pron']
    stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)
    cv_tfidf = TfidfVectorizer(analyzer = 'word',ngram_range = (2,3), min_df = 3, stop_words=stop_words, token_pattern = "\\b[a-z][a-z]+\\b") 
    doc_word = cv_tfidf.fit_transform(documents)
    words = cv_tfidf.get_feature_names()
    id2word = dict((v, k) for k, v in cv_tfidf.vocabulary_.items())
    return doc_word, words, id2word, cv_tfidf

In [197]:
def sk_lda(doc_word, n_topics):
    lda = LatentDirichletAllocation(n_components=n_topics,random_state=0)
    doc_topic = lda.fit_transform(doc_word)
    return lda, doc_topic, lda.exp_dirichlet_component_

In [198]:
def dim_lsa(doc_word, no_topics):
    '''
    This function takes a sparse matrix map of documents to words and reduces the dimensions
    to topics. It returns an array of documents mapped to topics by "relatedness". Each row in the array
    has (no_topics) items in it.
    --------------------
    Inputs: sparse matrix, int
    Outputs: model, array, list
    '''
    lsa = TruncatedSVD(no_topics)
    doc_topic = lsa.fit_transform(doc_word)
    return lsa, doc_topic, lsa.explained_variance_ratio_

In [184]:
def display_topics(model, words, no_top_words, topic_names=None):
    '''
    This function takes a dim reduction model, words, number of words to display, and topic_names 
    (default= none). It returns strings of topics.
    ----------------
    Input: function, list, int
    Output: strs
    '''
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([words[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [278]:
doc_word, words, id2word, cv_tfidf = tfidf(df_wf['clean_tweet'])

In [192]:
lda, doc_topic, variance = sk_lda(doc_word, 6)

In [193]:
display_topics(lda, words, 10)


Topic  0
artificial intelligence, california wildfires, southern california, south wales, new south wales, new south, year ago, gender reveal, company training, california million

Topic  1
california blackout, blackout crisis, california blackout crisis, blackout crisis blame, crisis blame, pg amp, los angeles, california dream, darkens california, policy totally

Topic  2
climate change, pg amp, social media, power crisis, california faces, forest service, wildfires california, sen hearing, faces fraught path, fraught path

Topic  3
kennedy peak, air quality, pg amp, gender reveals, state regulators, attendance business opportunity, rsvp ur, rsvp ur competition, opportunity rsvp ur, ur competition

Topic  4
doubled scotland, scotland emissions, doubled scotland emissions, flow country, huge flow, huge flow country, country doubled, flow country doubled, country doubled scotland, pg amp

Topic  5
home prevention, tips home, tips home prevention, help restore, acres burned, million ac

## LSA works better

In [199]:
lsa, doc_topic, variance = dim_lsa(doc_word, 6)

In [201]:
display_topics(lsa, words, 10, topic_names = ['Restoration', 'Scotland', 'Home Tips',
                                             'PG&E', 'Ca Forest Policy', 'Climate Change'])


Topic: ' Restoration '
help restore, acres burned, million acres burned, american forests help, restore acres, restore acres da, acres da, working american, working american forests, acres burned working

Topic: ' Scotland '
scotland emissions, doubled scotland, doubled scotland emissions, flow country, huge flow country, huge flow, flow country doubled, country doubled, country doubled scotland, bbc news

Topic: ' Home Tips '
home prevention, tips home prevention, tips home, stay safe, warner bros, warner bros studios, bros studios, studios evacuated, studios evacuated nearby, evacuated nearby

Topic: ' PG&E '
pg amp, compensation victims, billion compensation, billion compensation victims, amp offering, pg amp offering, offering billion, amp offering billion, offering billion compensation, amp offer

Topic: ' Ca Forest Policy '
policy totally, totally backfired, policy totally backfired, california policy totally, california policy, totally backfired native, know fix, backfired nati

In [295]:
df_wf.head()

Unnamed: 0,date,tweet,users,followers,location,emojis,hashtags,clean_tweet
0,2019-11-19 18:15:35,sad. because of this summer's wildfire and sub...,RyanBernhart_Wx,36,"Maricopa, AZ",,,sad summer wildfire subsequent flash flooding ...
1,2019-11-19 18:15:16,huge flow country wildfire 'doubled #scotland'...,CWL_BeGreen,1043,,,[(36:45) => #scotland],huge flow country wildfire doubled emissions m...
2,2019-11-19 18:14:42,suburban sprawl and climate change complicate ...,babday,2,,,,suburban sprawl climate change complicate wild...
3,2019-11-19 18:13:39,"in 2018, over 8 million acres were burned by w...",FavaFinancialGr,188,"Totowa, New Jersey",,,million acres burned wildfire working american...
4,2019-11-19 18:13:37,"in 2018, over 8 million acres were burned by w...",ElkAgencyIns,12,"Elk River, Minnesota",,,million acres burned wildfire working american...


## Trying GSDMM

In [203]:
from gsdmm import MovieGroupProcess

In [296]:
# Train STTM model
# Init of the Gibbs Sampling Dirichlet Mixture Model algorithm
# K = number of potential topic (which we don't know a priori)
# alpha = 
# beta = 
# n_iters = number of iterations to 
mgp = MovieGroupProcess(K=15, alpha=0.1, beta=0.1, n_iters=30)
vocab = words
n_terms = len(vocab)
y = mgp.fit(tokens, n_terms)
# Save model
with open('trained_models/gsdmm_rec.pickle', 'wb') as f:
    pickle.dump(mgp, f)
    f.close()

In stage 0: transferred 7249 clusters with 15 clusters populated
In stage 1: transferred 3534 clusters with 15 clusters populated
In stage 2: transferred 2184 clusters with 15 clusters populated
In stage 3: transferred 1687 clusters with 15 clusters populated
In stage 4: transferred 1406 clusters with 15 clusters populated
In stage 5: transferred 1290 clusters with 15 clusters populated
In stage 6: transferred 1272 clusters with 15 clusters populated
In stage 7: transferred 1257 clusters with 15 clusters populated
In stage 8: transferred 1164 clusters with 15 clusters populated
In stage 9: transferred 1144 clusters with 15 clusters populated
In stage 10: transferred 1143 clusters with 15 clusters populated
In stage 11: transferred 1110 clusters with 15 clusters populated
In stage 12: transferred 1090 clusters with 15 clusters populated
In stage 13: transferred 1099 clusters with 15 clusters populated
In stage 14: transferred 1067 clusters with 15 clusters populated
In stage 15: transfe

In [218]:
from collections import defaultdict

In [293]:
def top_words(num_top_words, topic_names = None):
    cluster_lst =[mgp.choose_best_label(x) for x in tokens]
    dict_clusters = defaultdict(list)
    for i,v in enumerate(cluster_lst):
        dict_clusters[str(v[0])].append(df_wf.loc[i,'clean_tweet'])
    dict_counts = defaultdict(int)
    ix = 0
    for key, value in dict_clusters.items():
        try:
            doc_word, words, id2word, cv_tfidf = tfidf(value)
            new = doc_word.toarray().sum(axis = 0)
        except:
            new = 'empty'
        if new == 'empty':
            print("\nTopic ", key)
        elif not topic_names or not topic_names[ix]:
            print("\nTopic ", key)
            print(", ".join([words[i]
                        for i in new.argsort()[:-num_top_words - 1:-1]]))
        else:
            print('\nTopic: {}'.format(topic_names[ix]))
            print(", ".join([words[i]
                        for i in new.argsort()[:-num_top_words - 1:-1]]))
        ix += 1

In [209]:
mgp.cluster_doc_count

[0, 852, 1375, 0, 0, 0, 0, 0, 6398, 258]

In [294]:
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)
print('*'*20)
# Topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[-10:][::-1]
print('Most important clusters (by number of docs inside):', top_index)
print('*'*20)
# Show the top 5 words in term frequency for each cluster 

top_words(15)

Number of documents per topic : [ 635 1367  759  615 1286  645  761  858 1597  360]
********************
Most important clusters (by number of docs inside): [8 1 4 7 6 2 5 0 3 9]
********************


  
  
  
  
  



Topic  4
home prevention, tips home, tips home prevention, climate change, pg amp, long term, air quality, social media, stay safe, risk analysis, california wildfires, red cross, issues california, southern california, mitigation awards

Topic  3
doubled scotland, flow country, scotland emissions, doubled scotland emissions, huge flow, huge flow country, country doubled, flow country doubled, country doubled scotland, blackout crisis, california blackout crisis, california blackout, blackout crisis blame, crisis blame, bbc news

Topic  5
kennedy peak, pg amp, update nov, national park, studios evacuated, nov martinez, update nov martinez, entering planetary, earth entering planetary, earth entering, evacuated nearby, studios evacuated nearby, bros studios evacuated, warner bros, warner bros studios

Topic  9
acres burned, burned working, million acres burned, million acres, help restore acres, help restore, restore acres, restore acres da, forests help, burned working american, fores

  
  
  
  



Topic  8
gender reveal, years ago, insight cards, social media, rick grimes, gender reveals, humor cards, climate change, integration prevention, stay safe, burn baby, christmas song, life cards, love life, pg amp

Topic  1
australia having, having cataclysmic, australia having cataclysmic, having cataclysmic season, cataclysmic season, trump regret syndrome, trump regret, regret syndrome, regret syndrome spreading, syndrome spreading, attendance business, rsvp ur competition, rsvp ur, business opportunity rsvp, business opportunity

Topic  7
pg amp, artificial intelligence, los angeles, company training, million acres, training artificial intelligence, training artificial, california million, california million acres, artificial intelligence scour, intelligence scour, year ago, acres forest, million acres forest, company training artificial

Topic  2
climate change, kamala harris, state emergency, costs climb, companies flagging risk, risk suppression, risk suppression costs, compani

  


In [297]:
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)
print('*'*20)
# Topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[-10:][::-1]
print('Most important clusters (by number of docs inside):', top_index)
print('*'*20)
# Show the top 5 words in term frequency for each cluster 

top_words(15)

Number of documents per topic : [ 665  943  449  812  780  614  499 1308  486  392  582  203  388  340
  422]
********************
Most important clusters (by number of docs inside): [ 7  1  3  4  0  5 10  6  8  2]
********************


  
  
  
  
  
  
  



Topic  0
climate change, new technology, planetary age, earth entering planetary, earth entering, entering planetary, companies flagging risk, companies flagging, flagging risk, risk suppression costs, suppression costs climb, costs climb, risk suppression, flagging risk suppression, suppression costs

Topic  6
doubled scotland, scotland emissions, doubled scotland emissions, flow country, huge flow country, huge flow, country doubled, flow country doubled, country doubled scotland, bbc news, news huge flow, news huge, bbc news huge, wine country, return california

Topic  8
pg amp, threat cut, gavin newsom, reduce risk, gov gavin newsom, gov gavin, newsom punches, gavin newsom punches, cut aid, threat cut aid, newsom punches trump, punches trump, southern california, punches trump threat, trump threat cut

Topic  11
working american forests, burned working, working american, help restore acres, help restore, forests help restore, forests help, million acres burned, restore acres, res

  
  
  
  
  
  
  
  
