# Monitoring changes in related words over time.

### This notebook will show how words related to a particular word will change over time deltas

In [24]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.decomposition import PCA
from math import ceil
import string
from itertools import combinations
import networkx as nx
import re

In [2]:
columns = ['tweet_id','timestamp','tweet_text','user_id',
           'tweet_coords','tweet_coords_list','tweet_long','tweet_lat','location',
           'enc_url','tweet_lang','hashtags']
tweet_full = pd.read_csv(r'./tweetCoords.csv',
                         header=None,
                         names=columns,
                         parse_dates=[1],
                         infer_datetime_format=True,
                         index_col='timestamp')

In [3]:
tweet_stops = stopwords.words('english')
tweet_tokenizer = TweetTokenizer(strip_handles=True,preserve_case=False,reduce_len=True)

writing a custom text cleaner. Currently configured to remove all punctuation, _except #_.

In [86]:
def clean_tweet(tweet):
#     convert case:
    tweet = tweet.lower()
#     remove urls:
    tweet = re.sub('https?://\S+','',tweet)
#     remove @mentions: 
    tweet = re.sub('@\w+','',tweet)
#     remove punctuation, but not hashtags:
    tweet = tweet.translate(tweet.maketrans('','',string.punctuation.replace("#","")))
#     remove punctuation, including hashtags:
#     tweet = tweet.translate(tweet.maketrans('','',string.punctuation))
    return tweet
    

What's the word we're comparing similarity to?

In [4]:
search_term = "irma"

Starting here, begin the iteration over times.

In [87]:
related_words = pd.DataFrame()
tweet_date = pd.to_datetime("2017-09-09 00:00:00")
date_delta = pd.Timedelta("24HR")
end_date = pd.to_datetime("2017-09-13 00:00:00")

In [79]:
top_num_words = 20

currently there is an incompatibility between gensim and numpy > 1.13

In [88]:
for tweet_day in pd.date_range(start = tweet_date, end = end_date, freq = date_delta):
    
    tweet_text = tweet_full.loc[tweet_day:tweet_day + date_delta,"tweet_text"]
    min_count = ceil(len(tweet_text) * .001) # words whose count must exceed some percent of the number of tweets.
    print(str(tweet_day)+": "+str(len(tweet_text))+" tweets ("+str(min_count)+" occurrence threshold)") # this line is just here for diagnostic purposes.
    
    tweets_tokens = tweet_text.apply(lambda x: [clean_tweet(word) for word in tweet_tokenizer.tokenize(x) if word not in tweet_stops])
    
    vector_model = Word2Vec(tweets_tokens, min_count=min_count, sg=1, window=4)
    word_matrix = vector_model.wv[vector_model.wv.vocab]
    pca = PCA(n_components=2)
    result = pca.fit_transform(word_matrix)
    terms_from_range = pd.DataFrame.from_records(vector_model.wv.most_similar(search_term,topn=top_num_words),columns=[tweet_day,"Score"])
    related_words = pd.concat([related_words,terms_from_range],axis=1)

2017-09-09 00:00:00: 20758 tweets (21 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


2017-09-10 00:00:00: 22953 tweets (23 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


2017-09-11 00:00:00: 16322 tweets (17 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


2017-09-12 00:00:00: 15109 tweets (16 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


2017-09-13 00:00:00: 16248 tweets (17 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


In [8]:
# vector_model.wv.get_vector("storm").shape

In [9]:
# vector_model.wv.similarity("storm","rain")

In [10]:
# vector_model.wv.vocab

In [89]:
related_words

Unnamed: 0,2017-09-09 00:00:00,Score,2017-09-10 00:00:00,Score.1,2017-09-11 00:00:00,Score.2,2017-09-12 00:00:00,Score.3,2017-09-13 00:00:00,Score.4
0,ready,0.914332,2017,0.75214,post,0.895802,post,0.953647,hurricane,0.93773
1,coming,0.894841,hits,0.737797,#hurricaneimra,0.863874,#irma,0.935578,#irma,0.932928
2,waiting,0.874294,4,0.737545,survived,0.831251,hurricane,0.926696,#hurricaneirma,0.909171
3,hurricane,0.870247,ready,0.737191,hotel,0.828027,#hurricane,0.921193,post,0.902726
4,hits,0.834883,#hurricanirma,0.735357,aftermath,0.825391,#hurricaneirma,0.918967,survived,0.88803
5,getting,0.824868,#hurricaineirma,0.733338,cat,0.822935,aftermath,0.907256,#hurricane,0.887565
6,prep,0.822447,hotel,0.731651,#afterirma,0.821335,survived,0.895524,home,0.87443
7,party,0.81782,#,0.726781,bye,0.81589,#florida,0.892683,hit,0.866483
8,#hurricanirma,0.815411,9,0.726467,affected,0.815633,#hurricaneirma2017,0.882732,#florida,0.864766
9,#irmahurricane2017,0.807195,cat,0.719833,#irmageddon,0.812782,#aftermath,0.863851,relief,0.851677


In [90]:
related_words.iloc[:,0::2]

Unnamed: 0,2017-09-09 00:00:00,2017-09-10 00:00:00,2017-09-11 00:00:00,2017-09-12 00:00:00,2017-09-13 00:00:00
0,ready,2017,post,post,hurricane
1,coming,hits,#hurricaneimra,#irma,#irma
2,waiting,4,survived,hurricane,#hurricaneirma
3,hurricane,ready,hotel,#hurricane,post
4,hits,#hurricanirma,aftermath,#hurricaneirma,survived
5,getting,#hurricaineirma,cat,aftermath,#hurricane
6,prep,hotel,#afterirma,survived,home
7,party,#,bye,#florida,hit
8,#hurricanirma,9,affected,#hurricaneirma2017,#florida
9,#irmahurricane2017,cat,#irmageddon,#aftermath,relief


In [13]:
tweet_text[(tweet_text.str.contains(r"\bstorm\b",regex=True)) & (tweet_text.str.contains(r"\bdamage\b",regex=True))].values

array(['We hope everyone weathered the storm with minimal damage. We are still without power,  we will  https://t.co/uL0OXckOde',
       'Driving around after the storm to assess the damage #HurricaneHooligans #ILoveNathan #Drive @  https://t.co/gb9Ngq5tNE',
       'This is the worst of our storm damage at our office. Our beautiful double Oak Tree came down in  https://t.co/GSvORD7x8A',
       'District leaders say in terms of damage, their schools dodged the blunt of the storm and had very minimal damage',
       'Blocked due to storm damage in #FortLauderdaleBeach on SR-A1A from Las Olas Blvd to Sunrise Blvd #SFLtraffic https://t.co/US2j7IOBPL',
       'The cleanup is well underway   some photos of the damage sustained during the storm. https://t.co/0rtame5JtV',
       'The storm is over and we are very grateful for having minimal impact or damage. We lost power  https://t.co/hOItayJS7u',
       '#BLP responding to our Naples are clients for after-storm damage assessments.  https://t

In [14]:
tweet_text[(tweet_text.str.contains(r"\bstorm\b",regex=True)) & (tweet_text.str.contains(r"\bhelping\b",regex=True))].values

array(['Proud @UF alumnus  and Gator Dad  helping out in #Hurricane #Irma Shelter during and after the storm   with  https://t.co/uW88pSLu70'],
      dtype=object)

In [15]:
word_pairs = list(combinations(list(vector_model.wv.vocab.keys()),2))

In [16]:
tweet_graph = nx.Graph()

In [17]:
for pair in word_pairs:
    edge_weight = vector_model.wv.similarity(pair[0],pair[1])
    if edge_weight > .85:
        tweet_graph.add_edge(pair[0],pair[1],weight=edge_weight)

<bound method Graph.get_edge_data of <networkx.classes.graph.Graph object at 0x11bd88828>>

In [18]:
tweet_graph.add_nodes_from(vector_model.wv.vocab.keys())

In [19]:
nx.write_gexf(tweet_graph,path=r'./tweet_graph.gexf')