# Monitoring changes in related words over time.

### This notebook shows how the words most closely related to a chosen search term change from one time window to the next.

In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.decomposition import PCA
from math import ceil

In [2]:
# Column names, in file order, for the header-less CSV read below.
columns = [
    'tweet_id', 'timestamp', 'tweet_text', 'user_id',
    'tweet_coords', 'tweet_coords_list', 'tweet_long', 'tweet_lat', 'location',
    'enc_url', 'tweet_lang', 'hashtags',
]

# Load every tweet, parsing the timestamp column as datetimes and using it as
# the index so that later cells can select tweets by time.
tweet_full = pd.read_csv(
    r'./tweetCoords.csv',
    names=columns,
    header=None,
    index_col='timestamp',
    parse_dates=['timestamp'],
    infer_datetime_format=True,
)

In [3]:
# Tweet-aware tokenizer: lower-cases tokens, removes @handles and shortens
# long runs of a repeated character.
tweet_tokenizer = TweetTokenizer(
    preserve_case=False,
    strip_handles=True,
    reduce_len=True,
)

# NLTK's English stop-word list; these tokens are filtered out of each tweet
# before the embedding model is trained.
tweet_stops = stopwords.words('english')

What's the word we're comparing similarity to?

In [36]:
# Term whose nearest neighbours in the embedding space are tracked per window.
search_term = 'storm'

From here on, we iterate over consecutive time windows.

In [37]:
# First and last window start times, and the step between consecutive windows.
tweet_date = pd.Timestamp("2017-09-08 00:00:00")
end_date = pd.Timestamp("2017-09-15 00:00:00")
date_delta = pd.Timedelta("24HR")

# Starts empty; the loop cell adds one (term, score) column pair per window.
related_words = pd.DataFrame()

In [38]:
# Fraction of a window's tweet count used as the Word2Vec ``min_count`` threshold.
min_count_fraction = .001
# Set membership keeps the per-token stop-word test O(1); built once, outside the loop.
stop_words = set(tweet_stops)
# One two-column frame (term, score) per window. They are joined once after the loop,
# so re-running this cell rebuilds ``related_words`` instead of appending duplicate columns.
terms_by_day = []

for tweet_day in pd.date_range(start = tweet_date, end = end_date, freq = date_delta):

    # Half-open window [tweet_day, tweet_day + date_delta). Label slicing with ``.loc[a:b]``
    # includes both endpoints, so a tweet stamped exactly on a boundary would otherwise be
    # counted in two consecutive windows. A boolean mask also works on an unsorted index.
    in_window = (tweet_full.index >= tweet_day) & (tweet_full.index < tweet_day + date_delta)
    # Rows with missing text cannot be tokenized, so drop them up front.
    # NOTE: ``tweet_text`` is deliberately left holding the last window after the loop;
    # the spot-check cells further down read it.
    tweet_text = tweet_full.loc[in_window, "tweet_text"].dropna()

    print(str(tweet_day)+": "+str(len(tweet_text))+" tweets") # this line is just here for diagnostic purposes.

    if tweet_text.empty:
        # Word2Vec cannot build a vocabulary from an empty corpus.
        print(str(tweet_day)+": no tweets in this window, skipping")
        continue

    tweets_tokens = tweet_text.apply(lambda x: [word for word in tweet_tokenizer.tokenize(x) if word not in stop_words])
    # Keep only words that occur at least this many times (a fixed fraction of the tweet count).
    min_count = ceil(len(tweet_text) * min_count_fraction)
    # Skip-gram (sg=1) embeddings trained on this window's tweets only.
    vector_model = Word2Vec(tweets_tokens, min_count=min_count, sg=1, window=4)

    if search_term not in vector_model.wv:
        # The term was absent or below ``min_count`` in this window; ``most_similar`` would raise KeyError.
        print(str(tweet_day)+": '"+search_term+"' not in vocabulary, skipping")
        continue

    # ``most_similar`` yields (word, similarity) pairs; the word column is labelled with the window start.
    terms_from_range = pd.DataFrame.from_records(vector_model.wv.most_similar(search_term),columns=[tweet_day,"Score"])
    terms_by_day.append(terms_from_range)

# Side-by-side join: two columns per window, in chronological order.
related_words = pd.concat(terms_by_day, axis=1) if terms_by_day else pd.DataFrame()

2017-09-08 00:00:00: 20164 tweets
2017-09-09 00:00:00: 20758 tweets
2017-09-10 00:00:00: 22953 tweets
2017-09-11 00:00:00: 16322 tweets
2017-09-12 00:00:00: 15109 tweets
2017-09-13 00:00:00: 16248 tweets
2017-09-14 00:00:00: 16514 tweets
2017-09-15 00:00:00: 17743 tweets


In [39]:
# Full result: for each window, a column of related terms (labelled with the
# window's start time) followed by a "Score" column with their similarity scores.
related_words

Unnamed: 0,2017-09-08 00:00:00,Score,2017-09-09 00:00:00,Score.1,2017-09-10 00:00:00,Score.2,2017-09-11 00:00:00,Score.3,2017-09-12 00:00:00,Score.4,2017-09-13 00:00:00,Score.5,2017-09-14 00:00:00,Score.6,2017-09-15 00:00:00,Score.7
0,calm,0.936064,calm,0.927173,tropical,0.877387,surge,0.944283,damage,0.98874,office,0.972699,electricity,0.996038,helping,0.980871
1,#hurricaneirma,0.934504,surge,0.866847,surge,0.856434,hurricane,0.930984,hit,0.987096,trees,0.972546,went,0.99567,dinner,0.978501
2,prep,0.920048,strong,0.859473,media,0.838925,tropical,0.909733,lucky,0.987071,keys,0.967498,thanks,0.995544,days,0.977953
3,evacuating,0.907014,saturday,0.856436,#mfl,0.835683,#jax,0.899832,home,0.986378,church,0.954924,though,0.993982,open,0.977447
4,boarded,0.905606,coast,0.854741,emergency,0.833033,media,0.893433,day,0.983006,group,0.952785,able,0.993279,class,0.977212
5,strong,0.904527,#hurricaneirma2017,0.851909,radio,0.82489,emergency,0.867221,beautiful,0.982517,pool,0.951897,blessed,0.993239,damage,0.976585
6,preparation,0.903605,hits,0.843962,#jax,0.81573,flood,0.864031,clean,0.982391,aftermath,0.951302,support,0.99298,thru,0.973948
7,path,0.899562,#irmageddon,0.842036,keys,0.812176,public,0.859681,neighborhood,0.982319,sun,0.950247,yet,0.992978,school,0.973565
8,preparing,0.899037,forecast,0.839418,#mlb,0.810452,call,0.84914,yesterday,0.980564,beautiful,0.950134,away,0.992833,hours,0.972584
9,beautiful,0.897466,floridians,0.833346,force,0.802016,mngr,0.841656,help,0.978959,#irmarecovery,0.948391,gets,0.991959,party,0.972425


In [40]:
# Columns alternate term / score, so taking every second column starting from
# the first keeps only the related terms for each window.
related_words.iloc[:, ::2]

Unnamed: 0,2017-09-08 00:00:00,2017-09-09 00:00:00,2017-09-10 00:00:00,2017-09-11 00:00:00,2017-09-12 00:00:00,2017-09-13 00:00:00,2017-09-14 00:00:00,2017-09-15 00:00:00
0,calm,calm,tropical,surge,damage,office,electricity,helping
1,#hurricaneirma,surge,surge,hurricane,hit,trees,went,dinner
2,prep,strong,media,tropical,lucky,keys,thanks,days
3,evacuating,saturday,#mfl,#jax,home,church,though,open
4,boarded,coast,emergency,media,day,group,able,class
5,strong,#hurricaneirma2017,radio,emergency,beautiful,pool,blessed,damage
6,preparation,hits,#jax,flood,clean,aftermath,support,thru
7,path,#irmageddon,keys,public,neighborhood,sun,yet,school
8,preparing,forecast,#mlb,call,yesterday,beautiful,away,hours
9,beautiful,floridians,force,mngr,help,#irmarecovery,gets,party


In [55]:
# Spot check on the last window processed by the loop (``tweet_text`` still holds it):
# tweets containing both "storm" and "helping" as whole words.
has_storm = tweet_text.str.contains(r"\bstorm\b", regex=True)
has_helping = tweet_text.str.contains(r"\bhelping\b", regex=True)
tweet_text[has_storm & has_helping].values

array(['A ray of sunshine in the storm   : @MiamiHEAT    amp  @ThisIsUD are helping some of #HurricaneIrma  s most vulnerable v  https://t.co/DQAtZfZY8m'],
      dtype=object)

In [58]:
# Spot check on the last window processed by the loop (``tweet_text`` still holds it):
# tweets containing both "storm" and "open" as whole words.
has_storm = tweet_text.str.contains(r"\bstorm\b", regex=True)
has_open = tweet_text.str.contains(r"\bopen\b", regex=True)
tweet_text[has_storm & has_open].values

array(['Christ  Blue did great during storm. However others not so lucky. My best to all recovering. We have power but no AC. Will open soon ',
       'All the Cheetah locations are back open after the storm   Great place to watch the fight tomorrow  https://t.co/XubXk9M2Bd'],
      dtype=object)