# Monitoring changes in related words over time.

### This notebook shows how the words most closely related to a chosen search term change from one time window (time delta) to the next.

In [1]:
import re
from math import ceil

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn.decomposition import PCA

In [2]:
# Column layout of the header-less tweet export, in file order.
columns = [
    'tweet_id', 'timestamp', 'tweet_text', 'user_id',
    'tweet_coords', 'tweet_coords_list', 'tweet_long', 'tweet_lat',
    'location', 'enc_url', 'tweet_lang', 'hashtags',
]

# Read every tweet; column 1 ('timestamp') is parsed as datetimes and becomes
# the index, which is what the per-day loop slices on.
read_options = dict(
    header=None,
    names=columns,
    parse_dates=[1],
    infer_datetime_format=True,
    index_col='timestamp',
)
tweet_full = pd.read_csv(r'./tweetCoords.csv', **read_options)

In [3]:
# English stop words, stored as a frozenset so that the per-token
# "word not in tweet_stops" filter is a hash lookup rather than a list scan.
tweet_stops = frozenset(stopwords.words('english'))

# Tweet-aware tokenizer: removes @handles, lower-cases tokens and shortens
# exaggerated character runs so that spelling variants share one token.
tweet_tokenizer = TweetTokenizer(strip_handles=True, preserve_case=False, reduce_len=True)

What's the word we're comparing similarity to?

In [4]:
# Query term: each day's model is asked for the words most similar to this one.
search_term = 'storm'

Starting here, we iterate over consecutive time windows.

In [5]:
# Time span to analyse: start of the first and of the last window.
tweet_date = pd.to_datetime("2017-09-08 00:00:00")
end_date = pd.to_datetime("2017-09-15 00:00:00")

# Length of one window, also used as the step between window starts.
date_delta = pd.Timedelta("24HR")

# Empty frame that will receive the per-window "related words" columns.
related_words = pd.DataFrame()

In [6]:
# Train one skip-gram Word2Vec model per time window and record, for each
# window, the words most similar to `search_term` with their similarity scores.
# After the loop `tweet_text` and `vector_model` still refer to the last window
# that was processed; later cells inspect them.
daily_frames = []  # one (word, score) table per window, joined once after the loop
one_tick = pd.Timedelta(1, unit="ns")

for tweet_day in pd.date_range(start=tweet_date, end=end_date, freq=date_delta):

    # Label slices on a DatetimeIndex include BOTH endpoints, so stop one tick
    # before the next window begins; otherwise a tweet stamped exactly at the
    # boundary would be counted in two consecutive windows.
    window_end = tweet_day + date_delta - one_tick
    tweet_text = tweet_full.loc[tweet_day:window_end, "tweet_text"]
    if tweet_text.empty:
        # Word2Vec cannot build a vocabulary from an empty corpus.
        print(str(tweet_day) + ": no tweets in this window, skipping")
        continue

    # Word2Vec ignores words seen fewer than `min_count` times, so a word must
    # occur at least this often (0.1% of the window's tweet count, rounded up).
    min_count = ceil(len(tweet_text) * .001)
    print(str(tweet_day)+": "+str(len(tweet_text))+" tweets ("+str(min_count)+" occurrence threshold)") # this line is just here for diagnostic purposes.

    # Tokenize each tweet and drop stop words.
    tweets_tokens = tweet_text.apply(
        lambda x: [word for word in tweet_tokenizer.tokenize(x) if word not in tweet_stops])

    # NOTE(review): no seed is passed, so neighbours and scores can differ
    # between runs of this cell.
    vector_model = Word2Vec(tweets_tokens, min_count=min_count, sg=1, window=4)

    # most_similar raises KeyError for a word that did not reach `min_count`
    # in this window; report it and move on instead of aborting the whole loop.
    if search_term not in vector_model.wv:
        print(str(tweet_day) + ": '" + search_term + "' is below the occurrence threshold, skipping")
        continue

    terms_from_range = pd.DataFrame.from_records(
        vector_model.wv.most_similar(search_term), columns=[tweet_day, "Score"])
    daily_frames.append(terms_from_range)

# Join all windows side by side in one step. Rebuilding `related_words` from
# scratch keeps this cell idempotent: re-running it no longer appends a second
# copy of every column to the existing frame.
related_words = pd.concat(daily_frames, axis=1) if daily_frames else pd.DataFrame()

2017-09-08 00:00:00: 20164 tweets (21 min. occur.)
2017-09-09 00:00:00: 20758 tweets (21 min. occur.)
2017-09-10 00:00:00: 22953 tweets (23 min. occur.)
2017-09-11 00:00:00: 16322 tweets (17 min. occur.)
2017-09-12 00:00:00: 15109 tweets (16 min. occur.)
2017-09-13 00:00:00: 16248 tweets (17 min. occur.)
2017-09-14 00:00:00: 16514 tweets (17 min. occur.)
2017-09-15 00:00:00: 17743 tweets (18 min. occur.)


In [7]:
# Shape of the embedding vector for the search term in `vector_model`, i.e. the
# model left behind by the last loop iteration. Uses `search_term` rather than
# a repeated literal so this check follows the configured word.
vector_model.wv.get_vector(search_term).shape

(100,)

In [8]:
# Similarity score between the search term and "rain" in the last trained
# model; `search_term` is used so the probe follows the configured word.
vector_model.wv.similarity(search_term, "rain")

0.42750670674483926

In [15]:
# Display the vocabulary mapping of `vector_model` (the model left over from
# the last loop iteration).
# NOTE(review): this shows the entire mapping; presumably only meant as a quick
# check of which tokens survived the occurrence threshold - confirm.
vector_model.wv.vocab

{'closed': <gensim.models.keyedvectors.Vocab at 0x10a78e588>,
 '-': <gensim.models.keyedvectors.Vocab at 0x10a78e630>,
 '75': <gensim.models.keyedvectors.Vocab at 0x10a78e668>,
 'nb': <gensim.models.keyedvectors.Vocab at 0x10a78e908>,
 'us': <gensim.models.keyedvectors.Vocab at 0x10a78e9b0>,
 '#traffic': <gensim.models.keyedvectors.Vocab at 0x10a78ea58>,
 'lol': <gensim.models.keyedvectors.Vocab at 0x10a78ea90>,
 '...': <gensim.models.keyedvectors.Vocab at 0x10a78eac8>,
 'bro': <gensim.models.keyedvectors.Vocab at 0x10a78eb00>,
 '@': <gensim.models.keyedvectors.Vocab at 0x10a78eba8>,
 'tonight': <gensim.models.keyedvectors.Vocab at 0x10a78ebe0>,
 '.': <gensim.models.keyedvectors.Vocab at 0x10a78ec18>,
 'always': <gensim.models.keyedvectors.Vocab at 0x10a78ec50>,
 'good': <gensim.models.keyedvectors.Vocab at 0x10a78ec88>,
 'see': <gensim.models.keyedvectors.Vocab at 0x10a78e4a8>,
 'working': <gensim.models.keyedvectors.Vocab at 0x10a78e4e0>,
 'hard': <gensim.models.keyedvectors.Vocab at

In [9]:
# Show the accumulated table: for every window a column of related words
# (headed by the window's start timestamp) followed by a "Score" column.
related_words

Unnamed: 0,2017-09-08 00:00:00,Score,2017-09-09 00:00:00,Score.1,2017-09-10 00:00:00,Score.2,2017-09-11 00:00:00,Score.3,2017-09-12 00:00:00,Score.4,2017-09-13 00:00:00,Score.5,2017-09-14 00:00:00,Score.6,2017-09-15 00:00:00,Score.7
0,calm,0.946281,calm,0.943091,tropical,0.875614,surge,0.943424,clean,0.990157,office,0.968514,thanks,0.997291,damage,0.993091
1,prep,0.920893,surge,0.887837,media,0.8508,tropical,0.915255,hit,0.989387,pool,0.963841,came,0.995409,helping,0.986727
2,#hurricaneirma,0.902258,strong,0.885639,surge,0.84954,hurricane,0.908068,neighborhood,0.98855,morning,0.96377,yet,0.994434,made,0.982256
3,closed,0.899946,moving,0.854483,#mfl,0.842923,#jax,0.898779,yesterday,0.984505,keys,0.961589,internet,0.993909,survived,0.981104
4,surge,0.896471,comes,0.843686,emergency,0.827143,public,0.88781,little,0.983867,beautiful,0.961508,hours,0.992944,waiting,0.977319
5,boarded,0.894355,coast,0.841448,keys,0.822264,media,0.87227,damage,0.983343,church,0.959983,soon,0.992583,since,0.977078
6,morning,0.885743,#irmageddon,0.840087,#jax,0.821312,emergency,0.872264,lucky,0.98251,survived,0.95626,running,0.992406,clean,0.976272
7,preparing,0.884445,riding,0.83891,calm,0.818328,state,0.871991,house,0.981776,trees,0.955672,friday,0.992199,kick,0.975968
8,beautiful,0.883107,heading,0.838861,#mlb,0.815823,call,0.87035,keys,0.981255,aftermath,0.95537,coming,0.991928,arrivals,0.97592
9,prepared,0.881543,starting,0.837706,hurricane,0.815274,flood,0.84939,survived,0.980314,city,0.952378,full,0.991785,wonderful,0.975642


In [10]:
# Keep every second column starting with the first, i.e. only the word columns
# (each window contributes a word column followed by its "Score" column).
related_words.iloc[:, ::2]

Unnamed: 0,2017-09-08 00:00:00,2017-09-09 00:00:00,2017-09-10 00:00:00,2017-09-11 00:00:00,2017-09-12 00:00:00,2017-09-13 00:00:00,2017-09-14 00:00:00,2017-09-15 00:00:00
0,calm,calm,tropical,surge,clean,office,thanks,damage
1,prep,surge,media,tropical,hit,pool,came,helping
2,#hurricaneirma,strong,surge,hurricane,neighborhood,morning,yet,made
3,closed,moving,#mfl,#jax,yesterday,keys,internet,survived
4,surge,comes,emergency,public,little,beautiful,hours,waiting
5,boarded,coast,keys,media,damage,church,soon,since
6,morning,#irmageddon,#jax,emergency,lucky,survived,running,clean
7,preparing,riding,calm,state,house,trees,friday,kick
8,beautiful,heading,#mlb,call,keys,aftermath,coming,arrivals
9,prepared,starting,hurricane,flood,survived,city,full,wonderful


In [11]:
# Raw tweets from the last processed window that mention both the search term
# and "damage" as whole words. Matching ignores case because the model was
# trained on lower-cased tokens, so e.g. "Storm" counts as the same word.
mentions_search_term = tweet_text.str.contains(r"\b" + re.escape(search_term) + r"\b", case=False)
mentions_damage = tweet_text.str.contains(r"\bdamage\b", case=False)
tweet_text[mentions_search_term & mentions_damage].values

array(['Miss Lawton talks about storm damage @ Prime F. Osborn III Convention Center https://t.co/eNgbX55jsg'],
      dtype=object)

In [12]:
# Raw tweets from the last processed window that mention both the search term
# and "helping" as whole words. Matching ignores case because the model was
# trained on lower-cased tokens, so e.g. "Storm" counts as the same word.
mentions_search_term = tweet_text.str.contains(r"\b" + re.escape(search_term) + r"\b", case=False)
mentions_helping = tweet_text.str.contains(r"\bhelping\b", case=False)
tweet_text[mentions_search_term & mentions_helping].values

array(['A ray of sunshine in the storm   : @MiamiHEAT    amp  @ThisIsUD are helping some of #HurricaneIrma  s most vulnerable v  https://t.co/DQAtZfZY8m'],
      dtype=object)