# Monitoring changes in related words over time.

### This notebook shows how the words most closely related to a chosen search term change across successive time windows.

In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.decomposition import PCA
from math import ceil

In [2]:
# Column names for the headerless tweet export, listed in file order.
columns = [
    'tweet_id', 'timestamp', 'tweet_text', 'user_id',
    'tweet_coords', 'tweet_coords_list', 'tweet_long', 'tweet_lat', 'location',
    'enc_url', 'tweet_lang', 'hashtags',
]

# Parse the second column (the timestamp) as datetimes and use it as the index,
# which is what the time-window slicing further down relies on.
csv_options = dict(
    header=None,
    names=columns,
    parse_dates=[1],
    infer_datetime_format=True,
    index_col='timestamp',
)
tweet_full = pd.read_csv(r'./tweetCoords.csv', **csv_options)

In [3]:
# English stop words, held in a set so the per-token `not in tweet_stops` check
# performed while tokenizing is a constant-time lookup rather than a list scan.
tweet_stops = set(stopwords.words('english'))
# Tokenizer that drops @handles, lowercases tokens, and shortens elongated character runs.
tweet_tokenizer = TweetTokenizer(strip_handles=True,preserve_case=False,reduce_len=True)

What's the word we're comparing similarity to?

In [4]:
# Term whose most similar words are looked up in each time window's model.
search_term = "storm"

From here on, we iterate over consecutive time windows.

In [5]:
# Step size and bounds of the date range iterated over below.
date_delta = pd.Timedelta("24HR")
tweet_date = pd.Timestamp("2017-09-08 00:00:00")
end_date = pd.Timestamp("2017-09-15 00:00:00")

# Starts empty; the per-window similarity tables are added to it column-wise.
related_words = pd.DataFrame()

In [6]:
# Train one skip-gram Word2Vec model per time window and record the terms most
# similar to `search_term` in each window.
#
# Per-window tables are collected in a list and concatenated once after the loop.
# This avoids repeatedly copying a growing DataFrame and makes the cell idempotent:
# re-running it rebuilds `related_words` instead of appending duplicate columns.
window_frames = []
for tweet_day in pd.date_range(start = tweet_date, end = end_date, freq = date_delta):

    # .loc label slicing includes BOTH endpoints, so stop one nanosecond short of the
    # next window's start; otherwise a tweet stamped exactly on the boundary would be
    # counted in two consecutive windows.
    window_end = tweet_day + date_delta - pd.Timedelta(1, unit="ns")
    tweet_text = tweet_full.loc[tweet_day:window_end,"tweet_text"]
    min_count = ceil(len(tweet_text) * .001) # minimum word frequency: 0.1% of the window's tweet count, rounded up.
    print(str(tweet_day)+": "+str(len(tweet_text))+" tweets ("+str(min_count)+" min. occur.)") # this line is just here for diagnostic purposes.

    # Nothing to train on: skip the window instead of letting Word2Vec fail on an empty corpus.
    if tweet_text.empty:
        print(str(tweet_day)+": no tweets in this window, skipping.")
        continue

    # Tokenize each tweet and drop stop words.
    tweets_tokens = tweet_text.apply(lambda x: [word for word in tweet_tokenizer.tokenize(x) if word not in tweet_stops])

    vector_model = Word2Vec(tweets_tokens, min_count=min_count, sg=1, window=4)
    # NOTE(review): the 2-D PCA projection is not consumed by any cell visible here;
    # it is kept in case a later cell plots `result` - confirm before removing.
    word_matrix = vector_model.wv[vector_model.wv.vocab]
    pca = PCA(n_components=2)
    result = pca.fit_transform(word_matrix)

    # The search term can fall below min_count in a given window; most_similar would
    # then raise KeyError and abort the whole run, so report it and move on.
    if search_term not in vector_model.wv:
        print(str(tweet_day)+": '"+search_term+"' is not in this window's vocabulary, skipping.")
        continue

    # One two-column table per window: the similar terms (column named after the
    # window start) and their similarity scores.
    terms_from_range = pd.DataFrame.from_records(vector_model.wv.most_similar(search_term),columns=[tweet_day,"Score"])
    window_frames.append(terms_from_range)

# Place the per-window tables side by side; fall back to an empty frame if every window was skipped.
related_words = pd.concat(window_frames,axis=1) if window_frames else pd.DataFrame()

2017-09-08 00:00:00: 20164 tweets (21 min. occur.)
2017-09-09 00:00:00: 20758 tweets (21 min. occur.)
2017-09-10 00:00:00: 22953 tweets (23 min. occur.)
2017-09-11 00:00:00: 16322 tweets (17 min. occur.)
2017-09-12 00:00:00: 15109 tweets (16 min. occur.)
2017-09-13 00:00:00: 16248 tweets (17 min. occur.)
2017-09-14 00:00:00: 16514 tweets (17 min. occur.)
2017-09-15 00:00:00: 17743 tweets (18 min. occur.)


In [7]:
# Dimensionality of the embedding for the search term. `vector_model` is whatever
# model the loop above trained last. Uses `search_term` rather than a repeated
# literal so this cell stays in sync if the term is changed.
vector_model.wv.get_vector(search_term).shape

(100,)

In [8]:
# Spot-check: similarity between the search term and "rain" in the last model
# trained by the loop above. Uses `search_term` rather than a repeated literal so
# this cell stays in sync if the term is changed.
vector_model.wv.similarity(search_term,"rain")

0.4087950509641167

In [9]:
# Show the per-window results side by side: each timestamp column lists the terms
# most similar to the search term, and the Score column after it holds their scores.
related_words

Unnamed: 0,2017-09-08 00:00:00,Score,2017-09-09 00:00:00,Score.1,2017-09-10 00:00:00,Score.2,2017-09-11 00:00:00,Score.3,2017-09-12 00:00:00,Score.4,2017-09-13 00:00:00,Score.5,2017-09-14 00:00:00,Score.6,2017-09-15 00:00:00,Score.7
0,calm,0.939268,calm,0.917164,tropical,0.879456,surge,0.956646,clean,0.990171,office,0.968487,yet,0.995299,damage,0.98706
1,#hurricaneirma,0.918313,surge,0.901439,surge,0.879012,hurricane,0.940288,damage,0.987693,keys,0.958086,everything,0.995188,dinner,0.985004
2,prep,0.908281,moving,0.864993,media,0.860002,tropical,0.922494,night,0.983459,pool,0.954473,stay,0.994914,safe,0.983374
3,boarded,0.901334,starting,0.863811,#mfl,0.843112,#jax,0.904742,neighborhood,0.983269,city,0.953033,bless,0.994528,made,0.983272
4,surge,0.897785,bands,0.860578,emergency,0.831926,media,0.903248,hit,0.982942,trees,0.953003,coming,0.993859,helping,0.982448
5,preparing,0.894898,strong,0.858571,radio,0.826941,emergency,0.902743,business,0.981624,church,0.951759,amazing,0.993832,open,0.981105
6,wonderful,0.892834,coast,0.852048,keys,0.818169,public,0.871364,lucky,0.980107,morning,0.950282,car,0.993646,school,0.979846
7,closed,0.889641,emergency,0.848531,station,0.816247,flood,0.871241,mess,0.978858,welcome,0.946896,soon,0.993372,needs,0.979159
8,evacuating,0.885936,winds,0.846713,#mlb,0.811979,call,0.856584,home,0.978282,restaurant,0.946679,though,0.993266,ride,0.97907
9,preparation,0.885911,outer,0.842498,#jax,0.807619,#tbw,0.851716,live,0.977298,downtown,0.944685,thanks,0.99227,party,0.978826


In [10]:
# Keep every second column starting from the first, i.e. the term columns without their scores.
related_words.iloc[:, ::2]

Unnamed: 0,2017-09-08 00:00:00,2017-09-09 00:00:00,2017-09-10 00:00:00,2017-09-11 00:00:00,2017-09-12 00:00:00,2017-09-13 00:00:00,2017-09-14 00:00:00,2017-09-15 00:00:00
0,calm,calm,tropical,surge,clean,office,yet,damage
1,#hurricaneirma,surge,surge,hurricane,damage,keys,everything,dinner
2,prep,moving,media,tropical,night,pool,stay,safe
3,boarded,starting,#mfl,#jax,neighborhood,city,bless,made
4,surge,bands,emergency,media,hit,trees,coming,helping
5,preparing,strong,radio,emergency,business,church,amazing,open
6,wonderful,coast,keys,public,lucky,morning,car,school
7,closed,emergency,station,flood,mess,welcome,soon,needs
8,evacuating,winds,#mlb,call,home,restaurant,though,ride
9,preparation,outer,#jax,#tbw,live,downtown,thanks,party


In [20]:
# Tweets from the last window processed by the loop that mention both "storm" and
# "damage" as whole words. The tokenizer lowercases tokens before training, so the
# lookup is case-insensitive as well; a case-sensitive match would miss tweets
# such as "Storm damage ...".
mentions_storm = tweet_text.str.contains(r"\bstorm\b",regex=True,case=False)
mentions_damage = tweet_text.str.contains(r"\bdamage\b",regex=True,case=False)
tweet_text[mentions_storm & mentions_damage].values

array(['Miss Lawton talks about storm damage @ Prime F. Osborn III Convention Center https://t.co/eNgbX55jsg'],
      dtype=object)

In [21]:
# Tweets from the last window processed by the loop that mention both "storm" and
# "helping" as whole words. The tokenizer lowercases tokens before training, so the
# lookup is case-insensitive as well; a case-sensitive match would miss
# capitalised mentions.
mentions_storm = tweet_text.str.contains(r"\bstorm\b",regex=True,case=False)
mentions_helping = tweet_text.str.contains(r"\bhelping\b",regex=True,case=False)
tweet_text[mentions_storm & mentions_helping].values

array(['A ray of sunshine in the storm   : @MiamiHEAT    amp  @ThisIsUD are helping some of #HurricaneIrma  s most vulnerable v  https://t.co/DQAtZfZY8m'],
      dtype=object)