# Monitoring changes in related words over time.

### This notebook shows how the words most closely related to a chosen word change from one time window to the next.

In [17]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.decomposition import PCA
from math import ceil
import string
from itertools import combinations
import networkx as nx

In [2]:
# Column names for the header-less CSV, listed in file order.
columns = [
    'tweet_id', 'timestamp', 'tweet_text', 'user_id',
    'tweet_coords', 'tweet_coords_list', 'tweet_long', 'tweet_lat',
    'location', 'enc_url', 'tweet_lang', 'hashtags',
]

# Read every tweet; column 1 (the timestamp) is parsed as a datetime and
# becomes the index so that tweets can later be selected by time.
tweet_full = pd.read_csv(
    r'./tweetCoords.csv',
    names=columns,
    header=None,
    index_col='timestamp',
    parse_dates=[1],
    infer_datetime_format=True,
)

In [3]:
# Tweet-aware tokenizer, configured to lower-case tokens, strip @handles and
# shorten exaggerated character runs.
tweet_tokenizer = TweetTokenizer(
    preserve_case=False,
    reduce_len=True,
    strip_handles=True,
)

# English stop words that are filtered out of every tokenized tweet.
tweet_stops = stopwords.words('english')

What's the word we're comparing similarity to?

In [4]:
# Target word: each per-window model is asked for the terms most similar to it.
search_term = "storm"

Starting here, we iterate over consecutive time windows.

In [5]:
# Window parameters: the first and last window start times, and the length of
# each window (also used as the step between window starts).
tweet_date, end_date = (
    pd.to_datetime(stamp)
    for stamp in ("2017-09-08 00:00:00", "2017-09-15 00:00:00")
)
date_delta = pd.Timedelta("24HR")

# Accumulator for the per-window related-term tables; starts out empty.
related_words = pd.DataFrame()

In [6]:
# Train one skip-gram Word2Vec model per time window and record the terms most
# similar to `search_term` in each window.
#
# Names left behind for later cells: `related_words` (the combined table) plus
# `tweet_text` and `vector_model` from the last window that was processed.

stop_words = set(tweet_stops)  # set membership is O(1); the raw stop-word list is scanned linearly
window_tables = []             # one (term, Score) table per window, concatenated once after the loop

for tweet_day in pd.date_range(start = tweet_date, end = end_date, freq = date_delta):

    # Half-open window [tweet_day, tweet_day + date_delta). Label slicing with
    # .loc[a:b] includes BOTH endpoints, which would place a tweet stamped
    # exactly on a boundary in two consecutive windows. A boolean mask also
    # works when the index is not sorted.
    in_window = (tweet_full.index >= tweet_day) & (tweet_full.index < tweet_day + date_delta)
    tweet_text = tweet_full.loc[in_window,"tweet_text"].dropna()  # a missing text cannot be tokenized

    # A word must occur at least this many times (0.1% of the window's tweet
    # count, rounded up) to be kept in the model's vocabulary.
    min_count = ceil(len(tweet_text) * .001)
    print(str(tweet_day)+": "+str(len(tweet_text))+" tweets ("+str(min_count)+" occurrence threshold)") # this line is just here for diagnostic purposes.

    if tweet_text.empty:
        # Word2Vec cannot build a vocabulary from zero sentences.
        print(str(tweet_day)+": no tweets in this window, skipping")
        continue

    tweets_tokens = tweet_text.apply(lambda x: [word for word in tweet_tokenizer.tokenize(x) if word not in stop_words])

    vector_model = Word2Vec(tweets_tokens, min_count=min_count, sg=1, window=4)

    if search_term not in vector_model.wv:
        # most_similar() raises KeyError for an out-of-vocabulary word; skip
        # this window instead of aborting the whole run.
        print(str(tweet_day)+": '"+search_term+"' is not in the vocabulary, skipping")
        continue

    # most_similar() yields (term, score) pairs; the term column is labelled
    # with the window start so the combined table reads left to right in time.
    terms_from_range = pd.DataFrame.from_records(vector_model.wv.most_similar(search_term),columns=[tweet_day,"Score"])
    window_tables.append(terms_from_range)

# Build the table in a single concat. Assigning (rather than appending to the
# previous value) keeps this cell idempotent when it is re-run.
related_words = pd.concat(window_tables,axis=1) if window_tables else pd.DataFrame()

2017-09-08 00:00:00: 20164 tweets (21 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


2017-09-09 00:00:00: 20758 tweets (21 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


2017-09-10 00:00:00: 22953 tweets (23 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


2017-09-11 00:00:00: 16322 tweets (17 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


2017-09-12 00:00:00: 15109 tweets (16 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


2017-09-13 00:00:00: 16248 tweets (17 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


2017-09-14 00:00:00: 16514 tweets (17 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


2017-09-15 00:00:00: 17743 tweets (18 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


In [7]:
# Shape of the embedding for "storm" in vector_model, i.e. the model left over
# from the last iteration of the loop above.
vector_model.wv.get_vector("storm").shape

(100,)

In [8]:
# Similarity score between "storm" and "rain" in the most recently trained model.
vector_model.wv.similarity("storm","rain")

0.4524647

In [9]:
# Vocabulary of the most recently trained model: a mapping from each retained
# token to its Vocab entry.
vector_model.wv.vocab

{'closed': <gensim.models.keyedvectors.Vocab at 0x116a465f8>,
 '-': <gensim.models.keyedvectors.Vocab at 0x116a460b8>,
 '75': <gensim.models.keyedvectors.Vocab at 0x116a460f0>,
 'nb': <gensim.models.keyedvectors.Vocab at 0x116a46390>,
 'us': <gensim.models.keyedvectors.Vocab at 0x116a46438>,
 '#traffic': <gensim.models.keyedvectors.Vocab at 0x116a464e0>,
 'lol': <gensim.models.keyedvectors.Vocab at 0x116a46518>,
 '...': <gensim.models.keyedvectors.Vocab at 0x116a46550>,
 'bro': <gensim.models.keyedvectors.Vocab at 0x116a46588>,
 '@': <gensim.models.keyedvectors.Vocab at 0x116a46630>,
 'tonight': <gensim.models.keyedvectors.Vocab at 0x116a46668>,
 '.': <gensim.models.keyedvectors.Vocab at 0x116a466a0>,
 'always': <gensim.models.keyedvectors.Vocab at 0x116a466d8>,
 'good': <gensim.models.keyedvectors.Vocab at 0x116a46710>,
 'see': <gensim.models.keyedvectors.Vocab at 0x116a43c50>,
 'working': <gensim.models.keyedvectors.Vocab at 0x116a43ef0>,
 'hard': <gensim.models.keyedvectors.Vocab at

In [10]:
# Related terms and their similarity scores: one (term, Score) column pair per window.
related_words

Unnamed: 0,2017-09-08 00:00:00,Score,2017-09-09 00:00:00,Score.1,2017-09-10 00:00:00,Score.2,2017-09-11 00:00:00,Score.3,2017-09-12 00:00:00,Score.4,2017-09-13 00:00:00,Score.5,2017-09-14 00:00:00,Score.6,2017-09-15 00:00:00,Score.7
0,calm,0.942603,calm,0.904079,surge,0.882396,surge,0.956966,little,0.985379,trees,0.97444,coming,0.996403,damage,0.990491
1,#hurricaneirma,0.912989,surge,0.889002,tropical,0.87441,hurricane,0.926301,damage,0.984955,office,0.969558,prayers,0.995456,dinner,0.986405
2,prep,0.912867,coast,0.846407,#mfl,0.851504,tropical,0.915412,clean,0.984423,crew,0.966794,damage,0.994434,monday,0.984298
3,surge,0.900904,saturday,0.83781,media,0.849009,#jax,0.895759,beautiful,0.984024,beautiful,0.965348,thankful,0.993758,helping,0.983224
4,preparation,0.895158,strong,0.834855,emergency,0.84186,emergency,0.885126,survived,0.981107,keys,0.961949,thanks,0.993008,party,0.981091
5,closed,0.895115,update,0.832628,force,0.826428,media,0.874752,neighborhood,0.978003,affected,0.96143,hit,0.992436,services,0.980572
6,morning,0.890518,starting,0.829458,#jax,0.824605,flood,0.865326,yesterday,0.974235,store,0.960061,little,0.992361,beautiful,0.979855
7,beautiful,0.886049,west,0.828581,riding,0.820802,public,0.86433,hit,0.970064,took,0.958806,wonderful,0.992318,clean,0.979544
8,saturday,0.880122,keys,0.827929,keys,0.819796,calm,0.850048,mess,0.968926,clean,0.957923,evening,0.992303,action,0.979271
9,ready,0.877215,moving,0.826247,#mlb,0.81485,#tbw,0.841802,keys,0.968905,neighborhood,0.957394,big,0.991963,weekend,0.978543


In [11]:
# Keep only the even-positioned columns, i.e. the term columns; the
# odd-positioned columns hold the matching scores.
related_words.iloc[:,0::2]

Unnamed: 0,2017-09-08 00:00:00,2017-09-09 00:00:00,2017-09-10 00:00:00,2017-09-11 00:00:00,2017-09-12 00:00:00,2017-09-13 00:00:00,2017-09-14 00:00:00,2017-09-15 00:00:00
0,calm,calm,surge,surge,little,trees,coming,damage
1,#hurricaneirma,surge,tropical,hurricane,damage,office,prayers,dinner
2,prep,coast,#mfl,tropical,clean,crew,damage,monday
3,surge,saturday,media,#jax,beautiful,beautiful,thankful,helping
4,preparation,strong,emergency,emergency,survived,keys,thanks,party
5,closed,update,force,media,neighborhood,affected,hit,services
6,morning,starting,#jax,flood,yesterday,store,little,beautiful
7,beautiful,west,riding,public,hit,took,wonderful,clean
8,saturday,keys,keys,calm,mess,clean,evening,action
9,ready,moving,#mlb,#tbw,keys,neighborhood,big,weekend


In [12]:
# Tweets from the last loaded window that contain both "storm" and "damage"
# as whole words.
has_storm = tweet_text.str.contains(r"\bstorm\b", regex=True)
has_damage = tweet_text.str.contains(r"\bdamage\b", regex=True)
tweet_text[has_storm & has_damage].values

array(['Miss Lawton talks about storm damage @ Prime F. Osborn III Convention Center https://t.co/eNgbX55jsg'],
      dtype=object)

In [13]:
# Tweets from the last loaded window that contain both "storm" and "helping"
# as whole words.
has_storm = tweet_text.str.contains(r"\bstorm\b", regex=True)
has_helping = tweet_text.str.contains(r"\bhelping\b", regex=True)
tweet_text[has_storm & has_helping].values

array(['A ray of sunshine in the storm   : @MiamiHEAT    amp  @ThisIsUD are helping some of #HurricaneIrma  s most vulnerable v  https://t.co/DQAtZfZY8m'],
      dtype=object)

In [16]:
# Every unordered pair of vocabulary words from the most recently trained model.
vocabulary_words = list(vector_model.wv.vocab.keys())
word_pairs = list(combinations(vocabulary_words, 2))

In [24]:
# Empty undirected graph; the following cells add word-pair edges and
# vocabulary-word nodes to it.
tweet_graph = nx.Graph()

In [25]:
# Connect two words whenever the model scores their similarity above 0.85,
# storing that score as the edge weight.
for first_word, second_word in word_pairs:
    similarity = vector_model.wv.similarity(first_word, second_word)
    if not similarity > .85:
        continue
    tweet_graph.add_edge(first_word, second_word, weight=similarity)

In [26]:
# Add every vocabulary word as a node, so that words with no edge above the
# similarity threshold still appear in the graph.
tweet_graph.add_nodes_from(vector_model.wv.vocab.keys())

In [27]:
# Write the graph to a GEXF file in the working directory.
nx.write_gexf(tweet_graph,path=r'./tweet_graph.gexf')