# Monitoring changes in related words over time.

### This notebook will show how words related to a particular word will change over time deltas

In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.decomposition import PCA
from math import ceil
import string
from itertools import combinations
import networkx as nx
import re

In [2]:
columns = ['tweet_id','timestamp','tweet_text','user_id',
           'tweet_coords','tweet_coords_list','tweet_long','tweet_lat','location',
           'enc_url','tweet_lang','hashtags']
tweet_full = pd.read_csv(r'./tweetCoords.csv',
                         header=None,
                         names=columns,
                         parse_dates=[1],
                         infer_datetime_format=True,
                         index_col='timestamp')

In [3]:
tweet_stops = stopwords.words('english')
tweet_tokenizer = TweetTokenizer(strip_handles=True,preserve_case=False,reduce_len=True)

writing a custom text cleaner. Currently configured to remove all punctuation, _except #_.

In [94]:
def clean_tweet(tweet):
#     convert case:
    tweet = tweet.lower()
#     remove URLs:
    tweet = re.sub('https?://\S+','',tweet)
#     remove @mentions, including those with a leading '-' or '.' : 
    tweet = re.sub('[-\.]?@\w+','',tweet)
#     remove punctuation, but not hashtags:
    tweet = tweet.translate(tweet.maketrans('','',string.punctuation.replace("#","")))
#     remove non-hashtag '#'.
    tweet = re.sub('#\B','',tweet)
#     remove 'amp', indicating decoded ampersand character
    tweet = re.sub('\\bamp\\b','',tweet)
#     remove punctuation, including hashtags:
#     tweet = tweet.translate(tweet.maketrans('','',string.punctuation))
    return tweet
    

In [96]:
re_text = "this is ! A TWEET with @some .@random amp @@extra #stuff ##in IT!?@>#! "
print(clean_tweet(re_text))

this is  a tweet with     #stuff #in it 


What's the word we're comparing similarity to?

In [6]:
search_term = "irma"

Starting here, begin the iteration over times.

In [97]:
related_words = pd.DataFrame()
tweet_date = pd.to_datetime("2017-09-10 00:00:00")
date_delta = pd.Timedelta("3HR")
end_date = pd.to_datetime("2017-09-11 00:00:00")

In [98]:
top_num_words = 20 # number of words to include in cosine similarity ordered list
pct_occ_thresh = .001 # words must occur a number of times >= this percent of number of tweets.

currently there is an incompatibility between gensim and numpy > 1.13

In [99]:
for tweet_day in pd.date_range(start = tweet_date, end = end_date, freq = date_delta):
    
    tweet_text = tweet_full.loc[tweet_day:tweet_day + date_delta,"tweet_text"]
    min_count = ceil(len(tweet_text) * pct_occ_thresh)
    print(str(tweet_day)+": "+str(len(tweet_text))+" tweets ("+str(min_count)+" occurrence threshold)") # this line is just here for diagnostic purposes.
    
    tweets_tokens = tweet_text.apply(lambda x: [clean_tweet(word) for word in tweet_tokenizer.tokenize(x) if word not in tweet_stops])
    
    vector_model = Word2Vec(tweets_tokens, min_count=min_count, sg=1, window=3)
    word_matrix = vector_model.wv[vector_model.wv.vocab]
    pca = PCA(n_components=2)
    result = pca.fit_transform(word_matrix)
    terms_from_range = pd.DataFrame.from_records(vector_model.wv.most_similar(search_term,topn=top_num_words),columns=[tweet_day,"Cos_Sim"])
    related_words = pd.concat([related_words,terms_from_range],axis=1)

2017-09-10 00:00:00: 3970 tweets (4 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


2017-09-10 03:00:00: 2502 tweets (3 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


2017-09-10 06:00:00: 927 tweets (1 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


2017-09-10 09:00:00: 1224 tweets (2 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


2017-09-10 12:00:00: 2880 tweets (3 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


2017-09-10 15:00:00: 4087 tweets (5 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


2017-09-10 18:00:00: 3752 tweets (4 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


2017-09-10 21:00:00: 3613 tweets (4 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


2017-09-11 00:00:00: 3378 tweets (4 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


In [32]:
# vector_model.wv.get_vector("storm").shape

In [33]:
# vector_model.wv.similarity("storm","rain")

In [34]:
# vector_model.wv.vocab

In [100]:
related_words

Unnamed: 0,2017-09-10 00:00:00,Cos_Sim,2017-09-10 03:00:00,Cos_Sim.1,2017-09-10 06:00:00,Cos_Sim.2,2017-09-10 09:00:00,Cos_Sim.3,2017-09-10 12:00:00,Cos_Sim.4,2017-09-10 15:00:00,Cos_Sim.5,2017-09-10 18:00:00,Cos_Sim.6,2017-09-10 21:00:00,Cos_Sim.7,2017-09-11 00:00:00,Cos_Sim.8
0,hurricane,0.99948,like,0.9997,,0.997606,,0.99969,#irma,0.999341,#hurricaneirma,0.998511,like,0.998867,#irma,0.998715,still,0.999252
1,like,0.999457,hurricane,0.999646,hurricane,0.997102,like,0.999661,#hurricaneirma,0.999301,hurricane,0.99835,get,0.998845,#florida,0.998295,wind,0.998625
2,go,0.999427,go,0.999613,like,0.996992,hurricane,0.999658,go,0.999116,go,0.998307,go,0.998791,#hurricane,0.998105,lost,0.998487
3,good,0.99941,shit,0.999612,#irma,0.996341,st,0.999608,#miami,0.999102,get,0.998182,#irma,0.998778,still,0.997399,storm,0.998408
4,see,0.999397,#miami,0.999607,wind,0.995833,go,0.999567,hurricane,0.998962,#irma,0.998089,stay,0.998618,safe,0.997388,like,0.998216
5,us,0.99939,good,0.999604,miami,0.99583,en,0.999563,today,0.998934,like,0.997798,us,0.998534,#hurricaneirma,0.997373,county,0.998099
6,back,0.999365,time,0.999601,get,0.995769,going,0.999556,see,0.998934,#hurricane,0.997726,#miami,0.998457,power,0.997181,#hurricaneirma,0.998013
7,lol,0.999349,much,0.999589,key,0.995768,us,0.99955,morning,0.998914,power,0.997639,right,0.998452,like,0.996548,rain,0.997814
8,get,0.999348,right,0.999588,sleep,0.99564,#irma,0.999548,family,0.998873,us,0.99741,house,0.998419,go,0.996481,get,0.997781
9,going,0.999306,got,0.999587,reports,0.995601,blvd,0.999538,everyone,0.998853,miami,0.997269,one,0.998415,right,0.99631,us,0.997438


In [101]:
related_words.iloc[:,0::2]

Unnamed: 0,2017-09-10 00:00:00,2017-09-10 03:00:00,2017-09-10 06:00:00,2017-09-10 09:00:00,2017-09-10 12:00:00,2017-09-10 15:00:00,2017-09-10 18:00:00,2017-09-10 21:00:00,2017-09-11 00:00:00
0,hurricane,like,,,#irma,#hurricaneirma,like,#irma,still
1,like,hurricane,hurricane,like,#hurricaneirma,hurricane,get,#florida,wind
2,go,go,like,hurricane,go,go,go,#hurricane,lost
3,good,shit,#irma,st,#miami,get,#irma,still,storm
4,see,#miami,wind,go,hurricane,#irma,stay,safe,like
5,us,good,miami,en,today,like,us,#hurricaneirma,county
6,back,time,get,going,see,#hurricane,#miami,power,#hurricaneirma
7,lol,much,key,us,morning,power,right,like,rain
8,get,right,sleep,#irma,family,us,house,go,get
9,going,got,reports,blvd,everyone,miami,one,right,us


In [37]:
# tweet_text[(tweet_text.str.contains(r"\bstorm\b",regex=True)) & (tweet_text.str.contains(r"\bdamage\b",regex=True))].values

In [38]:
# tweet_text[(tweet_text.str.contains(r"\bstorm\b",regex=True)) & (tweet_text.str.contains(r"\bhelping\b",regex=True))].values

Comparing words to hashtags:

In [67]:
tweet_full[tweet_full.tweet_text.str.contains("amp")].tweet_text

timestamp
2017-09-01 00:00:06    First @TBBuccaneers with my bride @carrie_duna...
2017-09-01 00:00:23    I m at Louis Pappas Market Cafe: Shoppes at Ci...
2017-09-01 00:00:23    Don t try  amp  talk 2 me when it s convenient...
2017-09-01 00:02:27    Smoked Mussels, Confit Potatoes, Green Onion B...
2017-09-01 00:02:40    current weather in Tampa: overcast clouds, 87 ...
2017-09-01 00:02:48            If you drink  amp  drive you re stupid af
2017-09-01 00:03:08    Come to Tampa please  Y all are awesome https:...
2017-09-01 00:03:09    AP Psych peeps - Typo on the vocab list: CORRE...
2017-09-01 00:04:01    Just posted a video @ Clark s Fish Camp Seafoo...
2017-09-01 00:04:26     Free guest listlink below  Bday celebrations ...
2017-09-01 00:05:00    #HappyIndependenceDay  #Trinidad  amp  #Tobago...
2017-09-01 00:07:19    Here at the EB-5 Visa  amp  Investment Immigra...
2017-09-01 00:08:44       Tap for brands      M.A.R.S  REPUBLIK  Men ...
2017-09-01 00:09:08    @10jayy__ @ZachAbo

In [39]:
word_list = tweet_full.loc["2017-09-10 09:00:00":"2017-09-11 09:00:00"].tweet_text.str.lower().str.split(r'\s+',expand=True).stack().value_counts()

In [40]:
hashtags_count = word_list[word_list.index.str[0] == '#']

In [41]:
hashtag_words = hashtags_count.index.str[1:].values

In [42]:
word_list

the                        5790
@                          4370
i                          4057
to                         3717
a                          3586
of                         3394
in                         3178
and                        3034
                           2811
is                         2761
my                         2334
florida                    2124
this                       2011
s                          1948
#hurricaneirma             1913
for                        1859
from                       1806
you                        1741
it                         1612
we                         1607
on                         1543
#irma                      1527
at                         1505
reports                    1386
t                          1351
irma                       1289
hurricane                  1257
are                        1166
mph                        1160
asos                       1118
                           ... 
season..

In [43]:
word_list['#hurricaneirma']

1913

In [44]:
word_list[word_list.index.str[1]=='@'].index

Index(['.@mayorgimenez', '.@cbs12', '.@flashgjr', '.@10newswtsp',
       '.@richarddymond', '.@drtiajolie', '#@abc', '.@realdonaldtrump',
       '.@andrewwulfeck:', '.@occc', '.@rborn83,', '.@miamidadecounty',
       '.@manateesheriff', '.@deadpool1973', '-@notcampbellmatt',
       '.@dukeenergy', '.@miamidadefire', '-@grant_gilmore', '.@jimsmallman',
       '.@thecwsupergirl', 'w@30.', '.@goabode', 'l@s', '.@jason_lanning',
       '.@tampaelectric', '.@nicoleebryan', '.@potus'],
      dtype='object')

In [45]:
word_list[hashtag_words].sort_values(ascending=False)

                                   2811.0
my                                 2334.0
florida                            2124.0
this                               2011.0
you                                1741.0
it                                 1612.0
we                                 1607.0
irma                               1289.0
hurricane                          1257.0
gust                               1074.0
me                                  952.0
power                               878.0
fl                                  847.0
wind                                656.0
storm                               630.0
our                                 563.0
now                                 524.0
rain                                499.0
down                                477.0
safe                                471.0
miami                               468.0
go                                  410.0
will                                393.0
beach                             

In [51]:
word_pairs = list(combinations(list(vector_model.wv.vocab.keys()),2))

In [52]:
tweet_graph = nx.Graph()

In [53]:
for pair in word_pairs:
    edge_weight = vector_model.wv.similarity(pair[0],pair[1])
    if edge_weight > .9:
        tweet_graph.add_edge(pair[0],pair[1],weight=edge_weight)

In [49]:
# tweet_graph.add_nodes_from(vector_model.wv.vocab.keys())

In [54]:
nx.write_gexf(tweet_graph,path=r'./tweet_graph.gexf')