# Monitoring changes in related words over time.

### This notebook will show how words related to a particular word will change over time deltas

In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.decomposition import PCA
from math import ceil
import string
from itertools import combinations
import networkx as nx
import re

In [2]:
columns = ['tweet_id','timestamp','tweet_text','user_id',
           'tweet_coords','tweet_coords_list','tweet_long','tweet_lat','location',
           'enc_url','tweet_lang','hashtags']
tweet_full = pd.read_csv(r'./tweetCoords.csv',
                         header=None,
                         names=columns,
                         parse_dates=[1],
                         infer_datetime_format=True,
                         index_col='timestamp')

In [3]:
tweet_stops = stopwords.words('english')
tweet_tokenizer = TweetTokenizer(strip_handles=True,preserve_case=False,reduce_len=True)

writing a custom text cleaner. Currently configured to remove all punctuation, _except #_.

In [4]:
def clean_tweet(tweet):
#     convert case:
    tweet = tweet.lower()
#     remove URLs:
    tweet = re.sub('https?://\S+','',tweet)
#     remove @mentions, including those with a leading '-' or '.' : 
    tweet = re.sub('[-\.]?@\w+','',tweet)
#     remove punctuation, but not hashtags:
    tweet = tweet.translate(tweet.maketrans('','',string.punctuation.replace("#","")))
#     remove non-hashtag '#'.
    tweet = re.sub('#\B','',tweet)
#     remove punctuation, including hashtags:
#     tweet = tweet.translate(tweet.maketrans('','',string.punctuation))
    return tweet
    

In [5]:
re_text = "this is ! A TWEET with @some .@random @@extra #stuff ##in IT!?@>#! "
print(clean_tweet(re_text))

this is  a tweet with    #stuff #in it 


What's the word we're comparing similarity to?

In [6]:
search_term = "irma"

Starting here, begin the iteration over times.

In [7]:
related_words = pd.DataFrame()
tweet_date = pd.to_datetime("2017-09-11 00:00:00")
date_delta = pd.Timedelta("24HR")
end_date = pd.to_datetime("2017-09-12 00:00:00")

In [8]:
top_num_words = 20 # number of words to include in cosine similarity ordered list
pct_occ_thresh = .001 # words must occur a number of times >= this percent of number of tweets.

currently there is an incompatibility between gensim and numpy > 1.13

In [9]:
for tweet_day in pd.date_range(start = tweet_date, end = end_date, freq = date_delta):
    
    tweet_text = tweet_full.loc[tweet_day:tweet_day + date_delta,"tweet_text"]
    min_count = ceil(len(tweet_text) * pct_occ_thresh)
    print(str(tweet_day)+": "+str(len(tweet_text))+" tweets ("+str(min_count)+" occurrence threshold)") # this line is just here for diagnostic purposes.
    
    tweets_tokens = tweet_text.apply(lambda x: [clean_tweet(word) for word in tweet_tokenizer.tokenize(x) if word not in tweet_stops])
    
    vector_model = Word2Vec(tweets_tokens, min_count=min_count, sg=1, window=4)
    word_matrix = vector_model.wv[vector_model.wv.vocab]
    pca = PCA(n_components=2)
    result = pca.fit_transform(word_matrix)
    terms_from_range = pd.DataFrame.from_records(vector_model.wv.most_similar(search_term,topn=top_num_words),columns=[tweet_day,"Score"])
    related_words = pd.concat([related_words,terms_from_range],axis=1)

2017-09-11 00:00:00: 16322 tweets (17 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


2017-09-12 00:00:00: 15109 tweets (16 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


In [10]:
# vector_model.wv.get_vector("storm").shape

In [11]:
# vector_model.wv.similarity("storm","rain")

In [12]:
# vector_model.wv.vocab

In [13]:
related_words

Unnamed: 0,2017-09-11 00:00:00,Score,2017-09-12 00:00:00,Score.1
0,post,0.904861,post,0.955433
1,#hurricaneimra,0.882627,hurricane,0.943001
2,bye,0.85768,aftermath,0.930415
3,aftermath,0.853729,#irma,0.930365
4,hotel,0.849519,#hurricaneirma,0.911921
5,cat,0.849077,#hurricane,0.89504
6,#afterirma,0.846328,survived,0.894168
7,#naples,0.830033,#hurricaneirma2017,0.88142
8,affected,0.82973,#florida,0.859789
9,#irmageddon,0.829599,#irmahurricane,0.856164


In [14]:
related_words.iloc[:,0::2]

Unnamed: 0,2017-09-11 00:00:00,2017-09-12 00:00:00
0,post,post
1,#hurricaneimra,hurricane
2,bye,aftermath
3,aftermath,#irma
4,hotel,#hurricaneirma
5,cat,#hurricane
6,#afterirma,survived
7,#naples,#hurricaneirma2017
8,affected,#florida
9,#irmageddon,#irmahurricane


In [15]:
# tweet_text[(tweet_text.str.contains(r"\bstorm\b",regex=True)) & (tweet_text.str.contains(r"\bdamage\b",regex=True))].values

In [16]:
# tweet_text[(tweet_text.str.contains(r"\bstorm\b",regex=True)) & (tweet_text.str.contains(r"\bhelping\b",regex=True))].values

Comparing words to hashtags:

In [17]:
word_list = tweet_full.loc["2017-09-11 00:00:00":"2017-09-12 00:00:00"].tweet_text.str.lower().str.split(r'\s+',expand=True).stack().value_counts()

In [18]:
hashtags_count = word_list[word_list.index.str[0] == '#']

In [19]:
hashtag_words = hashtags_count.index.str[1:].values

In [20]:
word_list['#hurricane']

154

In [21]:
word_list[word_list.index.str[1]=='@'].index

Index(['.@realdonaldtrump', 'l@s', '.@tecoenergy', '.@seminoleso',
       '.@attcares', '.@10newswtsp', '.@marlinspark', '.@cargo5',
       '.@andrewwulfeck:', '.@nevilleray', '-@notcampbellmatt',
       '.@miamidadefire', '-@theslumpgod', '.@rborn83,', '.@insidefpl',
       '.@imkristenbell', '.@wflanightphotog', '.@ejmccrane:', '.@jorgeebro',
       '.@ejmccrane', '.@drtiajolie', '.@tampaelectric'],
      dtype='object')

In [22]:
word_list[hashtag_words].sort_values(ascending=False)

the                       4587.0
                          2194.0
we                        1515.0
florida                   1295.0
#hurricaneirma            1113.0
power                      913.0
#irma                      910.0
irma                       857.0
hurricane                  749.0
me                         688.0
our                        514.0
wind                       460.0
fl                         450.0
what                       422.0
after                      403.0
f                          335.0
storm                      332.0
got                        319.0
miami                      291.0
good                       279.0
house                      275.0
one                        271.0
safe                       270.0
rain                       265.0
us                         261.0
go                         260.0
open                       258.0
beach                      248.0
people                     243.0
through                    235.0
          

In [23]:
word_pairs = list(combinations(list(vector_model.wv.vocab.keys()),2))

In [24]:
tweet_graph = nx.Graph()

In [25]:
for pair in word_pairs:
    edge_weight = vector_model.wv.similarity(pair[0],pair[1])
    if edge_weight > .85:
        tweet_graph.add_edge(pair[0],pair[1],weight=edge_weight)

In [26]:
tweet_graph.add_nodes_from(vector_model.wv.vocab.keys())

In [27]:
nx.write_gexf(tweet_graph,path=r'./tweet_graph.gexf')