# Monitoring changes in related words over time.

### This notebook will show how words related to a particular word will change over time deltas

In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.decomposition import PCA
from math import ceil
import string
from itertools import combinations
import networkx as nx
import re

In [2]:
columns = ['tweet_id','timestamp','tweet_text','user_id',
           'tweet_coords','tweet_coords_list','tweet_long','tweet_lat','location',
           'enc_url','tweet_lang','hashtags']
tweet_full = pd.read_csv(r'./tweetCoords.csv',
                         header=None,
                         names=columns,
                         parse_dates=[1],
                         infer_datetime_format=True,
                         index_col='timestamp')

In [3]:
tweet_stops = stopwords.words('english')
tweet_tokenizer = TweetTokenizer(strip_handles=True,preserve_case=False,reduce_len=True)

writing a custom text cleaner. Currently configured to remove all punctuation, _except #_.

In [4]:
def clean_tweet(tweet):
#     convert case:
    tweet = tweet.lower()
#     remove URLs:
    tweet = re.sub('https?://\S+','',tweet)
#     remove @mentions, including those with a leading '-' or '.' : 
    tweet = re.sub('[-\.]?@\w+','',tweet)
#     remove punctuation, but not hashtags:
    tweet = tweet.translate(tweet.maketrans('','',string.punctuation.replace("#","")))
#     remove non-hashtag '#'.
    tweet = re.sub('#\B','',tweet)
#     remove punctuation, including hashtags:
#     tweet = tweet.translate(tweet.maketrans('','',string.punctuation))
    return tweet
    

In [5]:
re_text = "this is ! A TWEET with @some .@random @@extra #stuff ##in IT!?@>#! "
print(clean_tweet(re_text))

this is  a tweet with    #stuff #in it 


What's the word we're comparing similarity to?

In [6]:
search_term = "irma"

Starting here, begin the iteration over times.

In [36]:
related_words = pd.DataFrame()
tweet_date = pd.to_datetime("2017-09-10 00:00:00")
date_delta = pd.Timedelta("24HR")
end_date = pd.to_datetime("2017-09-11 00:00:00")

In [29]:
top_num_words = 20 # number of words to include in cosine similarity ordered list
pct_occ_thresh = .001 # words must occur a number of times >= this percent of number of tweets.

currently there is an incompatibility between gensim and numpy > 1.13

In [38]:
for tweet_day in pd.date_range(start = tweet_date, end = end_date, freq = date_delta):
    
    tweet_text = tweet_full.loc[tweet_day:tweet_day + date_delta,"tweet_text"]
    min_count = ceil(len(tweet_text) * pct_occ_thresh)
    print(str(tweet_day)+": "+str(len(tweet_text))+" tweets ("+str(min_count)+" occurrence threshold)") # this line is just here for diagnostic purposes.
    
    tweets_tokens = tweet_text.apply(lambda x: [clean_tweet(word) for word in tweet_tokenizer.tokenize(x) if word not in tweet_stops])
    
    vector_model = Word2Vec(tweets_tokens, min_count=min_count, sg=1, window=7)
    word_matrix = vector_model.wv[vector_model.wv.vocab]
    pca = PCA(n_components=2)
    result = pca.fit_transform(word_matrix)
    terms_from_range = pd.DataFrame.from_records(vector_model.wv.most_similar(search_term,topn=top_num_words),columns=[tweet_day,"Cos_Sim"])
    related_words = pd.concat([related_words,terms_from_range],axis=1)

2017-09-10 00:00:00: 22953 tweets (23 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


2017-09-11 00:00:00: 16322 tweets (17 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


In [None]:
# vector_model.wv.get_vector("storm").shape

In [None]:
# vector_model.wv.similarity("storm","rain")

In [None]:
# vector_model.wv.vocab

In [39]:
related_words

Unnamed: 0,2017-09-10 00:00:00,Score,2017-09-11 00:00:00,Score.1,2017-09-10 00:00:00.1,Score.2,2017-09-11 00:00:00.1,Score.3
0,ready,0.79815,post,0.939864,2017,0.756712,post,0.910927
1,waiting,0.784966,#hurricaneimra,0.899409,hit,0.746316,#hurricaneimra,0.866769
2,2017,0.78094,survived,0.886995,9,0.739459,#irmahurricane2017,0.861776
3,4,0.766008,aftermath,0.885714,plan,0.722768,affected,0.861527
4,hit,0.747328,#naples,0.861007,#hurricanirma,0.720597,bye,0.860845
5,#hurricanirma,0.739417,hotel,0.857625,#hurricaineirma,0.716046,cat,0.854893
6,#huricaneirma,0.735558,911,0.850783,hits,0.707358,#hurricanirma,0.85306
7,#goawayirma,0.730663,morning,0.850201,#irma,0.700583,#irmageddon,0.851231
8,#hurricaineirma,0.724643,affected,0.849157,mayor,0.699137,internet,0.848135
9,closer,0.722545,#irmahurricane,0.846904,cuba,0.6965,thoughts,0.847423


In [35]:
related_words.iloc[:,0::2]

Unnamed: 0,2017-09-10 00:00:00,2017-09-11 00:00:00
0,2017,post
1,ready,#hurricaneimra
2,hit,#irmahurricane2017
3,plan,hotel
4,cuba,aftermath
5,hits,affected
6,4,family
7,hotel,live
8,waiting,survived
9,#goawayirma,#afterirma


In [None]:
# tweet_text[(tweet_text.str.contains(r"\bstorm\b",regex=True)) & (tweet_text.str.contains(r"\bdamage\b",regex=True))].values

In [None]:
# tweet_text[(tweet_text.str.contains(r"\bstorm\b",regex=True)) & (tweet_text.str.contains(r"\bhelping\b",regex=True))].values

Comparing words to hashtags:

In [None]:
word_list = tweet_full.loc["2017-09-11 00:00:00":"2017-09-12 00:00:00"].tweet_text.str.lower().str.split(r'\s+',expand=True).stack().value_counts()

In [None]:
hashtags_count = word_list[word_list.index.str[0] == '#']

In [None]:
hashtag_words = hashtags_count.index.str[1:].values

In [None]:
word_list['#hurricaneirma']

In [None]:
word_list[word_list.index.str[1]=='@'].index

In [None]:
word_list[hashtag_words].sort_values(ascending=False)

In [None]:
word_pairs = list(combinations(list(vector_model.wv.vocab.keys()),2))

In [None]:
tweet_graph = nx.Graph()

In [None]:
for pair in word_pairs:
    edge_weight = vector_model.wv.similarity(pair[0],pair[1])
    if edge_weight > .95:
        tweet_graph.add_edge(pair[0],pair[1],weight=edge_weight)

In [None]:
tweet_graph.add_nodes_from(vector_model.wv.vocab.keys())

In [None]:
nx.write_gexf(tweet_graph,path=r'./tweet_graph.gexf')