# Monitoring changes in related words over time.

### This notebook will show how words related to a particular word will change over time deltas

In [109]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from math import ceil
import string
from itertools import combinations
import networkx as nx
import re

In [2]:
columns = ['tweet_id','timestamp','tweet_text','user_id',
           'tweet_coords','tweet_coords_list','tweet_long','tweet_lat','location',
           'enc_url','tweet_lang','hashtags']
tweet_full = pd.read_csv(r'./tweetCoords.csv',
                         header=None,
                         names=columns,
                         parse_dates=[1],
                         infer_datetime_format=True,
                         index_col='timestamp')

writing a custom text cleaner. Currently configured to remove all punctuation, _except #_.

In [227]:
tweet_stops = stopwords.words('english')
tweet_tokenizer = TweetTokenizer(strip_handles=True,preserve_case=False,reduce_len=True)

def clean_tweet(tweet):
#     convert case:
    tweet = tweet.lower()
#     remove URLs:
    tweet = re.sub('https?://\S+','',tweet)
#     remove @mentions, including those with a leading '-' or '.' : 
    tweet = re.sub('[-\.]?@\w+','',tweet)
#     remove punctuation, but not hashtags:
    tweet = tweet.translate(tweet.maketrans('','',string.punctuation.replace("#","")))
#     remove non-hashtag '#'.
    tweet = re.sub('#\B','',tweet)
#     remove 'amp', 'gt', 'lt', indicating decoded ampersand, greater-than, less-than characters
    tweet = re.sub(r'\b(amp|gt|lt)\b','',tweet)
#     remove punctuation, including hashtags:
#     tweet = tweet.translate(tweet.maketrans('','',string.punctuation))
    return tweet

def tokens_no_stopwords(tweet_as_string):
#     wrapper function that combines the tokenizer, cleaner, and stopword removal.
#     takes a string and returns a list of strings
    cleaned_tweet = clean_tweet(tweet_as_string)
    tweet_as_tokens = tweet_tokenizer.tokenize(cleaned_tweet)
    tweet_no_stops = [word for word in tweet_as_tokens if word not in tweet_stops]
    
    return tweet_no_stops
    

In [229]:
re_text = "this is ! A TWEETlt withgtamp @some .@random amp gt lt @@extra #stuff ##in IT!?@>#! "
print(tokens_no_stopwords(re_text))

['tweetlt', 'withgtamp', '#stuff', '#in']


What's the word we're comparing similarity to?

In [184]:
search_term = "irma"

Starting here, begin the iteration over times.

In [214]:
related_words = pd.DataFrame()
tweet_date = pd.to_datetime("2017-09-10 00:00:00")
date_delta = pd.Timedelta("24HR")
end_date = pd.to_datetime("2017-09-10 00:00:00")

In [217]:
top_num_words = 20 # number of words to include in cosine similarity ordered list
pct_occ_thresh = .01 # words must occur a number of times >= this percent of number of tweets.

currently there is an incompatibility between gensim and numpy > 1.13

In [228]:
for tweet_day in pd.date_range(start = tweet_date, end = end_date, freq = date_delta):
    
    tweet_text = tweet_full.loc[tweet_day:tweet_day + date_delta,"tweet_text"]
    min_count = ceil(len(tweet_text) * pct_occ_thresh)
    print(str(tweet_day)+": "+str(len(tweet_text))+" tweets ("+str(min_count)+" occurrence threshold)") # this line is just here for diagnostic purposes.
    
    tweets_tokens = tweet_text.apply(tokens_no_stopwords)
    vector_model = Word2Vec(tweets_tokens, min_count=min_count, sg=3, window=1)
    word_matrix = vector_model.wv[vector_model.wv.vocab]
#     tsne = TSNE(n_components=2)
#     result = tsne.fit_transform(word_matrix)
    pca = PCA(n_components=2)
    result = pca.fit_transform(word_matrix)
    
    terms_from_range = pd.DataFrame.from_records(vector_model.wv.most_similar(search_term,topn=top_num_words),columns=[tweet_day,"Cos_Sim"])
    related_words = pd.concat([related_words,terms_from_range],axis=1)

2017-09-10 00:00:00: 22953 tweets (230 occurrence threshold)


  if np.issubdtype(vec.dtype, np.int):


In [32]:
# vector_model.wv.get_vector("storm").shape

In [33]:
# vector_model.wv.similarity("storm","rain")

In [34]:
# vector_model.wv.vocab

In [219]:
related_words

Unnamed: 0,2017-09-10 00:00:00,Cos_Sim,2017-09-10 00:00:00.1,Cos_Sim.1
0,ready,0.730827,#hurricaneirma,0.961513
1,#hurricaineirma,0.721239,#irma,0.955641
2,coverage,0.71899,house,0.954888
3,waiting,0.714566,lol,0.933836
4,pass,0.71275,update,0.92895
5,4,0.712392,#hurricane,0.926753
6,party,0.712297,us,0.924933
7,hit,0.71181,need,0.924002
8,#hurricanirma,0.709186,shit,0.923062
9,live,0.706116,got,0.922459


In [200]:
related_words.iloc[:,0::2]

Unnamed: 0,2017-09-10 00:00:00
0,ready
1,coverage
2,#hurricaineirma
3,party
4,#hurricanirma
5,pass
6,hit
7,live
8,waiting
9,2017


In [37]:
# tweet_text[(tweet_text.str.contains(r"\bstorm\b",regex=True)) & (tweet_text.str.contains(r"\bdamage\b",regex=True))].values

In [38]:
# tweet_text[(tweet_text.str.contains(r"\bstorm\b",regex=True)) & (tweet_text.str.contains(r"\bhelping\b",regex=True))].values

Comparing words to hashtags:

In [160]:
tweet_full[tweet_full.tweet_text.str.contains("\\blt\\b")].tweet_text

timestamp
2017-09-01 02:42:56    @matt_swag1  amp  _its_guwap TURNIN UP TO MY #...
2017-09-01 03:19:14    being bae today and running errands with her w...
2017-09-01 04:24:25    FULL VIDEO IN MY BIO  lt ---          #Benzo o...
2017-09-01 04:49:28    Summer 1997  lt  Summer 2017  minus the Nazi s...
2017-09-01 08:16:10    In my head  lt 3 en I m Not Lost I m RVing htt...
2017-09-01 09:01:16    In my head  lt 3 en I m Not Lost I m RVing htt...
2017-09-01 13:54:16                   introducing yourself in class  lt 
2017-09-01 14:59:08    Reminder:   gt  gt  Labor Day is on Monday, Se...
2017-09-01 15:45:23    The Danger that Lies Within  Diet  Foods... ht...
2017-09-01 16:53:56    John Kelly considered resigning after Comey wa...
2017-09-01 17:53:34    @FoxBusiness @CharlesHurt Straight-thinking Ch...
2017-09-01 22:17:37    E d_Mobarek to all the muselmans poeple  lt 3 ...
2017-09-02 15:06:57    When your favorite bartender isnt in the clubh...
2017-09-02 17:39:17    i just saw this im

In [39]:
word_list = tweet_full.loc["2017-09-10 09:00:00":"2017-09-11 09:00:00"].tweet_text.str.lower().str.split(r'\s+',expand=True).stack().value_counts()

In [40]:
hashtags_count = word_list[word_list.index.str[0] == '#']

In [41]:
hashtag_words = hashtags_count.index.str[1:].values

In [42]:
word_list

the                        5790
@                          4370
i                          4057
to                         3717
a                          3586
of                         3394
in                         3178
and                        3034
                           2811
is                         2761
my                         2334
florida                    2124
this                       2011
s                          1948
#hurricaneirma             1913
for                        1859
from                       1806
you                        1741
it                         1612
we                         1607
on                         1543
#irma                      1527
at                         1505
reports                    1386
t                          1351
irma                       1289
hurricane                  1257
are                        1166
mph                        1160
asos                       1118
                           ... 
season..

In [43]:
word_list['#hurricaneirma']

1913

In [44]:
word_list[word_list.index.str[1]=='@'].index

Index(['.@mayorgimenez', '.@cbs12', '.@flashgjr', '.@10newswtsp',
       '.@richarddymond', '.@drtiajolie', '#@abc', '.@realdonaldtrump',
       '.@andrewwulfeck:', '.@occc', '.@rborn83,', '.@miamidadecounty',
       '.@manateesheriff', '.@deadpool1973', '-@notcampbellmatt',
       '.@dukeenergy', '.@miamidadefire', '-@grant_gilmore', '.@jimsmallman',
       '.@thecwsupergirl', 'w@30.', '.@goabode', 'l@s', '.@jason_lanning',
       '.@tampaelectric', '.@nicoleebryan', '.@potus'],
      dtype='object')

In [45]:
word_list[hashtag_words].sort_values(ascending=False)

                                   2811.0
my                                 2334.0
florida                            2124.0
this                               2011.0
you                                1741.0
it                                 1612.0
we                                 1607.0
irma                               1289.0
hurricane                          1257.0
gust                               1074.0
me                                  952.0
power                               878.0
fl                                  847.0
wind                                656.0
storm                               630.0
our                                 563.0
now                                 524.0
rain                                499.0
down                                477.0
safe                                471.0
miami                               468.0
go                                  410.0
will                                393.0
beach                             

In [220]:
word_pairs = list(combinations(list(vector_model.wv.vocab.keys()),2))

In [221]:
tweet_graph = nx.Graph()

In [222]:
for pair in word_pairs:
    edge_weight = vector_model.wv.similarity(pair[0],pair[1])
    if edge_weight > .80:
        tweet_graph.add_edge(pair[0],pair[1],weight=edge_weight)

In [49]:
# tweet_graph.add_nodes_from(vector_model.wv.vocab.keys())

In [223]:
nx.write_gexf(tweet_graph,path=r'./tweet_graph.gexf')