# Monitoring changes in related words over time.

### This notebook will show how words related to a particular word will change over time deltas

In [109]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from math import ceil
import string
from itertools import combinations
import networkx as nx
import re

In [None]:
columns = ['tweet_id','timestamp','tweet_text','user_id',
           'tweet_coords','tweet_coords_list','tweet_long','tweet_lat','location',
           'enc_url','tweet_lang','hashtags']
tweet_full = pd.read_csv(r'./tweetCoords.csv',
                         header=None,
                         names=columns,
                         parse_dates=[1],
                         infer_datetime_format=True,
                         index_col='timestamp')

Creating a separate dataframe for just the tweets classified as english.

In [325]:
tweet_full_en = tweet_full[tweet_full['tweet_lang'] == 'en']

writing a custom text cleaner. Currently configured to remove all punctuation, _except #_.

In [231]:
tweet_stops = stopwords.words('english')
tweet_tokenizer = TweetTokenizer(strip_handles=True,preserve_case=False,reduce_len=True)

def clean_tweet(tweet):
#     takes input string and converts or removes characters depending on settings.
#     returns a string
#     convert case:
    tweet = tweet.lower()
#     remove URLs:
    tweet = re.sub('https?://\S+','',tweet)
#     remove @mentions, including those with a leading '-' or '.' : 
    tweet = re.sub('[-\.]?@\w+','',tweet)
#     remove punctuation, but not hashtags:
    tweet = tweet.translate(tweet.maketrans('','',string.punctuation.replace("#","")))
#     remove non-hashtag '#'.
    tweet = re.sub('#\B','',tweet)
#     remove 'amp', 'gt', 'lt', indicating decoded ampersand, greater-than, less-than characters
    tweet = re.sub(r'\b(amp|gt|lt)\b','',tweet)
#     remove punctuation, including hashtags:
#     tweet = tweet.translate(tweet.maketrans('','',string.punctuation))
    return tweet

def tokens_no_stopwords(tweet_as_string):
#     wrapper function that combines the tokenizer, cleaner, and stopword removal.
#     takes a string and returns a list of strings
    cleaned_tweet = clean_tweet(tweet_as_string)
    tweet_as_tokens = tweet_tokenizer.tokenize(cleaned_tweet)
    tweet_no_stops = [word for word in tweet_as_tokens if word not in tweet_stops]
    
    return tweet_no_stops
    

In [229]:
re_text = "this is ! A TWEETlt withgtamp @some .@random amp gt lt @@extra #stuff ##in IT!?@>#! "
print(tokens_no_stopwords(re_text))

['tweetlt', 'withgtamp', '#stuff', '#in']


In [326]:
tweet_full_en.loc["2017-09-10 09:00:00":"2017-09-11 09:00:00",'tweet_text'].apply(tokens_no_stopwords).apply(pd.Series).stack().value_counts()

florida                1774
#hurricaneirma         1658
reports                1385
hurricane              1316
irma                   1313
#irma                  1250
fl                     1232
mph                    1169
asos                   1118
gust                   1076
knots                  1027
power                  1024
storm                   726
wind                    710
rain                    661
still                   605
safe                    571
like                    562
beach                   542
county                  534
get                     492
us                      425
go                      422
miami                   421
good                    401
pm                      397
f                       386
right                   385
stay                    360
co                      355
                       ... 
basics                    1
restbetter                1
#carpool                  1
aswell                    1
stressbut           

What's the word we're comparing similarity to?

In [184]:
search_term = "irma"

Starting here, begin the iteration over times.

In [294]:
related_words = pd.DataFrame()
tweet_date = pd.to_datetime("2017-09-10 00:00:00")
date_delta = pd.Timedelta("24HR")
end_date = pd.to_datetime("2017-09-10 00:00:00")

In [287]:
top_num_words = 20 # number of words to include in cosine similarity ordered list
pct_occ_thresh = .01 # words must occur a number of times >= this percent of number of tweets.

List of words from this time frame, based upon the occurrence threshold above:

In [317]:
num_tweets = len(tweet_full[tweet_full.tweet_lang == 'en'].loc[tweet_date:tweet_date+date_delta])
min_count = ceil(num_tweets * pct_occ_thresh)
tweet_words = tweet_full[tweet_full.tweet_lang == 'en'].loc["2017-09-10 09:00:00":"2017-09-11 09:00:00",'tweet_text'].apply(tokens_no_stopwords)
word_counts = tweet_words.apply(pd.Series).stack().value_counts()
word_counts[word_counts > min_count]

florida           1774
#hurricaneirma    1658
reports           1385
hurricane         1316
irma              1313
#irma             1250
fl                1232
mph               1169
asos              1118
gust              1076
knots             1027
power             1024
storm              726
wind               710
rain               661
still              605
safe               571
like               562
beach              542
county             534
get                492
us                 425
go                 422
miami              421
good               401
pm                 397
f                  386
right              385
stay               360
co                 355
                  ... 
day                254
ese                249
st                 248
know               241
2                  237
shit               236
outside            228
weather            226
palm               226
west               226
humidity           219
edt                218
tampa      

Number of words per tweet, after processing:

In [324]:
tweet_full[tweet_full.tweet_lang == 'en'].loc["2017-09-10 09:00:00":"2017-09-11 09:00:00",'tweet_text'].apply(tokens_no_stopwords)

timestamp
2017-09-10 09:00:01    [wind, 10, mph, nne, barometer, 29832, falling...
2017-09-10 09:00:03    [im, going, live, national, fox, news, fews, m...
2017-09-10 09:00:07                   [hear, shutters, shake, one, time]
2017-09-10 09:00:11    [0457, temp, 783, f, hum, 92, dewp, 752, f, ba...
2017-09-10 09:00:20                                        [#irma, woke]
2017-09-10 09:00:29                                  [still, awake, lol]
2017-09-10 09:00:50    [reagan, running, lebanon, marine, barracks, b...
2017-09-10 09:00:53    [please, pray, florida, keys, keep, talking, s...
2017-09-10 09:01:29                                     [great, sweetie]
2017-09-10 09:02:04    [454, 1, w, coral, springs, broward, co, fl, m...
2017-09-10 09:02:08    [barometric, pressure, plummeting, fast, 973mb...
2017-09-10 09:02:10      [irma, eye, move, across, lower, florida, keys]
2017-09-10 09:02:18                                              [karma]
2017-09-10 09:02:21    [#firstresponders,

currently there is an incompatibility between gensim and numpy > 1.13

In [318]:
for tweet_day in pd.date_range(start = tweet_date, end = end_date, freq = date_delta):
    tweet_text = tweet_full[tweet_full.tweet_lang == 'en'].loc[tweet_day:tweet_day + date_delta,"tweet_text"]
    min_count = ceil(len(tweet_text) * pct_occ_thresh)
#     this line is just here for diagnostic purposes.
#     print(str(tweet_day)+": "+str(len(tweet_text))+" tweets ("+str(min_count)+" occurrence threshold)") 

    tweets_tokens = tweet_text.apply(tokens_no_stopwords)
    vector_model = Word2Vec(tweets_tokens, min_count=min_count, sg=1, window=3, workers=5, size=100)
    word_matrix = vector_model.wv[vector_model.wv.vocab]
#     tsne = TSNE(n_components=2)
#     result = tsne.fit_transform(word_matrix)
    pca = PCA(n_components=2)
    result = pca.fit_transform(word_matrix)

    terms_from_range = pd.DataFrame.from_records(vector_model.wv.most_similar(search_term,topn=top_num_words),columns=[tweet_day,"Cos_Sim"])
    related_words = pd.concat([related_words,terms_from_range],axis=1)

  if np.issubdtype(vec.dtype, np.int):


In [32]:
# vector_model.wv.get_vector("storm").shape

In [33]:
# vector_model.wv.similarity("storm","rain")

In [34]:
# vector_model.wv.vocab

In [314]:
word_counts[word_counts > min_count]

florida           2257
#hurricaneirma    1980
#irma             1585
irma              1537
reports           1387
hurricane         1344
fl                1295
mph               1177
asos              1118
gust              1076
power             1030
knots             1027
de                 749
storm              731
wind               714
miami              684
rain               662
beach              621
still              607
safe               575
en                 568
like               565
county             553
get                493
n                  467
us                 427
pm                 426
go                 426
good               402
#florida           398
                  ... 
winds              354
back               350
que                349
going              346
#hurricane         344
got                343
house              339
time               330
people             320
la                 316
one                314
se                 314
getting    

In [319]:
related_words.iloc[:,0::2]

Unnamed: 0,2017-09-10 00:00:00,2017-09-10 00:00:00.1
0,#irma,#irma
1,florida,stay
2,#florida,#hurricane
3,#hurricane,everyone
4,#miami,#hurricaneirma
5,stay,#florida
6,county,safe
7,us,house
8,#hurricaneirma,us
9,miami,like


In [37]:
# tweet_text[(tweet_text.str.contains(r"\bstorm\b",regex=True)) & (tweet_text.str.contains(r"\bdamage\b",regex=True))].values

In [38]:
# tweet_text[(tweet_text.str.contains(r"\bstorm\b",regex=True)) & (tweet_text.str.contains(r"\bhelping\b",regex=True))].values

Comparing words to hashtags:

In [160]:
tweet_full[tweet_full.tweet_text.str.contains("\\blt\\b")].tweet_text

timestamp
2017-09-01 02:42:56    @matt_swag1  amp  _its_guwap TURNIN UP TO MY #...
2017-09-01 03:19:14    being bae today and running errands with her w...
2017-09-01 04:24:25    FULL VIDEO IN MY BIO  lt ---          #Benzo o...
2017-09-01 04:49:28    Summer 1997  lt  Summer 2017  minus the Nazi s...
2017-09-01 08:16:10    In my head  lt 3 en I m Not Lost I m RVing htt...
2017-09-01 09:01:16    In my head  lt 3 en I m Not Lost I m RVing htt...
2017-09-01 13:54:16                   introducing yourself in class  lt 
2017-09-01 14:59:08    Reminder:   gt  gt  Labor Day is on Monday, Se...
2017-09-01 15:45:23    The Danger that Lies Within  Diet  Foods... ht...
2017-09-01 16:53:56    John Kelly considered resigning after Comey wa...
2017-09-01 17:53:34    @FoxBusiness @CharlesHurt Straight-thinking Ch...
2017-09-01 22:17:37    E d_Mobarek to all the muselmans poeple  lt 3 ...
2017-09-02 15:06:57    When your favorite bartender isnt in the clubh...
2017-09-02 17:39:17    i just saw this im

In [39]:
word_list = tweet_full.loc["2017-09-10 09:00:00":"2017-09-11 09:00:00"].tweet_text.str.lower().str.split(r'\s+',expand=True).stack().value_counts()

In [40]:
hashtags_count = word_list[word_list.index.str[0] == '#']

In [41]:
hashtag_words = hashtags_count.index.str[1:].values

In [42]:
word_list

the                        5790
@                          4370
i                          4057
to                         3717
a                          3586
of                         3394
in                         3178
and                        3034
                           2811
is                         2761
my                         2334
florida                    2124
this                       2011
s                          1948
#hurricaneirma             1913
for                        1859
from                       1806
you                        1741
it                         1612
we                         1607
on                         1543
#irma                      1527
at                         1505
reports                    1386
t                          1351
irma                       1289
hurricane                  1257
are                        1166
mph                        1160
asos                       1118
                           ... 
season..

In [43]:
word_list['#hurricaneirma']

1913

In [44]:
word_list[word_list.index.str[1]=='@'].index

Index(['.@mayorgimenez', '.@cbs12', '.@flashgjr', '.@10newswtsp',
       '.@richarddymond', '.@drtiajolie', '#@abc', '.@realdonaldtrump',
       '.@andrewwulfeck:', '.@occc', '.@rborn83,', '.@miamidadecounty',
       '.@manateesheriff', '.@deadpool1973', '-@notcampbellmatt',
       '.@dukeenergy', '.@miamidadefire', '-@grant_gilmore', '.@jimsmallman',
       '.@thecwsupergirl', 'w@30.', '.@goabode', 'l@s', '.@jason_lanning',
       '.@tampaelectric', '.@nicoleebryan', '.@potus'],
      dtype='object')

In [45]:
word_list[hashtag_words].sort_values(ascending=False)

                                   2811.0
my                                 2334.0
florida                            2124.0
this                               2011.0
you                                1741.0
it                                 1612.0
we                                 1607.0
irma                               1289.0
hurricane                          1257.0
gust                               1074.0
me                                  952.0
power                               878.0
fl                                  847.0
wind                                656.0
storm                               630.0
our                                 563.0
now                                 524.0
rain                                499.0
down                                477.0
safe                                471.0
miami                               468.0
go                                  410.0
will                                393.0
beach                             

In [320]:
word_pairs = list(combinations(list(vector_model.wv.vocab.keys()),2))

In [321]:
tweet_graph = nx.Graph()

In [322]:
for pair in word_pairs:
    edge_weight = vector_model.wv.similarity(pair[0],pair[1])
    if edge_weight > .80:
        tweet_graph.add_edge(pair[0],pair[1],weight=edge_weight)

In [49]:
# tweet_graph.add_nodes_from(vector_model.wv.vocab.keys())

In [323]:
nx.write_gexf(tweet_graph,path=r'./tweet_graph.gexf')