# Monitoring changes in related words over time.

### This notebook shows how the words most similar to a chosen search term change from one time window to the next.

In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
from math import ceil
import string
from itertools import combinations
import networkx as nx
import re

In [2]:
# Field names for the header-less CSV export, listed in file order.
columns = [
    'tweet_id', 'timestamp', 'tweet_text', 'user_id',
    'tweet_coords', 'tweet_coords_list', 'tweet_long', 'tweet_lat', 'location',
    'enc_url', 'tweet_lang', 'hashtags',
]

# Load every tweet. Column 1 ('timestamp') is parsed as datetimes and becomes
# the index, which the later .loc[start:end] window slices rely on.
tweet_full = pd.read_csv(
    r'./tweetCoords.csv',
    names=columns,
    header=None,
    index_col='timestamp',
    parse_dates=[1],
    infer_datetime_format=True,
)

Create a separate dataframe containing only the tweets classified as English.

In [3]:
# Keep only the rows whose language tag is 'en'.
tweet_full_en = tweet_full.loc[tweet_full['tweet_lang'].eq('en')]

Define a custom text cleaner. It is currently configured to remove all punctuation _except `#`_, so hashtags survive cleaning.

In [5]:
# NLTK's English stopwords, held in a set so the per-token membership test in
# tokens_no_stopwords is O(1) instead of a list scan.
# clean_tweet() strips apostrophes before tokenising, so a contraction such as
# "don't" reaches the stopword filter as "dont". The punctuation-free spelling
# of every stopword is therefore stored alongside the original spelling.
_strip_punct = str.maketrans('', '', string.punctuation)
tweet_stops = {
    variant
    for stop_word in stopwords.words('english')
    for variant in (stop_word, stop_word.translate(_strip_punct))
}
# Tokenizer settings: drop @handles, lowercase everything, and shorten long
# runs of a repeated character (e.g. "SOOOOO" -> "sooo").
tweet_tokenizer = TweetTokenizer(strip_handles=True,preserve_case=False,reduce_len=True)

def clean_tweet(tweet):
    """Normalise a raw tweet string ahead of tokenisation.

    Lowercases the text, then strips URLs, @mentions, all punctuation except
    '#', the decoded HTML-entity leftovers 'amp'/'gt'/'lt', words of one to
    three characters, and standalone numbers. Finally any '#' that is no
    longer attached to a word is removed. Removed spans are replaced by the
    empty string, so runs of whitespace are left in place.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text; surviving hashtags keep their leading '#'.
    """
    # All patterns are raw strings so that \S, \. and \B reach the regex
    # engine instead of being treated as (invalid) string escapes.
    tweet = tweet.lower()
    # remove URLs:
    tweet = re.sub(r'https?://\S+', '', tweet)
    # remove @mentions, including those with a leading '-' or '.':
    tweet = re.sub(r'[-\.]?@\w+', '', tweet)
    # remove punctuation, but not hashtags:
    tweet = tweet.translate(str.maketrans('', '', string.punctuation.replace("#", "")))
    # remove 'amp', 'gt', 'lt', left behind by decoded &amp; &gt; &lt; entities:
    tweet = re.sub(r'\b(amp|gt|lt)\b', '', tweet)
    # drop words of < 4 characters, then standalone numbers:
    tweet = re.sub(r'\b\w{1,3}\b', '', tweet)
    tweet = re.sub(r'\b\d+\b', '', tweet)
    # remove non-hashtag '#'. This runs last because the two removals above
    # can strip the word after a '#' (e.g. "#in", "#2017") and would otherwise
    # leave a bare '#' behind as its own token.
    tweet = re.sub(r'#\B', '', tweet)
    return tweet

def tokens_no_stopwords(tweet_as_string):
    """Clean, tokenise, and stopword-filter a single tweet.

    Takes the raw tweet text as a string and returns a list of token strings:
    the text is passed through clean_tweet, split by tweet_tokenizer, and any
    token found in tweet_stops is discarded.
    """
    kept_tokens = []
    for token in tweet_tokenizer.tokenize(clean_tweet(tweet_as_string)):
        if token in tweet_stops:
            continue
        kept_tokens.append(token)
    return kept_tokens
    

In [6]:
# Two sample inputs for the cleaning pipeline. The synthetic string packs in
# edge cases (mentions, stray '#', decoded entities, numbers) and is kept under
# its own name so it is not overwritten; the realistic tweet is the one that is
# bound to re_text and run through the pipeline below.
re_text_edge_cases = "this is ! A TWEETlt withgtamp @some 3445 as .@random the amp gt lt @@extra #stuff ##in IT!?@>#! "
re_text = "OMGGGG @username @username2 Hurricane Irma!!!!!!!! SOOOOO much rain!!!!! I can't believe all of this water everywhere!"
print(len(re_text))
print(tokens_no_stopwords(re_text))

118
['omggg', 'hurricane', 'irma', 'sooo', 'much', 'rain', 'cant', 'believe', 'water', 'everywhere']


What's the word we're comparing similarity to?

In [7]:
# Word whose most-similar neighbours are looked up in each window's model.
search_term = 'irma'

Starting here, begin the iteration over times.

In [8]:
# Accumulator for the per-window similarity tables.
related_words = pd.DataFrame()
# Start of the first window.
tweet_date = pd.to_datetime("2017-09-10 00:00:00")
# Window length. The keyword form states the unit explicitly instead of
# relying on pandas' parsing of the non-canonical unit string "24HR".
date_delta = pd.Timedelta(hours=24)
# Start of the last window; equal to tweet_date, so a single window is processed.
end_date = pd.to_datetime("2017-09-10 00:00:00")

In [60]:
# How many of the most similar words to keep in each window's ordered list.
top_num_words = 200
# A word must occur at least this fraction of the window's tweet count
# (count >= fraction * number of tweets) to pass the occurrence threshold.
pct_occ_thresh = 0.01

List of words from this time frame, based upon the occurrence threshold above:

In [None]:
# Tweets inside the first window (label slicing on the datetime index
# includes both end points).
tweet_window = tweet_full_en.loc[tweet_date:tweet_date+date_delta]
num_tweets = len(tweet_window)
# Smallest occurrence count that satisfies the percentage threshold.
min_count = ceil(num_tweets * pct_occ_thresh)
# One list of cleaned, stopword-free tokens per tweet.
tweet_words = tweet_window['tweet_text'].apply(tokens_no_stopwords)
# Flatten the per-tweet token lists and count every occurrence. A flat list
# avoids building one pd.Series per tweet (apply(pd.Series)), which is very
# slow on large windows; an empty window simply gives an empty count series.
all_tokens = [token for tokens in tweet_words for token in tokens]
word_counts = pd.Series(all_tokens, dtype=object).value_counts()

In [None]:
# Inspect the last row of the current window.
tweet_window.iloc[-1]

In [None]:
# Look up the row(s) stamped exactly 2017-09-12 00:00:28.
# NOTE(review): tweet_window is sliced from tweet_date (2017-09-10) over one
# 24-hour date_delta, so this label looks like it falls outside the window as
# configured in this notebook — confirm which window this was meant for.
tweet_window.loc['2017-09-12 00:00:28']

In [None]:
# The distinct tokens seen in the window (index of the count series).
word_counts.index

Histogram of word counts. The vertical line marks the occurrence threshold, i.e. 1% of the number of tweets in the window.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Distribution of occurrence counts for the words that meet the threshold.
hist_fig, hist_ax = plt.subplots(figsize=(12,8))

# '>=' keeps words that occur exactly min_count times, matching the
# "number of times >= this percent" rule stated for pct_occ_thresh.
hist_ax.hist(word_counts[word_counts >= min_count], bins=55, rwidth=.85)
# The red line marks the occurrence threshold itself.
hist_ax.axvline(min_count, color='r')
hist_ax.set(title='Word occurrence counts at or above the threshold',
            xlabel='occurrences in window',
            ylabel='number of words');

In [None]:
# Words that meet the occurrence threshold. '>=' (rather than '>') includes
# words occurring exactly min_count times, as the pct_occ_thresh rule states.
word_counts[word_counts >= min_count]

Stats on tokenized tweets. Starting with histogram of tweet length by word count, after processing into tokens.

In [None]:
# tweet_window['tweet_text'].head(20).apply(tokens_no_stopwords).apply(len)

In [None]:
# Tokens per tweet after cleaning and stopword removal. tweet_words already
# holds tokens_no_stopwords applied to this window's tweet_text, so it is
# reused here instead of tokenising the whole window a second time.
hist_fig2, hist_ax2 = plt.subplots(figsize=(12,8))
hist_ax2.hist(tweet_words.apply(len), bins=22, rwidth=.85)
hist_ax2.set(title='Tweet length after tokenisation',
             xlabel='tokens per tweet',
             ylabel='number of tweets');

Note: there is currently an incompatibility between gensim and numpy > 1.13.

In [61]:
# Train one skip-gram Word2Vec model per time window and record the words most
# similar to search_term in each. Every window contributes two columns to
# related_words: the similar words (labelled with the window start) and their
# cosine similarities ("Cos_Sim").
window_frames = []
for tweet_day in pd.date_range(start = tweet_date, end = end_date, freq = date_delta):
    tweet_text = tweet_full_en.loc[tweet_day:tweet_day + date_delta,"tweet_text"]
    # Every token is kept for training. The percentage-based alternative,
    # ceil(len(tweet_text) * pct_occ_thresh), is currently disabled.
    min_count = 1

    tweets_tokens = tweet_text.apply(tokens_no_stopwords)
    vector_model = Word2Vec(tweets_tokens, min_count=min_count, sg=1, window=3, workers=5, size=100)
    # Embedding matrix for the whole vocabulary (input for an optional
    # PCA / t-SNE projection).
    word_matrix = vector_model.wv[vector_model.wv.vocab]

    terms_from_range = pd.DataFrame.from_records(
        vector_model.wv.most_similar(search_term,topn=top_num_words),
        columns=[tweet_day,"Cos_Sim"])
    window_frames.append(terms_from_range)

# Build related_words with a single concat instead of growing it inside the
# loop. It is rebuilt from scratch, so re-running this cell does not append
# duplicate columns to an earlier result.
related_words = pd.concat(window_frames, axis=1) if window_frames else pd.DataFrame()

  if np.issubdtype(vec.dtype, np.int):


In [None]:
# First column of the most recent similarity table: the related words themselves.
query_terms = terms_from_range.iloc[:, 0].tolist()

In [None]:
# Add the search term itself to the query list. search_term is used instead of
# a repeated "irma" literal, and the append is skipped when the term is already
# present so that re-running this cell does not add duplicates.
if search_term not in query_terms:
    query_terms.append(search_term)

In [None]:
# Display the query list built in the preceding cells.
query_terms

In [None]:
# Related words from the first column of terms_from_range, as a plain list.
list(terms_from_range.iloc[:,0])

In [None]:
"|".join(list(terms_from_range.iloc[:,0]))

In [None]:
# Cosine similarity between the search term and "storm" in the latest model.
# search_term is used so this stays in step with the configured word.
vector_model.wv.similarity(search_term,"storm")

In [None]:
# Number of distinct words in the latest model's vocabulary.
len(vector_model.wv.vocab)

In [None]:
# Dimensionality of the word vectors in the latest model.
vector_model.wv.vector_size

In [None]:
# Raw embedding vector of the search term in the latest model. search_term is
# used so this stays in step with the configured word.
vector_model.wv.get_vector(search_term)

In [None]:
# vector_model.wv.get_vector("storm").shape

In [None]:
# vector_model.wv.similarity("storm","rain")

In [None]:
# vector_model.wv.vocab

In [None]:
# First column of related_words: the similar words for the first window.
related_words.iloc[:,0]

In [None]:
# Similar words for the first window, as a plain Python list.
terms = related_words.iloc[:, 0].tolist()

In [None]:
# Displays the regex flag constant only; nothing is assigned or changed here.
re.IGNORECASE

In [None]:
"|".join(terms)

In [None]:
# Number of tweets in the latest window whose text contains the search term,
# ignoring case. search_term is used instead of a repeated "irma" literal.
len(tweet_text[tweet_text.str.contains(search_term,flags=re.IGNORECASE)])

In [None]:
# Number of tweets in the latest window containing at least one related term,
# ignoring case.
terms_pattern = "|".join(terms)
matches_any_term = tweet_text.str.contains(terms_pattern, flags=re.IGNORECASE)
len(tweet_text[matches_any_term])

In [None]:
# Total number of tweets in the latest window, for comparison with the counts above.
len(tweet_text)

In [None]:
# Shell escape: print the working directory that the relative paths in this
# notebook are resolved against.
!pwd

In [None]:
# related_words alternates a word column and a "Cos_Sim" column per window,
# so every second column starting at 0 holds the words. Export only those.
word_columns = related_words.iloc[:, ::2]
word_columns.to_csv(r'daily_words2.csv')

In [None]:
# tweet_text[(tweet_text.str.contains(r"\bstorm\b",regex=True)) & (tweet_text.str.contains(r"\bdamage\b",regex=True))].values

In [None]:
# tweet_text[(tweet_text.str.contains(r"\bstorm\b",regex=True)) & (tweet_text.str.contains(r"\bhelping\b",regex=True))].values

Comparing words to hashtags:

In [None]:
# Raw (uncleaned) token frequencies for the window: lowercase the text, split
# on whitespace, and count every token. The default whitespace split is used
# because splitting on the regex r'\s+' yields an empty-string token for any
# tweet with leading or trailing whitespace, which would then be counted as a word.
word_list = tweet_window.tweet_text.str.lower().str.split(expand=True).stack().value_counts()

In [None]:
# Counts for the tokens that begin with '#'.
is_hashtag = word_list.index.str.startswith('#')
hashtags_count = word_list[is_hashtag]

In [None]:
# The hashtag tokens with their leading '#' removed, as an array.
hashtag_words = hashtags_count.index.str.slice(1).values

In [None]:
# Display the raw token frequencies.
word_list

In [None]:
# Occurrence count of the '#hurricaneirma' hashtag token.
word_list['#hurricaneirma']

In [None]:
# Tokens whose SECOND character is '@' (index 1, not 0), i.e. tokens with a
# single character in front of the '@', such as '.@user'.
word_list[word_list.index.str[1]=='@'].index

In [None]:
# How often each hashtag's text also appears as a plain (non-hashtag) token.
# reindex is used instead of list-label indexing so that hashtag words which
# never occur as a plain token show up as NaN rather than raising KeyError.
word_list.reindex(hashtag_words).sort_values(ascending=False)

In [None]:
# Every unordered pair of vocabulary words from the latest model.
vocab_words = vector_model.wv.vocab.keys()
word_pairs = list(combinations(vocab_words, 2))

In [None]:
# Empty undirected graph that will hold the word-similarity edges.
tweet_graph = nx.Graph()

In [None]:
# Connect every pair of words whose similarity in the latest model exceeds
# 0.80, storing that similarity as the edge weight.
for first_word, second_word in word_pairs:
    pair_similarity = vector_model.wv.similarity(first_word, second_word)
    if not pair_similarity > .80:
        continue
    tweet_graph.add_edge(first_word, second_word, weight=pair_similarity)

In [None]:
# tweet_graph.add_nodes_from(vector_model.wv.vocab.keys())

In [None]:
# Write the similarity graph to a GEXF file in the working directory.
nx.write_gexf(tweet_graph,path=r'./tweet_graph.gexf')