# Import Libraries

In [18]:
import gensim
import numpy as np
import spacy
import pickle
import json
import matplotlib.pyplot as plt
import pymongo
import re
from nltk import ngrams
from string import punctuation
from bson.son import SON
from pymongo import MongoClient
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, MiniBatchKMeans
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
import pyLDAvis.gensim

import os, re, operator, warnings
warnings.filterwarnings('ignore')  # Let's not pay heed to them right now
%matplotlib inline

# Import Data

In [None]:
client = MongoClient(port=27017)
db = client.tweet_on_vegan
popular_tweets = db.streaming_tweets
texts = popular_tweets.find({}, {'_id':1,"id":1,"text":1,'is_retweet':1}).skip(400000).limit(100)
for text in texts:
    pass
    #print(text)

In [20]:
client = MongoClient(port=27017)
db = client.tweet_on_vegan
popular_tweets = db.streaming_tweets
texts = popular_tweets.find({'lang':'en'}, 
                            {'_id':0,"text":1})
tweets = [text['text'] for text in texts]

In [21]:
import pandas as pd
mvp_tweets = pd.DataFrame(tweets, columns=['tweets'])
mvp_tweets.sample(5)

Unnamed: 0,tweets
22554,Please RT? #vegetarian #vegan #healthyfood #re...
30641,Oh yes #Dairy farmers care.\nThey care about p...
135251,RT @BigHospitality: High profile US vegan rest...
372181,RT @janelleshakur: asap ferg is vegan NONE OF ...
253975,RT @taylorndean: That fox looks like it’s on h...


# Clean Data

* Remove urls
* Remove user names
* Remove line breaks
* Group tweets and count retweets

In [22]:
# http://stackoverflow.com/a/13752628/6762004
RE_EMOJI = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)
def clean_tweet(x):
    x = x.strip()
    x = RE_EMOJI.sub(r'', x)
    x = re.sub(r'http\S+', '', x, flags=re.MULTILINE)
    x = re.sub(r'RT\s@\S+', '', x, flags=re.MULTILINE)
    x = re.sub(r'@\S+', '', x, flags=re.MULTILINE)
    x = re.sub(r'\n', ' ', x, flags=re.MULTILINE)
    return (''.join(c for c in x if c not in punctuation)).strip()

mvp_tweets.tweets = mvp_tweets.tweets.apply(clean_tweet)

mvp_tweets = (pd.DataFrame(mvp_tweets.tweets.value_counts())
              .reset_index()
              .rename(columns={'tweets':'retweets', 'index': 'text'}))
mvp_tweets.sample(5)

Unnamed: 0,text,retweets
142411,Looking for healthy energy Stop by table to t...,1
132012,Retweeted Vegan Paradise My food is grown no...,1
88919,Sis is vegan and thicc my inspiration,1
146767,Nutrition Guidelines for Children Should Inclu...,1
16197,Please RT recipes food cooking delicious cook ...,2


In [25]:
# Checking
mvp_tweets.info()
mvp_tweets.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148187 entries, 0 to 148186
Data columns (total 2 columns):
text        148187 non-null object
retweets    148187 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.3+ MB


Unnamed: 0,text,retweets
0,I want to be a vegan but I’m hungry,15243
1,someone please save this fox from this crazy v...,14820
2,If you are vegan and don’t want to feed carniv...,10214
3,That fox looks like it’s on his death bed A ca...,7912
4,Y’all heard Beyoncé it’s VEGAN TIME,6003
5,Me I could never be a vegan Beyoncé Vegan tim...,5062
6,yes im vegan yes i eat meat we exist,4418
7,BREAKING NEWS Mariah Carey has officially beco...,4014
8,can people like NOT debate whether or not you ...,3859
9,are you going to be okay,3495


# Tokenization 

In [26]:
# Stop words
nlp = spacy.load("en")

my_stop_words = [u'vegan', u'amp', 
                 u"think", u"thinks", u"thought",
                 u"okay", u"Okay", u"OKAY",
                 u"want", u"wants",
                 u"like",
                 u"amp", u"vegan", u"Vegan", u"VEGAN"]
for stopword in my_stop_words:
    lexeme = nlp.vocab[stopword]
    lexeme.is_stop = True


In [27]:
from spacy.tokens import Token
from spacy.lang.en.stop_words import STOP_WORDS  # import stop words from language data
stop_words = STOP_WORDS | set(my_stop_words)
stop_words_getter = lambda token: token.is_stop or token.lower_ in stop_words or token.lemma_ in stop_words
Token.set_extension('is_stop', getter=stop_words_getter)  # set attribute with getter


In [28]:
# we add some words to the stop word list
tweets_spacy = []

for i, tweet in enumerate(mvp_tweets.text):
    if i % 1000 == 0:
        print(i)
    tweet_tokenized = []
    doc = nlp(tweet)
    for token in doc:
    # if it's not a stop word or punctuation mark, add it to our article!
        if not token.is_stop and not token.is_punct \
        and not token.like_num and not token.pos_ in ['SYM', 'SPACE']\
        and not token.lemma_ == '-PRON-' and token.__len__() >= 2:
            # we add the lematized version of the word
            tweet_tokenized.append(token.lemma_)
    tweets_spacy.append(tweet_tokenized)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
131000
132000
133000
134000
135000
136000
137000
138000
139000
140000
141000
142000
143000
144000
145000
146000
147000
148000


In [29]:
tweets_twograms = []
for i in range(len(tweets_spacy)):
    twograms = ngrams(tweets_spacy[i], 2)
    temp = []
    for grams in twograms:
        #pass
        temp.append(' '.join(grams))
        
    tweets_twograms.append(temp+tweets_spacy[i])


In [30]:
len(tweets_twograms)

148187

In [None]:
#for ent in tweet_tokenized.ents:
#    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [None]:
#for noun_chunks in doc.noun_chunks:
#    print(noun_chunks)

# Model 1
* TF-IDF (sklearn)
* LSA (gensim)

In [None]:
#import numpy as np
#from nltk.tokenize import word_tokenize, wordpunct_tokenize, WhitespaceTokenizer
#from nltk.chunk import ne_chunk
#from nltk.tag import pos_tag

# Text with some entities
#sample_tweet = np.array(foo.text)
#for t in sample_tweet[:5]:
#    tokens = pos_tag(word_tokenize(t))
#    entities = ne_chunk(tokens)

#foo.text.apply(lambda x: pos_tag(word_tokenize(x)))



## TF-IDF (sklearn)

In [31]:
from sklearn.feature_extraction import text

my_stop_words = text.ENGLISH_STOP_WORDS.union([u'vegan', u'amp', 
                 u"think", u"thinks", u"thought",
                 u"okay", u"Okay", u"OKAY",
                 u"want", u"wants"
                 u"amp", u"vegan", u"Vegan", u"VEGAN",
                 u"like"])

In [32]:
# Import
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.lancaster import LancasterStemmer

# This uses WordNet (huge lexical database of English words)
stemmer = LancasterStemmer()
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

#tfidf = TfidfVectorizer(tokenizer=lambda i:i, lowercase=False)

# Create a vectorizer object to generate term document counts
# Note all the parameters we can use, let's play!
tfidf = TfidfVectorizer(tokenizer=lambda i:i, lowercase=False,
                        stop_words = my_stop_words, 
                        min_df=100, max_df=0.8, ngram_range=(1,2))
                        #analyzer=stemmed_words)
#tokenizer=lambda i:i, lowercase=False,    

# Get the vectors
sample_tweet_train = tfidf.fit_transform(tweets_twograms)
# Store them in a Pandas DataFrame
#sample_tweet_train_df = pd.DataFrame(sample_tweet_train.todense(), 
                           #columns=[tfidf.get_feature_names()])
#sample_tweet_train_df.head()

In [33]:
len(tfidf.get_feature_names())

2330

## LSA (gensim)

In [34]:
# Import
from gensim import corpora, models, similarities, matutils

In [35]:

# Convert sparse matrix of counts to a gensim corpus
# Need to transpose it for gensim which wants 
# terms by docs instead of docs by terms
tfidf_corpus = matutils.Sparse2Corpus(sample_tweet_train.transpose())

# Row indices
id2word = dict((v, k) for k, v in tfidf.vocabulary_.items())

# This is a hack for Python 3!
id2word = corpora.Dictionary.from_corpus(tfidf_corpus, 
                                         id2word=id2word)

# Build an LSI space from the input TFIDF matrix, mapping of row id to word, and num_topics
# num_topics is the number of dimensions to reduce to after the SVD
# Analagous to "fit" in sklearn, it primes an LSI space
lsi = models.LsiModel(tfidf_corpus, id2word=id2word, num_topics=100)

# Retrieve vectors for the original tfidf corpus in the LSI space ("transform" in sklearn)
lsi_corpus = lsi[tfidf_corpus]

# Dump the resulting document vectors into a list so we can take a look
doc_vecs = [doc for doc in lsi_corpus]

In [36]:
# Create an index transformer that calculates similarity based on 
# our space
index = similarities.MatrixSimilarity(doc_vecs, 
                                      num_features=100)

In [None]:
# Return the sorted list of cosine similarities to the first document
sims = sorted(enumerate(index[doc_vecs[7]]), key=lambda item: -item[1])

#similar_tweet = {}
similar_tweet = []
count = 0
for tweet, similarity in sims:
    if similarity >= 0.8:
        #pass
        count += 1
        print(mvp_tweets.loc[tweet, 'text'])
        #similar_tweet.append(tweet)

# KMean

In [None]:
# Convert the gensim-style corpus vecs to a numpy array for sklearn manipulations
ng_lsi = matutils.corpus2dense(lsi_corpus, num_terms=100).transpose()
ng_lsi.shape

In [None]:
scaler = StandardScaler()
ng_lsi_standard = scaler.fit_transform(ng_lsi)

In [None]:
from sklearn.metrics import silhouette_score

SSEs = []
Sil_coefs = []
for k in range(2,52,5):
    print(k)
    #km = KMeans(n_clusters=k, random_state=1)
    km = MiniBatchKMeans(n_clusters=k)
    km.fit(ng_lsi_standard)
    labels = km.labels_
    Sil_coefs.append(silhouette_score(ng_lsi_standard, labels, metric='euclidean'))
    SSEs.append(km.inertia_) 
    
   # print(Sil_coefs)
   # print(labels)

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(15,5), sharex=True, dpi=200)
k_clusters = range(2,52,5)
ax1.plot(k_clusters, Sil_coefs)
ax1.set_xlabel('number of clusters')
ax1.set_ylabel('silhouette coefficient')

# plot here on ax2
ax2.plot(k_clusters, SSEs)
ax2.set_xlabel('number of clusters')
ax2.set_ylabel('SSE');

In [None]:
# Create KMeans
kmeans = MiniBatchKMeans(n_clusters=22)

# Cluster
ng_lsi_clusters = kmeans.fit_predict(ng_lsi_standard)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.hist(ng_lsi_clusters, bins=21)

In [None]:
for i, tweet in enumerate(mvp_tweets.text):
    if ng_lsi_clusters[i] == 3:
        print(tweet)

# Model 2

* CountVecorizer
* LDA

In [69]:
# Import
from nltk.stem.lancaster import LancasterStemmer

# This uses WordNet (huge lexical database of English words)
stemmer = LancasterStemmer()
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

count_vectorizer = CountVectorizer(tokenizer=lambda i:i, lowercase=False,
                                   stop_words = my_stop_words, 
                                   min_df=100, max_df=0.8)
                                  #analyzer=stemmed_words)

# Create a vectorizer object to generate term document counts
# Note all the parameters we can use, let's play!
#cv = CountVectorizer(token_pattern='[a-z][a-z]+',
#                     stop_words = 'english')
#tokenizer=lambda i:i, lowercase=False,
#token_pattern='[a-z][a-z]+', ngram_range=(1,2),
# Get the vectors
count_vectorizer.fit(tweets_twograms)
counts = count_vectorizer.transform(tweets_twograms).transpose()
# Store them in a Pandas DataFrame
#sample_tweet_train_df = pd.DataFrame(sample_tweet_train.todense(), columns=[stem_vectorizer.get_feature_names()])
#ng_train_df.head()

In [71]:
bag_of_words = counts.transpose()

In [72]:
bag_of_words.shape

(148187, 2218)

In [73]:
bag_of_words = np.sum(bag_of_words, axis=0)

In [74]:
freq = list(zip(count_vectorizer.get_feature_names(), np.squeeze(np.asarray(bag_of_words))))

In [75]:
freq

[('1st', 192),
 ('able', 406),
 ('abs', 197),
 ('abs exercise', 164),
 ('absolute', 107),
 ('absolutely', 423),
 ('abuse', 567),
 ('accept', 129),
 ('accidentally', 150),
 ('accord', 136),
 ('account', 261),
 ('achieve', 217),
 ('acid', 135),
 ('acne', 112),
 ('act', 287),
 ('action', 155),
 ('activism', 167),
 ('activist', 335),
 ('actual', 210),
 ('actually', 1558),
 ('ad', 342),
 ('add', 1834),
 ('add closet', 195),
 ('add video', 521),
 ('addition', 101),
 ('ado1305', 214),
 ('ado1305 save', 143),
 ('adopt', 201),
 ('adventure', 129),
 ('advice', 220),
 ('advocate', 147),
 ('af', 289),
 ('affect', 102),
 ('afford', 224),
 ('affordable', 109),
 ('afternoon', 165),
 ('age', 229),
 ('ago', 487),
 ('agree', 482),
 ('agriculture', 185),
 ('ahead', 261),
 ('air', 391),
 ('aka', 130),
 ('alcohol', 123),
 ('alert', 101),
 ('alfredo', 147),
 ('alive', 325),
 ('alive say', 131),
 ('allergic', 134),
 ('allergy', 187),
 ('allnatural', 111),
 ('allow', 249),
 ('almond', 894),
 ('almond milk', 3

In [76]:
sort_word_freq = sorted(freq, key=lambda x: -x[1])

In [77]:
sort_word_freq

[('eat', 13447),
 ('food', 11505),
 ('good', 9376),
 ('recipe', 9134),
 ('’', 8921),
 ('animal', 8128),
 ('vegetarian', 7251),
 ('meat', 6863),
 ('diet', 6853),
 ('try', 6414),
 ('love', 6368),
 ('people', 6090),
 ('day', 5850),
 ('know', 5568),
 ('free', 5188),
 ('healthy', 5142),
 ('new', 5128),
 ('time', 4506),
 ('make', 4402),
 ('need', 4395),
 ('look', 4232),
 ('cheese', 4171),
 ('health', 3951),
 ('glutenfree', 3640),
 ('say', 3560),
 ('today', 3499),
 ('thank', 3443),
 ('delicious', 3437),
 ('thing', 3404),
 ('come', 3375),
 ('organic', 3372),
 ('life', 3291),
 ('chocolate', 3244),
 ('milk', 3085),
 ('great', 2924),
 ('’s', 2888),
 ('year', 2862),
 ('video', 2812),
 ('product', 2712),
 ('way', 2668),
 ('easy', 2662),
 ('meal', 2568),
 ('feel', 2556),
 ('live', 2552),
 ('tell', 2547),
 ('veganism', 2517),
 ('rt', 2472),
 ('dairy', 2448),
 ('raw', 2433),
 ('start', 2400),
 ('veggie', 2398),
 ('week', 2398),
 ('option', 2374),
 ('chicken', 2364),
 ('plant', 2359),
 ('beyoncé', 2327

In [1]:
import pickle
sort_word_freq_customized = pickle.load(open('sort_word_freq_customized', 'rb'))
sort_word_freq = pickle.load(open('sort_word_freq', 'rb'))

In [17]:
for word, number in sort_word_freq[:60]:
    print(str(number//300)+'\t'+word)

35	just
34	food
25	eat
23	vegetarian
22	im
22	meat
20	diet
20	people
19	going
19	recipe
19	make
18	love
18	good
17	free
17	new
15	recipes
15	animals
15	healthy
15	know
15	day
14	eating
14	plantbased
13	time
13	dont
13	cheese
13	health
12	vegans
12	glutenfree
12	animal
11	try
11	need
11	don
11	delicious
11	today
11	organic
11	really
10	chocolate
10	best
10	life
10	milk
9	great
8	got
8	video
8	veganism
8	rt
8	dairy
8	raw
8	beyonc
7	way
7	easy
7	ve
7	products
7	lol
7	chicken
7	protein
7	cooking
7	thanks
6	say
6	pizza
6	thing


In [None]:
corpus = matutils.Sparse2Corpus(counts)

id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())

lda = models.LdaModel(corpus=corpus, num_topics=20, id2word=id2word, passes=20)

lda.print_topics(num_words=20)


# Gensim Model

In [None]:
dictionary = Dictionary(tweets_twograms)
corpus = [dictionary.doc2bow(tweet) for tweet in tweets_twograms]

# LSI

In [None]:
lsimodel = LsiModel(corpus=corpus, num_topics=20, id2word=dictionary)
lsimodel.show_topics(num_topics=20)  # Showing only the top 5 topics

# HDP

In [None]:
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
hdpmodel.show_topics()

# LDA

In [None]:
ldamodel = LdaModel(corpus=corpus, num_topics=10, passes=20, id2word=dictionary)
ldamodel.show_topics()

# pyLDAvis

In [None]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)