In [20]:

% matplotlib inline

# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tweepy as tw
from collections import Counter
from itertools import combinations
import string
import networkx as nx
import matplotlib as mpl
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups 
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances
from nltk.corpus import stopwords

# These two lines suppress warnings that sometimes
# occur when making visualizations
import warnings
warnings.filterwarnings('ignore')


### Twiiter Authorization

In [2]:
tokens = []
# Order: Access Token, Access Token Secret, Consumer Key, Consumer SecretAccess
with open("twitter_cred.txt", 'r') as fin:
    for line in fin:
        if line[0] != '#': # Not a comment line
            tokens.append(line.rstrip('\n'))

auth = tw.OAuthHandler(tokens[2], tokens[3])
auth.set_access_token(tokens[0], tokens[1])
api = tw.API(auth, wait_on_rate_limit=True)
user = api.me()
print("Twitter Screen Name: ", user.screen_name)


Twitter Screen Name:  amit_darekar27


### Collect Tweets

In [3]:
tweets = []
name = 'nytimes' # Screen Name of the user
num_tweets = 200

# Get the tweets
results = api.user_timeline(screen_name= name, count=num_tweets)
    
# Build list from Twitter search results
for result in results:
    tweets.append(result.text)

In [6]:
name = 'justinbieber' # Screen Name of the user

# Get the tweets
results2 = api.user_timeline(screen_name= name, count=num_tweets)
    
# Build list from Twitter search results
for result in results2:
    tweets.append(result.text)

In [8]:
name = 'KimKardashian' # Screen Name of the user

# Get the tweets
results3 = api.user_timeline(screen_name= name, count=num_tweets)
    
# Build list from Twitter search results
for result in results3:
    tweets.append(result.text)

In [9]:
name = 'realDonaldTrump' # Screen Name of the user

# Get the tweets
results4 = api.user_timeline(screen_name= name, count=num_tweets)
    
# Build list from Twitter search results
for result in results4:
    tweets.append(result.text)

In [10]:
name = 'NatGeo' # Screen Name of the user

# Get the tweets
results5 = api.user_timeline(screen_name= name, count=num_tweets)
    
# Build list from Twitter search results
for result in results5:
    tweets.append(result.text)

In [11]:
name = 'richardbranson' # Screen Name of the user

# Get the tweets
results6 = api.user_timeline(screen_name= name, count=num_tweets)
    
# Build list from Twitter search results
for result in results6:
    tweets.append(result.text)

In [12]:
name = 'elonmusk' # Screen Name of the user

# Get the tweets
results7 = api.user_timeline(screen_name= name, count=num_tweets)
    
# Build list from Twitter search results
for result in results7:
    tweets.append(result.text)

### Feature Extraction

In [48]:
#Paramter
NUM_TOPICS = 10

# To remove English stopword + word 'https'
stop_words = set(stopwords.words('english'))
stop_words.update(['https'])

#Convert a collection of text documents to a matrix of token counts
vectorizer = CountVectorizer(
                             stop_words=stop_words, lowercase=True, 
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(tweets)
 
# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_topics=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 
# Different topic distribution in the first document
print('Different topic distribution in the first document')
print(lda_Z[0])
print('\nDifferent topic distribution in the last document')
print(lda_Z[-1])

  token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')


(1399, 10)
Different topic distribution in the first document
[0.00769231 0.00769252 0.00769253 0.00769236 0.0076926  0.00769231
 0.00769456 0.00769256 0.00769301 0.93076525]

Different topic distribution in the last document
[0.07332421 0.00666716 0.74001646 0.00666667 0.00666667 0.00666667
 0.00666667 0.0066667  0.07332599 0.07333283]


### Inspect Each Topic

In [40]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
print("LDA Model:")
print_topics(lda_model, vectorizer)
print("=" * 20)

LDA Model:
Topic 0:
[('space', 22.583956653907), ('everyone', 18.65035553794515), ('first', 16.854050720277556), ('pobz', 16.536524039249986), ('look', 16.509928792595336), ('bhjs', 15.036539406190334), ('one', 14.912388687833), ('tomorrow', 13.538251795631568), ('million', 12.980755614099639), ('got', 12.66001518112506)]
Topic 1:
[('send', 18.2042631599868), ('give', 16.437083376219075), ('copy', 15.576774858068921), ('way', 15.250798627891612), ('love', 14.370731921348757), ('happy', 12.281562619482454), ('thanks', 12.109958569465919), ('hard', 10.870159381606204), ('years', 10.425235594640455), ('rock', 10.305515958818983)]
Topic 2:
[('first', 17.921212988572652), ('justinbieber', 17.636464552640163), ('want', 16.0369398489275), ('like', 16.02248407087329), ('mario', 14.297258662937047), ('boring', 12.330256077028046), ('story', 11.259476326931795), ('kkw', 11.146177029651803), ('exciting', 9.912806231871746), ('amp', 9.14013654184014)]
Topic 3:
[('great', 58.38938826489664), ('amp'

In [41]:
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
output_notebook()

### Plotting words and documents in 2D

In [42]:
svd = TruncatedSVD(n_components=2)
documents_2d = svd.fit_transform(data_vectorized)
df = pd.DataFrame(columns=['x', 'y', 'document'])
df['x'], df['y'], df['document'] = documents_2d[:,0], documents_2d[:,1], range(len(tweets))
source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="document", y_offset=8,
text_font_size="8pt", text_color="#555555",
source=source, text_align='center')
plot = figure(plot_width=300, plot_height=300)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)

In [43]:
svd = TruncatedSVD(n_components=2)
words_2d = svd.fit_transform(data_vectorized.T)
 
df = pd.DataFrame(columns=['x', 'y', 'word'])
df['x'], df['y'], df['word'] = words_2d[:,0], words_2d[:,1], vectorizer.get_feature_names()
 
source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="word", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
 
plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)

In [44]:
import pyLDAvis.sklearn
 
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [32]:
from bokeh.plotting import figure
from bokeh.resources import CDN
from bokeh.embed import file_html

plot = figure()

html = file_html(plot, CDN, "my plot")
#html.show()

W-1001 (NO_DATA_RENDERERS): Plot has no data renderers: Figure(id='f6f559df-8d42-4ab7-af38-e31e494f4649', ...)
