# Group Midterm - Text analysis and visualization

The group members of this project are:
* Devdhar Patel
* Abhishek Singhal

Notes for running this notebook: 
```pip install wordcloud```

## The Dataset
The data was scraped off the twitter website using [twitterscraper](https://github.com/taspinar/twitterscraper). We scraped 10,000 tweets with 'Big Data' in the text.

In [100]:
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from collections import Counter
import nltk
import string
import math
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, Select,LabelSet, HoverTool, BoxZoomTool,PanTool,WheelZoomTool,ResetTool
from datetime import timedelta
import datetime
from wordcloud import WordCloud
from PIL import Image
from bokeh.application.handlers import FunctionHandler
from bokeh.application import Application
from bokeh.layouts import Row, widgetbox, Column
import re
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer,HashingVectorizer,CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.manifold import MDS
from sklearn.decomposition import TruncatedSVD
from bokeh.palettes import Category10
from sklearn.cluster import KMeans


output_notebook()


df = pd.read_json('tweets.json')
df.head()

Unnamed: 0,fullname,id,likes,replies,retweets,text,timestamp,user
0,Norbert Marek,918106625423982592,0,0,0,"Join us for a #Vertica ""Deep Dive"" in LA on Oc...",2017-10-11 13:30:51,NorbertMarek
1,Jeff MAURY,918106691652210688,0,0,1,3 Machine Learning Algorithms You Need to Know...,2017-10-11 13:31:07,jeffmaury
2,RACV,918106706651041792,0,0,0,RT@ ipfconline1: 5 Actual #BigData Uses for Hu...,2017-10-11 13:31:11,racvfr
3,Optimize Intl,918106708060311552,0,0,0,Turning #bigdata into business insights: The s...,2017-10-11 13:31:11,CIO_Success
4,Nicholas O'Brien,918106739731521536,0,0,0,The latest The Nicholas O'Brien Daily! http://...,2017-10-11 13:31:19,NickO_Brien


## Visualizaions
### 1. Word Cloud
We create a word cloud based on the text of the tweets using a [word cloud generator](https://github.com/amueller/word_cloud). To install this word cloud generator, run:

```pip install wordcloud```

In [52]:
minDate = min(df['timestamp']).date()
maxDate = max(df['timestamp']).date()
print('Min Date:', minDate)
print('Max Date:', maxDate)

Min Date: 2017-10-11
Max Date: 2017-10-22


In [53]:
dateList = []
currentDate = minDate
dateList.append(currentDate)
while(currentDate < maxDate):
    currentDate += timedelta(days=1)
    dateList.append(currentDate)

dateStrings = []
for date in dateList:
    dateStrings.append(date.strftime('%a,%d %b'))

In [54]:
#The code below is commented to avoid generating a wordcloud every time the notebook is run.
# wordcloud = WordCloud().generate(' '.join(list(df['text'])))
# image = wordcloud.to_image()
# image.save('wordcloud.bmp')

# for index,date in enumerate(dateList):
#     start = datetime.datetime(date.year,date.month,date.day,0,0,0)
#     end = datetime.datetime(date.year,date.month,date.day, 23,59,59)
#     tweets = df.loc[(start <= df['timestamp']) & (end >= df['timestamp'])]
#     wordcloud = WordCloud().generate(' '.join(list(df['text'])))
#     image = wordcloud.to_image()
#     image.save('wordcloud'+str(chr(97 + index))+'.bmp')

def modify_doc1(doc):
       
    def update(attr, old, new):
        layout.children[1] = create_figure()
        
    
    options = list(dateStrings)
    options.append('All Days')
    dateSelect = Select(title="Date:", options=options, value=options[-1])
    dateSelect.on_change('value', update)
    
    
    def create_figure():
        p = figure(x_range=(0,2), y_range=(0,2), width=600, height=300, tools="pan, wheel_zoom, reset")
        p.xgrid.grid_line_color = None
        p.ygrid.grid_line_color = None
        p.axis.visible = False
        if dateSelect.value == 'All Days':
            url = 'wordcloud.bmp'
        else:
            url = 'wordcloud' + str(chr(97+options.index(dateSelect.value))) + '.bmp'
        p.image_url(url=[url], x=0, y=2,w=2, h=2 )
        return p
    p = create_figure()
    layout = Column(dateSelect, p)
    doc.add_root(layout)
    

handler1 = FunctionHandler(modify_doc1)
app1 = Application(handler1)
app1.create_document()
show(app1, notebook_url="localhost:8888")

**Observation:** As we can see from the generated image, the words big data nad twitter are some of the most common words which is to be expected. Howver, some one the other common words are associated with links: https, goo, gl, ly, etc. Other notable words are: machine learning, paper, web and status.

### 2. Tweet frequencies of different hashtags
Next we will create a tweet frequency graph over time for the top 20 different hashtags. To do this, we will first need to extract all the hashtags from the text of the tweets.

In [55]:
hashtagsColumn = []
hashtagDictionary = {}
for index,row in df.iterrows():
    words = row['text'].split()
    hashtags = []
    for word in words:
        if(word[0] == '#'):
            hashtags.append(word)
            if word in hashtagDictionary:
                hashtagDictionary[word] += 1
            else:
                hashtagDictionary[word] = 1
            
    hashtagsColumn.append(hashtags)

df['hashtags'] = hashtagsColumn
hashtagSeries = pd.Series(hashtagDictionary)
hashtagSeries = hashtagSeries.sort_values(ascending=False)


In [56]:
dateStrings = []
for date in dateList:
    dateStrings.append(date.strftime('%a,%d %b'))
        

def modify_doc(doc):
       
    def update(attr, old, new):
        layout.children[1] = create_figure()
    
    options = list(hashtagSeries.keys())[0:20]
    options.append('All Tweets')
    hashtagSelect = Select(title="Hashtag:", options=options, value=hashtagSeries.keys()[0])
    hashtagSelect.on_change('value', update)
    
    
    def create_figure():
        hashTagCount = []
        for date in dateList:
            start = datetime.datetime(date.year,date.month,date.day,0,0,0)
            end = datetime.datetime(date.year,date.month,date.day, 23,59,59)
            tweets = df.loc[(start <= df['timestamp']) & (end >= df['timestamp'])]
            count = 0
            if(hashtagSelect.value == 'All Tweets'):
                count += len(tweets)
            else:
                for row in tweets.iterrows():
                    tags = row[1]['hashtags']
                    if hashtagSelect.value in tags:
                        count += 1
            hashTagCount.append(count)
        hover = HoverTool(tooltips=[
            ("count", "@y"),
        ])
        source = ColumnDataSource(data = {'x': dateStrings, 'y':hashTagCount})
        p = figure(width=800, height=250, x_range=dateStrings, tools=[hover], title='Count of ' + hashtagSelect.value)
        p.vbar('x',top='y',width=0.5,source=source, bottom=0)
        return p
    
    p = create_figure()
    layout = Column(hashtagSelect, p)
    doc.add_root(layout)

handler = FunctionHandler(modify_doc)
app = Application(handler)
show(app, notebook_url="localhost:8888")


**Observation:** As we can see, a lot of tweets related to big data were tweeted on 11 Oct 2017. This could be due to a lot of big data event happening on that particular day. Here are a few of the events that were hosted on 11 Oct:
1. [AWS Tech Talk - Big Data by Chris Widmann](https://www.eventbrite.com/e/aws-tech-talk-big-data-tickets-37541684188#)
2. [11-12 October 2017: 2nd BMBF Big Data All Hands Meeting and 2nd Smart Data Innovation Conference](http://www.bdva.eu/?q=node/838)
3. [Predictive Analytics World London, 11-12 October, 2017](https://predictiveanalyticsworld.co.uk/)
4. [DEEP LEARNING SUMMIT MONTREAL](https://www.re-work.co/events/deep-learning-summit-montreal-canada-track1-2017)

### 3. Day-wise word count (trending words of the day)
We will now create a graph to look at the trending words of each day. To create this graph, we will ignore common words that are popular across all days.

In [57]:
garbageTokens = ['¿','...','…','’','RT','–','el','la','via','en','que',"it's",'es'] # we remove RT because it is associated with re-tweet
top10EachDay = []
for date in dateList:
    start = datetime.datetime(date.year,date.month,date.day,0,0,0)
    end = datetime.datetime(date.year,date.month,date.day, 23,59,59)
    tweets = df.loc[(start <= df['timestamp']) & (end >= df['timestamp'])]
    text = ' '.join(list(tweets['text']))
    tokens = TweetTokenizer(preserve_case=False).tokenize(text)
    stop = stopwords.words('english') + list(string.punctuation) + garbageTokens
    filteredTokens = [w for w in tokens if not w in stop]
    counts = Counter(filteredTokens)
    counts = pd.Series(counts)
    counts = counts.sort_values(ascending=False)
    top10EachDay.extend(list(counts.keys())[0:10])

top10EachDay = set(top10EachDay)
    
def modify_doc2(doc):
       
    def update(attr, old, new):
        layout.children[1] = create_figure()
        
    
    options = list(dateStrings)
    options.append('All Days')
    dateSelect = Select(title="Date:", options=options, value=options[0])
    dateSelect.on_change('value', update)
    
    
    def create_figure():
        dateIndex = options.index(dateSelect.value)
        if dateIndex == (len(options) - 1):
            tweets = df
        else:
            date = dateList[dateIndex]
            start = datetime.datetime(date.year,date.month,date.day,0,0,0)
            end = datetime.datetime(date.year,date.month,date.day, 23,59,59)
            tweets = df.loc[(start <= df['timestamp']) & (end >= df['timestamp'])]
        
        text = ' '.join(list(tweets['text']))
        tokens = TweetTokenizer(preserve_case=False).tokenize(text)
        stop = stopwords.words('english') + list(string.punctuation) + garbageTokens
        filteredTokens = [w for w in tokens if not w in stop]
        counts = Counter(filteredTokens)
        counts = pd.Series(counts)
        counts = counts.sort_values(ascending=False)
        hover = HoverTool(tooltips=[
            ("count", "@y"),
        ])
        x = []
        y = []
        count = 0
        if dateIndex != (len(options) - 1):
            for item in counts.iteritems():
                if (item[0] not in top10EachDay):
                    x.append(item[0])
                    y.append(item[1])
                    count += 1
                if(count == 10):
                    break
        else:
            x = list(counts.keys())[0:10]
            y = list(counts)[0:10]
        source = ColumnDataSource(data = {'x': x, 'y':y})
        p = figure(width=800, height=250, x_range=x, tools=[hover], title='Trending words for ' + dateSelect.value)
        p.vbar('x',top='y',width=0.5,source=source, bottom=0)
#         p.xaxis.major_label_orientation = -math.pi/3
        return p
    
    p = create_figure()
    layout = Column(dateSelect, p)
    doc.add_root(layout)

handler2 = FunctionHandler(modify_doc2)
app2 = Application(handler2)
app2.create_document()
show(app2, notebook_url="localhost:8888")

**Observations:** This graph provides some really insightful data. Here are some of the observations that we think are interesting:
1. On 15th october, the forbes website published this [article](http://ift.tt/2hIKAiC) Therefore, the url and the words age, app,latest, and rethink are trending on 15th Oct.
2. On 16th october, the Smart Cities Summit 2017 was hosted which results in #smartcities trending.
2. On 17th october, CNBC published this [story](https://www.cnbc.com/2017/10/17/thoughtspot-ceo-problem-with-big-data-isnt-visualization-its-scale.html). The headline was: A.I. company CEO: Big data is not a visualization problem, it's a human scale problem. This is reflected on the trending words that day being: human, ceo and company
4. On 19th october, TechCrunch published this [article](https://techcrunch.com/2017/10/19/data-is-the-name-of-the-game-as-intel-capital-puts-60m-in-15-startups-566m-in-2017-overall/) with the title: Data is the name of the game, as Intel Capital puts $60M in 15 startups, $566M in 2017 overall written by Ingrid Lunden. This resulted in a bunch of trending words: 60m, startups, 566m, invests, intel, total, ingrid and investments.
5. On 22nd october, The Wall Street Journal published this (article)[https://www.wsj.com/articles/how-facebooks-master-algorithm-powers-the-social-network-1508673600] with the title: How Facebook’s Master Algorithm Powers the Social Network. This resulted in a bunch of trending words including facebook, dictators and evil.

## Clustering
Next we will run clustering algorithms on frequency–inverse document frequency for words and tags in each tweet.

In [83]:
#take text in twitter to 'cluster on' from dataframe
textDF = df['text']
tokenizer = TweetTokenizer(preserve_case=False)
#tokenize given text
def tokenize(text):
    #for userComments in commentsDf:
    textTokens = tokenizer.tokenize(text)
    stop = stopwords.words('english') + list(string.punctuation) + garbageTokens
    filteredTokens = [w for w in textTokens if not w in stop]
    #filter out punctuations and numeric tokens
    return filteredTokens


#making Term Frequency-Inverse document Frequency matrix model
tfidf_vectorizer = TfidfVectorizer(max_df=0.8,
                                 min_df=0, stop_words='english',
                                 use_idf=True,tokenizer=tokenize, ngram_range=(1,1))

tfidf_matrix = tfidf_vectorizer.fit_transform(textDF)
print (tfidf_matrix.shape)
wordFeatures = tfidf_vectorizer.get_feature_names()   #this may be different from totalVocabTokens

#calcluate distance between different tweets based on tf-idf matrix
#will be used for clustering and visualization later on
distWords = 1 - cosine_similarity(tfidf_matrix)

(10019, 28352)


In [84]:
#code to calculate frequency of each hashtag
#will be later used to cluster on hashtags
def join(x):
    return " ".join(x['hashtags'])
hashtagsDf = df.apply(join, axis=1)
countHashtagsVectorizer = CountVectorizer(stop_words='english',tokenizer=tokenize,ngram_range=(1,1))
countHasgtagsMatrix = countHashtagsVectorizer.fit_transform(hashtagsDf)

#calcluate distance between different tweets based on tf-idf matrix
#will be used for clustering and visualization later on
distHashtags = 1 - cosine_similarity(countHasgtagsMatrix)

In [116]:
def clusteringAlgo(matrix,para):
        clf = KMeans(n_clusters=para)
        clf.fit(matrix)
        
        order_centroids = clf.cluster_centers_.argsort()[:,::-1]
        topFeaturesInEachCluster=[]
        for centres in order_centroids:
            featuredWords=[]
            for words in centres[:5]:
                featuredWords.append(wordFeatures[words])
            topFeaturesInEachCluster.append(featuredWords)
        clusters = clf.labels_.tolist()
        return clusters,topFeaturesInEachCluster



clusterWords,topFeaturesWordsInEachCluster = clusteringAlgo(tfidf_matrix,5)
clusterHashtags, topHashtagFeatures = clusteringAlgo(countHasgtagsMatrix,2)

In [117]:
#dimensionality reduction for the TF-IDF matrix (or df equally)
def dimensionReduction(dist):

    # convert two components as we're plotting points in a two-dimensional plane

    # we will also specify `random_state` so the plot is reproducible.
    mds = TruncatedSVD(n_components=2)

    #toarray() converts sparse array to dense numpy array
    pos = mds.fit_transform(dist)  # shape (n_components, n_samples)
    return pos
    #store the dimensions in xs, ys
    #xs, ys = pos[:, 0], pos[:, 1]
    
hovera = HoverTool(tooltips=[
            ("Tweet", "@text"),
        ])
a=figure(plot_height=500, plot_width=900, title='clustering groups based on Tweet Text',tools=[BoxZoomTool(),PanTool(),WheelZoomTool(),ResetTool(),hovera])

dimensionReduced = dimensionReduction(distWords)
x = dimensionReduced[:, 0]
y = dimensionReduced[:, 1]



for i in range(0,5):
    
    filteredX = []
    filteredY = []
    text = []
    for index, cluster in enumerate(clusterWords):
        if(cluster == i):
            filteredX.append(x[index])
            filteredY.append(y[index])
            text.append(textDF[index])
    source = ColumnDataSource(data={'x':filteredX,'y':filteredY, 'text':text})
    a.circle('x','y',source=source,color=Category10[5][i], size=10, alpha=0.2,legend='Top Features:' 
             + ",".join(topFeaturesWordsInEachCluster[i]))

a.legend.location = "top_left"
a.legend.click_policy="hide"
show(a)


In [119]:
hoverb = HoverTool(tooltips=[
            ("Hashtags", "@text"),
        ])

b=figure( plot_height=500, plot_width=900, title='clustering groups based on Hashtags', tools=[BoxZoomTool(),PanTool(),WheelZoomTool(),ResetTool(),hoverb])

dimensionReduced = dimensionReduction(distHashtags)
x = dimensionReduced[:, 0]
y = dimensionReduced[:, 1]

#user are labels
for i in range(0,2):
    
    filteredX = []
    filteredY = []
    text = []
    for index, cluster in enumerate(clusterHashtags):
        if(cluster == i):
            filteredX.append(x[index])
            filteredY.append(y[index])
            text.append(hashtagsDf[index])
    source = ColumnDataSource(data={'x':filteredX,'y':filteredY, 'text':text})
    b.circle('x','y',source=source,color=Category10[5][i], size=10, alpha=0.2,legend='Top Features:' 
             + ",".join(topHashtagFeatures[i]))

b.legend.location = "top_right"
b.legend.click_policy="hide"
show(b)