# Twitter keyword analysis
(a) Twitter user count  
(b) tweet count  
(c) bar chart of Twitter user count per month  
(d) bar chart of tweet count per month  
(e) pie chart of tweets with a URL vs. tweets without a URL  
(f) box plot of tweets per Twitter user, by month  

# Keyword search in tweets

In [1]:
import pyes
import elasticsearch

es_address='192.168.1.100:9200'
conn = pyes.es.ES(es_address)
bq = pyes.query.BoolQuery()

tq = pyes.query.TermQuery(field='text', value='cve')

ESR = pyes.ESRange(field='created_at', from_value='Wed Jan 01 00:00:00 +0000 2014', to_value='Wed Dec 31 23:59:59 +0000 2014', 
                   include_lower=True, include_upper=False)
rq = pyes.query.RangeQuery(qrange=ESR)

bq.add_must(tq)
bq.add_must(rq)

result = conn.search(query=bq, indices='twitter2', doc_types='tweet') 
print len(result)

265


## (a) Twitter user count

In [2]:
import json

conn = pyes.es.ES('192.168.1.100:9200')
bq = pyes.query.BoolQuery()

tq = pyes.query.TermQuery(field='text', value='cve') # use keyword 'cve'
ESR = pyes.ESRange(field='created_at', from_value='Wed Jan 01 00:00:00 +0000 2014', to_value='Wed Dec 31 23:59:59 +0000 2014', 
                   include_lower=True, include_upper=False) # use whole data in 2014
rq = pyes.query.RangeQuery(qrange=ESR)

bq.add_must(tq)
bq.add_must(rq)

tagg = pyes.aggs.TermsAgg('user-tweets', field='uid') # tweet count for each user

qsearch = pyes.query.Search(bq) 
qsearch.agg.add(tagg) 

rs = conn.search(query=qsearch, index='twitter2', type="tweet")
twitters_count = len(rs.aggs['user-tweets']['buckets'])
print twitters_count
#print json.dumps(rs.aggs, indent=2)

47


## (b) Tweet count, plus the data for (c)–(f)

In [3]:
import pyes
import json

conn=pyes.es.ES('192.168.1.100:9200')

bq = pyes.query.BoolQuery()

tq = pyes.query.TermQuery(field='text', value='cve') # use keyword 'cve'
ESR = pyes.ESRange(field='created_at', from_value='Wed Jan 01 00:00:00 +0000 2014', to_value='Wed Dec 31 23:59:59 +0000 2014', 
                   include_lower=True, include_upper=False) # use whole data in 2014
rq = pyes.query.RangeQuery(qrange=ESR)

bq.add_must(tq)
bq.add_must(rq)

# for each month, tweet count of all users
tagg = pyes.aggs.DateHistogramAgg('month', field='created_at', interval='month', sub_aggs=[])
tagg1 = pyes.aggs.TermsAgg('user-tweet', field='uid')
tagg.sub_aggs.append(tagg1)

qsearch = pyes.query.Search(bq)
qsearch.agg.add(tagg)

rs = conn.search(query=qsearch, indices='twitter2', type="tweet")
rs_url = conn.search(query=bq, indices='twitter2', type="tweet")


res_list = rs.aggs['month']['buckets']
res_url_list = [rs_url[t]['entities']['urls'] for t in range(len(rs_url))]

# e
url_count = len([url for url in res_url_list if url != []])
url_to_nourl = [url_count, len(res_url_list) - url_count]

twitters_month = [len(res_list[m]['user-tweet']['buckets']) for m in range(12)] # c
tweets_month = [res_list[m]['doc_count'] for m in range(12)] # d
tweets_count = sum(tweets_month) # b

# f
twitter_tweet_month = [[res_list[m]['user-tweet']['buckets'][n]['doc_count'] for n in range(twitters_month[m])] for m in range(12)] # Q2-f

print url_to_nourl
print tweets_count, tweets_month
print twitters_count, twitters_month
print twitter_tweet_month

[220, 45]
265 [5, 17, 19, 21, 17, 15, 10, 12, 55, 56, 27, 11]
47 [3, 10, 10, 13, 10, 11, 6, 8, 28, 27, 10, 6]
[[2, 2, 1], [4, 3, 2, 2, 1, 1, 1, 1, 1, 1], [7, 3, 2, 1, 1, 1, 1, 1, 1, 1], [4, 3, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1], [3, 3, 2, 2, 2, 1, 1, 1, 1, 1], [4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1], [3, 2, 2, 1, 1, 1], [3, 2, 2, 1, 1, 1, 1, 1], [6, 5, 5, 4, 3, 3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [10, 9, 6, 4, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [8, 5, 4, 3, 2, 1, 1, 1, 1, 1], [4, 2, 2, 1, 1, 1]]


# Visualization: use Bokeh

In [4]:
from bokeh.io import push_notebook, show, output_notebook
from bokeh.layouts import row
from bokeh.plotting import figure
# Direct Bokeh output to the notebook so that later show() calls render inline.
output_notebook()

## (c) & (d) Bar charts of Twitter user count and tweet count per month

In [5]:
from bokeh.charts import Bar
from bokeh.charts.attributes import CatAttr

# Side-by-side bar charts: distinct users per month and tweets per month.
MONTHS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Column data for each chart: one value per month label.
twitters_month_dict = {'data': twitters_month, 'label': MONTHS}
tweets_month_dict = {'data': tweets_month, 'label': MONTHS}

# sort=False asks the chart not to re-sort the month labels.
twitters_bar = Bar(
    twitters_month_dict,
    values='data',
    label=CatAttr(columns=['label'], sort=False),
    plot_width=450,
    plot_height=450,
    legend=None,
    title="Twitters count",
)
tweets_bar = Bar(
    tweets_month_dict,
    values='data',
    label=CatAttr(columns=['label'], sort=False),
    plot_width=450,
    plot_height=450,
    legend=None,
    title="Tweets count",
)

# Blank x-axis title on both charts; the y-axis names what is being counted.
for chart, y_label in ((twitters_bar, "twitters"), (tweets_bar, "tweets")):
    chart.xaxis.axis_label = ""
    chart.yaxis.axis_label = y_label

show(row(twitters_bar, tweets_bar))

## (e) Pie chart of tweets with and without a URL

In [6]:
from bokeh.charts import Donut

# Donut chart comparing tweets that carry a URL with tweets that do not.
URL_EXIST = ['exist-url', 'no-url']

# url_to_nourl holds the two counts in the same order as URL_EXIST.
url_exist_dict = {'data': url_to_nourl, 'label': URL_EXIST}

donut_options = {
    'values': 'data',
    'label': 'label',
    'text_font_size': '8pt',
    'hover_text': 'label',
}
url_exist_donut = Donut(url_exist_dict, **donut_options)

show(url_exist_donut)

## (f) Box plot of tweets per Twitter user, by month

In [7]:
from bokeh.charts import BoxPlot
from bokeh.charts.attributes import CatAttr

# Box plot of how many tweets each user posted, grouped by month.
# Flatten the per-month lists into two parallel columns: one tweet count per
# (month, user) pair, and the month label that count belongs to.
d = [count
     for m in range(12)
     for count in twitter_tweet_month[m]]
l = [MONTHS[m]
     for m in range(12)
     for _ in twitter_tweet_month[m]]

twitter_tweet_dict = {'data': d, 'label': l}

# sort=False asks the chart not to re-sort the month labels.
month_axis = CatAttr(columns=['label'], sort=False)
twitter_tweets_box = BoxPlot(
    twitter_tweet_dict,
    values='data',
    label=month_axis,
    color='label',
    plot_width=800,
    title="twitter tweets",
)

twitter_tweets_box.yaxis.axis_label = "tweets"
twitter_tweets_box.xaxis.axis_label = ""

show(twitter_tweets_box)