# Twitter keywords analysis
(a) twitter count  
(b) tweet count  
(c) bar chart of twitter count in month  
(d) bar chart of tweet count in month  
(e) pie chart of count of exist-url & no-url tweets  
(f) box chart of tweet count of twitter in month  

# Keyword searching in tweet

In [2]:
import pyes
import elasticsearch

KEYWORDS = ['cve', 'vulnerability', 'exploit'] # keywords to analyze

conn = pyes.es.ES('140.118.126.148:9200')

tqs = [pyes.query.TermQuery(field='text', value=t) for t in KEYWORDS]

ESR = pyes.ESRange(field='created_at', from_value='Wed Jan 01 00:00:00 +0000 2014', to_value='Wed Dec 31 23:59:59 +0000 2014', 
                   include_lower=True, include_upper=False) # use whole data in 2014
rq = pyes.query.RangeQuery(qrange=ESR)

bqs = [pyes.query.BoolQuery().add_must(rq).add_must(tq) for tq in tqs]
rs = [conn.search(query=q, indices='twitter2', doc_types='tweet') for q in bqs]

tweets_count = [len(r) for r in rs]
print tweets_count # b

[265, 720, 561]


## (a) Twitter count

In [3]:
tagg = pyes.aggs.TermsAgg('user-tweets', field='uid') # aggrated by twitter

qss = [pyes.query.Search(bq) for bq in bqs]
for q in qss:
    q.agg.add(tagg)

rs = [conn.search(query=q, index='twitter2', type="tweet") for q in qss]

twitters_count = [len(r.aggs['user-tweets']['buckets']) for r in rs]
print twitters_count

[47, 75, 69]


## (b) Tweet count & data of c-f

In [4]:
# for each month, tweet count of all users
tagg = pyes.aggs.DateHistogramAgg('month', field='created_at', interval='month', sub_aggs=[])
tagg1 = pyes.aggs.TermsAgg('user-tweet', field='uid')
tagg.sub_aggs.append(tagg1)

qss = [pyes.query.Search(bq) for bq in bqs]
for q in qss:
    q.agg.add(tagg)

rs = [conn.search(query=q, indices='twitter2', type="tweet") for q in qss]
rs_url = [conn.search(query=bq, indices='twitter2', type="tweet") for bq in bqs]


res_lists = [r.aggs['month']['buckets'] for r in rs]
res_url_lists = [[r_url[t]['entities']['urls'] for t in range(len(r_url))] for r_url in rs_url]

url_counts = [len([url for url in r_url_list if url != []]) for r_url_list in res_url_lists]
url_to_nourls = [[url_counts[i], tweets_count[i] - url_counts[i]] for i in range(len(url_counts))] # e

twitters_months = [[len(res_list[m]['user-tweet']['buckets']) for m in range(12)] for res_list in res_lists] # c
tweets_months = [[res_list[m]['doc_count'] for m in range(12)] for res_list in res_lists] # d

# f
twitter_tweet_month = [[[res_lists[i][m]['user-tweet']['buckets'][n]['doc_count'] for n in range(twitters_months[i][m])] for m in range(12)] for i in range(len(res_lists))]

print url_to_nourls
print 'twitters_count:', twitters_count
print twitters_months
print 'tweets_count:', tweets_count
print tweets_months
print twitter_tweet_month

[[220, 45], [610, 110], [443, 118]]
twitters_count: [47, 75, 69]
[[3, 10, 10, 13, 10, 11, 6, 8, 28, 27, 10, 6], [22, 14, 12, 37, 24, 24, 20, 21, 33, 30, 27, 27], [16, 21, 16, 33, 20, 18, 22, 28, 27, 31, 24, 22]]
tweets_count: [265, 720, 561]
[[5, 17, 19, 21, 17, 15, 10, 12, 55, 56, 27, 11], [44, 46, 33, 88, 40, 60, 52, 39, 98, 87, 76, 57], [53, 47, 47, 58, 39, 44, 50, 37, 59, 56, 43, 28]]
[[[2, 2, 1], [4, 3, 2, 2, 1, 1, 1, 1, 1, 1], [7, 3, 2, 1, 1, 1, 1, 1, 1, 1], [4, 3, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1], [3, 3, 2, 2, 2, 1, 1, 1, 1, 1], [4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1], [3, 2, 2, 1, 1, 1], [3, 2, 2, 1, 1, 1, 1, 1], [6, 5, 5, 4, 3, 3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [10, 9, 6, 4, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [8, 5, 4, 3, 2, 1, 1, 1, 1, 1], [4, 2, 2, 1, 1, 1]], [[5, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [10, 6, 5, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1], [14, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1], [

# Visualization: use Bokeh

In [5]:
from bokeh.io import push_notebook, show, output_notebook
from bokeh.layouts import row, column
from bokeh.plotting import figure
output_notebook()

## (c) & (d) Bar chart of twitter and tweet count in month

In [6]:
from bokeh.charts import Bar
from bokeh.charts.attributes import CatAttr

MONTHS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

twitters_month_dicts = [dict(data=d, label=MONTHS) for d in twitters_months]
tweets_month_dicts = [dict(data=d, label=MONTHS) for d in tweets_months]

twitters_bars = [Bar(twitters_month_dict, values='data', label=CatAttr(columns=['label'], sort=False), 
               plot_width=450, plot_height=450, legend=None) for twitters_month_dict in twitters_month_dicts]
tweets_bars = [Bar(tweets_month_dict, values='data', label=CatAttr(columns=['label'], sort=False), 
             plot_width=450, plot_height=450, legend=None) for tweets_month_dict in tweets_month_dicts]

for i in range(len(twitters_bars)):
    twitters_bars[i].title = 'Twitter count of \''+ KEYWORDS[i] + '\''
    tweets_bars[i].title = 'Tweet count of \''+ KEYWORDS[i] + '\''
    twitters_bars[i].xaxis.axis_label = tweets_bars[i].xaxis.axis_label = ''
    twitters_bars[i].yaxis.axis_label = "twitters"
    tweets_bars[i].yaxis.axis_label = "tweets"

show(column(row(twitters_bars), row(tweets_bars)))

            and will be removed. The title is now an object on Plot (which holds all of it's
            styling properties). Please use Plot.title.text instead.

            SERVER USERS: If you were using plot.title to have the server update the plot title
            in a callback, you MUST update to plot.title.text as the title object cannot currently
            be replaced after intialization.
            
  """)


## (e) Pie chart of exist-url & no-url tweet count

In [7]:
from bokeh.charts import Donut

URL_EXIST = ['exist-url', 'no-url']

url_exist_dicts = [dict(data=d, label=URL_EXIST) for d in url_to_nourls]

url_exist_donuts = [Donut(url_exist_dict, values='data', label='label', 
                        text_font_size='8pt', hover_text='label') for url_exist_dict in url_exist_dicts]

for i in range(len(url_exist_donuts)):
    url_exist_donuts[i].title = 'Exist-url to no-url of \''+ KEYWORDS[i] + '\''

show(row(url_exist_donuts))

## (f) Box chart of tweet count of twitter in month

In [8]:
from bokeh.charts import BoxPlot
from bokeh.charts.attributes import CatAttr

twitter_tweet_dicts = []
for i in range(len(twitter_tweet_month)):
    d = [] 
    l = []
    for m in range(12):
        d = d + twitter_tweet_month[i][m]
        l = l + [MONTHS[m]]*len(twitter_tweet_month[i][m])
    twitter_tweet_dicts.append(dict(data=d, label=l))

twitter_tweets_boxs = [BoxPlot(twitter_tweet_dict, values='data', label=CatAttr(columns=['label'], sort=False), 
                             color='label', plot_width=800) for twitter_tweet_dict in twitter_tweet_dicts]

for i in range(len(twitter_tweets_boxs)):
    twitter_tweets_boxs[i].title = 'Twitter tweets of \''+ KEYWORDS[i] + '\''
    twitter_tweets_boxs[i].xaxis.axis_label = ""
    twitter_tweets_boxs[i].yaxis.axis_label = "tweets"

show(column(twitter_tweets_boxs))