## Part 2 of the Data Science challenge.
#### Create a tool that visualises different metrics for different search results on Twitter

In [1]:
import tweepy 
import pandas as pd
import requests
import json
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob
from collections import Counter
import numpy as np

In [2]:
def get_bearer_token(base_path="/home/adrian/test/Kahoot-challenge/keys/"):
    """Read the Twitter API bearer token from ``bearer_token.txt``.

    Args:
        base_path: Directory that contains ``bearer_token.txt``. Defaults to
            the directory that used to be hard-coded, so existing
            zero-argument calls behave exactly as before.

    Returns:
        The first line of the file with surrounding whitespace removed.

    Raises:
        OSError: If the token file cannot be opened.
    """
    import os  # local import keeps this block self-contained

    # os.path.join copes with ``base_path`` given with or without a trailing slash.
    token_path = os.path.join(base_path, "bearer_token.txt")
    with open(token_path, encoding="utf-8") as token_file:
        return token_file.readline().strip()

In [3]:
# Twitter API v2 endpoints: recent-search results and recent tweet counts.
search_url = "https://api.twitter.com/2/tweets/search/recent"
search_count_url = "https://api.twitter.com/2/tweets/counts/recent"

# Credential loaded once here; bearer_oauth reads it for every request.
bearer_token = get_bearer_token()

In [4]:
def bearer_oauth(r):
    """Attach bearer-token authentication headers to an outgoing request.

    Passed as the ``auth`` callable when issuing requests: it sets the
    ``Authorization`` and ``User-Agent`` headers on ``r`` and returns ``r``.
    """
    r.headers.update(
        {
            "Authorization": f"Bearer {bearer_token}",
            "User-Agent": "v2RecentTweetCountsPython,v2RecentSearchPython",
        }
    )
    return r


def connect_to_endpoint(url, params, timeout=30):
    """Issue an authenticated GET request and return the decoded JSON body.

    Args:
        url: Endpoint to call.
        params: Query-string parameters sent with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling dashboard
            callback indefinitely.

    Returns:
        The parsed JSON response.

    Raises:
        Exception: With ``(status_code, response_text)`` as arguments when the
            server answers with anything other than HTTP 200.
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    response = requests.get(url, auth=bearer_oauth, params=params, timeout=timeout)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()


In [5]:
def get_query_count(query):
    """Count recent tweets matching ``query`` over several look-back windows.

    Calls the tweet-counts endpoint at minute granularity and sums the
    buckets whose start time falls inside the last 1, 5 and 15 minutes.

    Args:
        query: Search query passed to the counts endpoint.

    Returns:
        dict with the keys ``"1 minutes ago"``, ``"5 minutes ago"`` and
        ``"15 minutes ago"`` (summed tweet counts) plus ``"total"`` (the
        ``total_tweet_count`` reported in the response metadata).
    """
    # Optional params: start_time,end_time,since_id,until_id,next_token,granularity
    query_count_params = {"query": query, "granularity": "minute"}
    query_count_object = connect_to_endpoint(search_count_url, query_count_params)

    # Build a frame of the per-minute buckets and parse the timestamps.
    df_query_count = pd.DataFrame(
        query_count_object["data"], columns=["start", "end", "tweet_count"]
    )
    for column in ("start", "end"):
        # Plain column assignment replaces the column, so the parsed
        # timezone-aware dtype is what ends up in the frame.
        df_query_count[column] = pd.to_datetime(df_query_count[column], utc=True)

    # Compare against an aware UTC "now" instead of local time minus a fixed
    # 120-minute offset, which was only right on a UTC+2 machine.
    now_utc = pd.Timestamp.now(tz="UTC")
    query_count = {}
    for minutes_ago in (1, 5, 15):
        window_start = now_utc - pd.Timedelta(minutes=minutes_ago)
        in_window = df_query_count["start"] >= window_start
        query_count[f"{minutes_ago} minutes ago"] = df_query_count.loc[
            in_window, "tweet_count"
        ].sum()
    query_count["total"] = query_count_object["meta"]["total_tweet_count"]
    return query_count

In [6]:
def get_tweets(search_term):
    """Fetch tweets from the last 15 minutes that match ``search_term``.

    The search endpoint returns at most 100 tweets per request, so results
    are paged with ``next_token``. At most 10 pages (1000 tweets) are read,
    which is enough for a proof of concept.

    Args:
        search_term: Query string passed to the recent-search endpoint.

    Returns:
        pandas.DataFrame with one row per tweet. ``created_at`` is parsed to
        timezone-aware UTC timestamps. When nothing matches, an empty frame
        with ``id``, ``text``, ``author_id`` and ``created_at`` columns is
        returned so callers can still filter on those columns.
    """
    # The API expects a UTC timestamp such as 2021-05-01T12:00:00Z. Take "now"
    # in UTC directly rather than shifting local time by a fixed offset.
    window_start = pd.Timestamp.now(tz="UTC") - pd.Timedelta(minutes=15)
    start_time = window_start.strftime("%Y-%m-%dT%H:%M:%SZ")

    # Optional params: start_time,end_time,since_id,until_id,max_results,next_token,
    # expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields
    query_search_params = {
        "query": search_term,
        "tweet.fields": "author_id,created_at",
        "max_results": "100",
        "start_time": start_time,
    }

    tweets = []
    max_pages = 10
    for _ in range(max_pages):
        query_search_object = connect_to_endpoint(search_url, query_search_params)
        # A response without hits carries no "data" key at all.
        tweets.extend(query_search_object.get("data", []))
        next_token = query_search_object.get("meta", {}).get("next_token")
        if not next_token:
            # No more content, stop paging.
            break
        query_search_params["next_token"] = next_token

    if tweets:
        df_tweets = pd.DataFrame(tweets)
    else:
        df_tweets = pd.DataFrame(columns=["id", "text", "author_id", "created_at"])
    df_tweets["created_at"] = pd.to_datetime(df_tweets["created_at"], utc=True)
    return df_tweets
    

In [7]:
def get_most_frequent_words(query_word, corpus, n_words):
    """Return the ``n_words`` most frequent words in ``corpus``.

    Punctuation and English stop words are removed by the vectorizer.
    Removing English stop words can be a weakness in some analytical tasks
    and should be considered further. The tokens ``https`` and ``rt``
    (links and re-tweet markers) and the query word itself are also dropped.

    Args:
        query_word: The search term; excluded from the result regardless of
            its letter case.
        corpus: Sequence of text documents (e.g. tweet texts).
        n_words: Maximum number of words to return.

    Returns:
        dict mapping word -> total count, ordered from most to least
        frequent. Empty when ``corpus`` is empty or contains no usable words.
    """
    if len(corpus) == 0:
        return {}

    vectorizer = CountVectorizer(stop_words="english")
    try:
        X = vectorizer.fit_transform(corpus)
    except ValueError:
        # Raised when the vocabulary ends up empty, e.g. every document
        # consists of stop words only.
        return {}

    # The vectorizer's vocabulary is compared against a lower-cased query word
    # so that a query such as "Kahoot" is filtered out as well.
    excluded = {"https", "rt", query_word.lower()}

    sum_words = X.sum(axis=0)
    words_freq = [
        (word, sum_words[0, idx])
        for word, idx in vectorizer.vocabulary_.items()
        if word.lower() not in excluded
    ]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return dict(words_freq[:n_words])

In [8]:
def find_sentiment(review):
    """Return the TextBlob sentiment polarity of the text ``review``."""
    blob = TextBlob(review)
    return blob.sentiment.polarity

In [9]:
def get_top_users_and_sentiment(tweets):
    """Compute the most active authors and a sentiment plot per time window.

    For every entry in the module-level ``time_steps`` (minutes), the tweets
    created within that many minutes of now are selected.

    Args:
        tweets: DataFrame with ``created_at`` (timezone-aware UTC),
            ``author_id`` and ``text`` columns.

    Returns:
        Tuple ``(top_tweepers, figure)``. ``top_tweepers`` maps
        ``"<n> minutes ago"`` to the ten most common ``(author_id, count)``
        pairs. ``figure`` is a distribution plot of tweet polarity with one
        curve per window, or an empty figure when no window has enough data.
    """
    import plotly.graph_objects as go  # only needed for the empty-figure fallback

    # Aware UTC "now" rather than local time shifted by a fixed offset.
    now_utc = pd.Timestamp.now(tz="UTC")
    top_tweepers = {}
    sentiment_data = []
    group_labels = []
    for minutes_ago in time_steps:
        label = f"{minutes_ago} minutes ago"
        window_start = now_utc - pd.Timedelta(minutes=minutes_ago)
        recent = tweets[tweets["created_at"] >= window_start]
        top_tweepers[label] = Counter(recent.author_id).most_common(10)

        sentiment = np.asarray(recent.text.apply(find_sentiment), dtype=float)
        # The distplot's density estimate cannot be fitted to fewer than two
        # distinct values, so such windows are left out of the plot.
        if np.unique(sentiment).size < 2:
            continue
        sentiment_data.append(sentiment)
        group_labels.append(label)

    if not sentiment_data:
        # Nothing to plot; hand back an empty figure so the caller's
        # update_layout call still works.
        return top_tweepers, go.Figure()
    return top_tweepers, ff.create_distplot(sentiment_data, group_labels, show_hist=False)

### Creating a plotly dashboard to visualize the metrics

In [10]:
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output,State
from io import BytesIO
import base64
from wordcloud import WordCloud
import plotly.figure_factory as ff

# External CSS handed to the Dash application below.
external_stylesheets = [
    "https://codepen.io/chriddyp/pen/bWLwgP.css",
]

# Dash application object; the layout and the callbacks are attached to it.
app = JupyterDash(
    __name__,
    external_stylesheets=external_stylesheets,
)

In [11]:
# Look-back windows (in minutes) shared by the metric helpers and callbacks.
time_steps = [1, 5, 15]

# Page layout: a search box on top, then one white panel with hit counts,
# word clouds, most active users and the sentiment distribution.
app.layout = html.Div(children=[
    html.H1(children='Twitter statistics Dashboard'),

    html.H3(children='''
        Input a desired search term!
    ''', style={"textAlign": "center"}),

    # Search input and button.
    html.Section([
        html.Section([
            html.Div([
                dcc.Input(id='search-term', value='kahoot', type='text'),
                # "backgroundColor" was misspelled, so the style was never applied.
                html.Button('SEARCH!', id='search-button', n_clicks=0,
                            style={"backgroundColor": "white"}),
            ], style={
                "width": "10%",
                "display": "flex",
                "margin": "auto",
                "justifyContent": "space-around",
            }),
        ], style={
            "marginBottom": "2em",
        }),
    ]),

    # Result panel; the components with ids are filled in by the callbacks.
    html.Section([
        html.H2(children="Number of hits:",
                style={"textAlign": "center", "background": "white"}),
        html.Section([
            html.H6(id="query-count",
                    style={"background": "white", "margin": "auto",
                           "textAlign": "left", "paddingBottom": "2em"}),
        ]),

        html.H2(children="Frequent search terms:",
                style={"textAlign": "center", "background": "white"}),
        html.Section([
            html.H4("Last minute:"),
            html.H4("Last 5 minutes:"),
            html.H4("Last 15 minutes:"),
        ], style={
            "margin": "auto",
            "display": "flex",
            "width": "70%",
            "justifyContent": "space-between",
        }),
        html.Section([
            html.Img(id="wc_1", style={"width": "20%"}),
            html.Img(id="wc_5", style={"width": "20%"}),
            html.Img(id="wc_15", style={"width": "20%"}),
        ], style={
            "marginTop": "2rem",
            "display": "flex",
            "margin": "auto",
            "justifyContent": "space-between",
            "width": "100%",
        }),

        html.H2(children="Most active users:",
                style={"textAlign": "center", "background": "white"}),
        html.Section([
            html.H6(id="top_tweepers",
                    style={"background": "white", "margin": "auto",
                           "textAlign": "left", "paddingBottom": "2em"}),
        ]),

        html.H2(children="Sentiment for recent tweets:",
                style={"textAlign": "center", "background": "white"}),

        dcc.Graph(id="sentiment_dist", style={"width": "80%", "margin": "auto"}),
    ], style={"background": "white"}),
],
style={
    "background": "radial-gradient(circle, rgba(63,94,251,0.5298494397759104) 0%, rgba(252,70,107,1) 100%)"
})

def plot_wordcloud(frequent_terms):
    """Render a word -> frequency mapping as a 480x360 word-cloud image.

    The cloud is drawn on a white background and returned via ``to_image()``.
    """
    cloud = WordCloud(width=480, height=360, background_color='white')
    cloud.fit_words(frequent_terms)
    return cloud.to_image()

@app.callback(
    Output('query-count','children'),
    [Input('search-button', 'n_clicks')],
    state=[State(component_id='search-term', component_property='value')]
)
def queryCount(search_button_clicks, search_term):
    """Dash callback: show hit counts for the current search term.

    Triggered by the search button; the search term is read as state.

    Args:
        search_button_clicks: Click counter of the search button (unused
            apart from triggering the callback).
        search_term: Current value of the search input.

    Returns:
        An ``html.Section`` with one ``html.Div`` per time window and one for
        the total, laid out side by side.
    """
    query_count = get_query_count(search_term)
    return html.Section(
        [
            html.Div(f'last minute:\t {query_count["1 minutes ago"]}'),
            html.Div(f'last 5 minutes:\t {query_count["5 minutes ago"]}'),
            html.Div(f'last 15 minutes:\t {query_count["15 minutes ago"]}'),
            html.Div(f'total:\t {query_count["total"]}'),
        ],
        style={
            "background": "white",
            "width": "50%",
            "display": "flex",
            "margin": "auto",
            "justifyContent": "space-around",
            # "rows" is not a valid flex-direction value; "row" is.
            "flexDirection": "row",
        },
    )



@app.callback(
    Output('wc_1', 'src'),
    Output('wc_5', 'src'),
    Output('wc_15', 'src'),
    [Input('search-button', 'n_clicks')],
    state=[State(component_id='search-term', component_property='value')]
)
def tweetFrequency(search_button_clicks, search_term):
    """Dash callback: build one word cloud per time window.

    Args:
        search_button_clicks: Click counter of the search button (unused
            apart from triggering the callback).
        search_term: Current value of the search input.

    Returns:
        Three ``data:`` URIs, one per entry of ``time_steps``, for the
        ``wc_1``, ``wc_5`` and ``wc_15`` images. A window without any usable
        words gets a transparent placeholder instead of a word cloud.
    """
    # 1x1 transparent GIF used when there is nothing to draw for a window.
    blank_image = (
        "data:image/gif;base64,"
        "R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7"
    )

    df_tweets = get_tweets(search_term)
    # Aware UTC "now" rather than local time shifted by a fixed offset.
    now_utc = pd.Timestamp.now(tz="UTC")

    word_clouds = []
    for minutes_ago in time_steps:
        window_start = now_utc - pd.Timedelta(minutes=minutes_ago)
        recent_texts = df_tweets[df_tweets["created_at"] >= window_start].text.values
        try:
            terms = get_most_frequent_words(search_term, recent_texts, 10)
        except ValueError:
            # The vectorizer rejects a corpus that yields no vocabulary.
            terms = {}

        if not terms:
            # The word cloud cannot be fitted without at least one word.
            word_clouds.append(blank_image)
            continue

        img = BytesIO()
        plot_wordcloud(terms).save(img, format='PNG')
        word_clouds.append(
            'data:image/png;base64,{}'.format(base64.b64encode(img.getvalue()).decode())
        )

    return (
        word_clouds[0],
        word_clouds[1],
        word_clouds[2],
    )

@app.callback(
    Output('top_tweepers', 'children'),
    Output('sentiment_dist','figure'),
    [Input('search-button', 'n_clicks')],
    state=[State(component_id='search-term', component_property='value')]
)
def topUsersAndSentiment(search_button_clicks, search_term):
    """Dash callback: list the most active users and plot tweet sentiment.

    Args:
        search_button_clicks: Click counter of the search button (unused
            apart from triggering the callback).
        search_term: Current value of the search input.

    Returns:
        Tuple of (children for ``top_tweepers``, figure for
        ``sentiment_dist``). The children are one ``html.Div`` per time
        window; newline characters inside a single string would be collapsed
        by the browser and run the three windows together.
    """
    df_tweets = get_tweets(search_term)
    top_tweepers, sentiment_figure = get_top_users_and_sentiment(df_tweets)
    sentiment_figure.update_layout(width=1500, height=800)

    top_users_children = [
        html.Div(f'Last minute:\t {top_tweepers["1 minutes ago"]}'),
        html.Div(f'Last 5 minutes:\t {top_tweepers["5 minutes ago"]}'),
        html.Div(f'Last 15 minutes:\t {top_tweepers["15 minutes ago"]}'),
    ]
    return top_users_children, sentiment_figure
    
# Start the Dash server on port 1111 (debug mode) only when run as a script.
if __name__ == "__main__":
    app.run_server(port=1111, debug=True)

Dash app running on http://127.0.0.1:1111/
