In [1]:
# Import Packages
import json
import googlemaps # This had to be installed
import twitter # This should already be installed (but was additional)
import urllib.parse as urllib
import pandas as pd
import nltk
import numpy as np
import re
import string
import random
import time
import datetime
from wordcloud import WordCloud
from math import pi
from IPython.display import Image
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

In [118]:
# Import Bokeh Packages
import bokeh
from bokeh import events
from bokeh.layouts import row, column, widgetbox, layout
from bokeh.models.widgets import Button, TextInput, Select, Div, DataTable, TableColumn, NumberFormatter, Panel, Tabs, Paragraph
from bokeh.models import HoverTool, ColumnDataSource, GMapOptions, CustomJS
from bokeh.plotting import show, figure, gmap
from bokeh.io import show, push_notebook, output_notebook, reset_output
from bokeh.application.handlers import FunctionHandler
from bokeh.application import Application
from bokeh.models.tiles import WMTSTileSource
from bokeh.document import Document
from bokeh.palettes import Category20c
from bokeh.models import ColumnDataSource, Range1d, LabelSet, Label
from bokeh.transform import cumsum

In [3]:
# Fetch the NLTK resources used below: the stop-word list and the WordNet data.
nltk.download('stopwords')
nltk.download('wordnet')
# NOTE(review): the wildcard import pulls every public nltk name into the notebook
# namespace; the cells visible here only use the three explicit imports below.
from nltk import *
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Qitianyu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Qitianyu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Global Parameters
CREDFILE = 'OAuth.json'  # JSON credentials file passed to getKeys()
GOOGLE_MAPS_API_URL = 'http://maps.googleapis.com/maps/api/geocode/json'  # not referenced in the visible cells
RATE_LIMIT = 15  # maximum number of search requests processQuery() issues per query
TRAINSET = 'data/train_data.csv'  # labelled training data read by processTrainData()
# DEFAULT_GEO = 'Syracuse, NY'
PADDING = 0.1  # not referenced in the visible cells
APP_WIDTH = 650  # width handed to the dashboard widgets
APP_HEIGHT = 700  # height handed to the dashboard widgets

In [5]:
#************************************ HELPER FUNCTIONS ************************************

# Helper Function - Get API Keys
def getKeys(filename):
    """Read the JSON document stored in ``filename`` and return the decoded object."""
    with open(filename, 'r') as credFile:
        return json.load(credFile)

In [45]:
# Helper Function - Initialize Twitter API
def authorizeTwitter(keys):
    """Build a ``twitter.Api`` client from the loaded credentials.

    ``keys`` must provide the entries "consumer_key", "consumer_secret",
    "token" and "token_secret". The client is created with
    ``sleep_on_rate_limit=True``.
    """
    credentials = {
        "consumer_key": keys["consumer_key"],
        "consumer_secret": keys["consumer_secret"],
        "access_token_key": keys["token"],
        "access_token_secret": keys["token_secret"],
    }
    return twitter.Api(sleep_on_rate_limit=True, **credentials)

In [7]:
# Helper Function - Initialize Google Maps API
def authorizeGoogle(keys):
    """Create a ``googlemaps.Client`` from the "GoogleKey" entry of ``keys``."""
    return googlemaps.Client(key=keys['GoogleKey'])

In [8]:
# Helper Function - Geocode Location
def geocode(loc, api):
    """Look up ``loc`` through ``api.geocode``.

    Returns the ``['geometry']['location']`` entry of the first match, or
    ``None`` when the lookup yields nothing.
    """
    matches = api.geocode(loc)
    if not matches:
        return None
    return matches[0]['geometry']['location']

In [9]:
# Helper Function - Process User Query
def processQuery(term, api, max_pages=None):
    """Harvest pages of non-retweet #travel tweets that mention ``term``.

    Parameters
    ----------
    term : str
        User-supplied search term. Spaces are removed and the rest is
        percent-encoded before it is placed in the query string.
    api : object
        Client exposing ``GetSearch(raw_query=..., return_json=True)``.
    max_pages : int, optional
        Upper bound on the number of search requests. Defaults to the
        module-level ``RATE_LIMIT``. At least one request is always made.

    Returns
    -------
    list
        The raw JSON result of every request, in the order they were issued.
    """
    limit = RATE_LIMIT if max_pages is None else max_pages

    # Percent-encode the term so characters such as '&', '#' or '+' typed by
    # the user cannot add or break query parameters.
    encoded = urllib.quote(term.replace(" ", ""), safe="")
    raw = "q=" + encoded + "%20%23travel%20-filter%3Aretweets"

    tweets = []
    while True:
        # Harvest tweets from Twitter API
        results = api.GetSearch(raw_query=raw, return_json=True)
        tweets.append(results)

        # Check if there are more tweets to harvest
        metadata = results.get('search_metadata') or {}
        next_results = metadata.get('next_results')
        if not next_results or len(tweets) >= limit:
            break

        # next_results starts with '?'. Keep the original reordering (query
        # first, leading parameters last) when a '&q=' part exists; otherwise
        # just reuse the string without its leading '?'.
        head, sep, tail = next_results[1:].partition('&q=')
        raw = ('&q=' + tail + '&' + head) if sep else next_results[1:]
    return tweets

In [10]:
# Helper Function - Extract data from twitter results
def extractData(results):
    """Flatten harvested search pages into a DataFrame of tweets.

    Parameters
    ----------
    results : list of dict
        Search result pages; tweets are read from each page's 'statuses'
        list. A page without that key contributes no rows.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with the columns 'text' (the tweet's 'text',
        falling back to 'full_text') and 'timestamp' (its 'created_at').
    """
    # Two labels for the two fields stored per record; a single 'text' label
    # does not match the record width.
    labels = ['text', 'timestamp']
    records = []
    for page in results:
        for status in page.get('statuses', []):
            text = status['text'] if 'text' in status else status['full_text']
            records.append([text, status.get('created_at')])
    return pd.DataFrame.from_records(records, columns=labels)

In [11]:
# Helper Function - Process Categorized Data
def processTrainData():
    """Build the word/emotion statistics consumed by classify() from TRAINSET.

    Reads the CSV named by ``TRAINSET`` (columns "sentiment" and "content"),
    skips rows labelled "empty" or "worry", rows with an unknown label and
    rows whose content is not a string, and counts each normalized word once
    per distinct raw token of a row.

    Returns
    -------
    tuple
        ``(conditionSet, wordSet, emotions)`` where, for the 10000 most
        frequent words, ``conditionSet[word][i]`` is the word's count in
        class ``i`` divided by the number of rows in class ``i``,
        ``wordSet[word]`` is the word's total count divided by the number of
        rows used, and ``emotions[i]`` is the share of rows in class ``i``.
        Classes without any rows yield 0.0 instead of a division error.
    """
    rawSet = pd.read_csv(TRAINSET)
    stops = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()
    tt = TweetTokenizer()
    emotionVector = {"empty": -1, "sadness": 0, "worry": 0, "neutral": 1, "surprise": 2,
                     "love": 3, "happiness": 3, "relief": 4, "fun": 5, "enthusiasm": 5,
                     "hate": 6, "anger": 6, "boredom": 7}
    # Raw string: same pattern as before, without invalid string escape sequences.
    special = re.compile(r'[0-9,\,,\:,\/,\=,\&,\;,\%,\$,\@,\#,\%,\^,\*,\(,\),\{,\},\[,\],\|,\>,\<,\-,\!,\?,\.\'\"]')
    classCount = 8
    total = 0
    emotions = [0.0] * classCount
    trainList = dict()

    for _, row in rawSet.iterrows():
        sentiment = row["sentiment"]
        if sentiment == "empty" or sentiment == "worry":
            continue
        sentence = row["content"]
        # Unknown labels and missing (non-string) content cannot be counted.
        if sentiment not in emotionVector or not isinstance(sentence, str):
            continue
        label = emotionVector[sentiment]
        total += 1
        emotions[label] += 1
        # Iterate distinct raw tokens so a token repeated in one row counts once.
        for word in set(tt.tokenize(sentence)):
            word = wnl.lemmatize(word.strip(string.punctuation)).lower()
            if special.search(word) is None and word not in stops and len(word) > 1:
                trainList.setdefault(word, [0.0] * classCount)[label] += 1

    # Keep only the most frequent words.
    ranked = sorted(trainList.items(), key=lambda item: sum(item[1]), reverse=True)[:10000]

    conditionSet = dict()
    wordSet = dict()
    for word, counts in ranked:
        # A word can only be present if at least one row was counted, so total > 0 here.
        wordSet[word] = sum(counts) / total
        conditionSet[word] = [
            (count / emotions[i]) if emotions[i] else 0.0
            for i, count in enumerate(counts)
        ]

    priors = [(count / total) if total else 0.0 for count in emotions]
    return (conditionSet, wordSet, priors)

In [193]:
def classify(wordList, wordDup, conditionSet, wordSet, emotionSet):
    """Pick the highest-scoring emotion class for a tokenized tweet.

    Only words present in ``conditionSet`` take part. With fewer than two such
    words in ``wordList`` the result is ``(-1, None)``. Otherwise each of the
    8 classes is scored as its prior from ``emotionSet`` times the per-class
    values of the known words, divided by the product of their ``wordSet``
    values.

    Returns ``(index, words)``: the index of the first class with the largest
    score above zero (``-1`` if no score is above zero) and the known words of
    ``wordDup`` in their original order.
    """
    known = [w for w in wordList if w in conditionSet]
    if len(known) < 2:
        return (-1, None)
    knownDup = [w for w in wordDup if w in conditionSet]

    evidence = 1.0
    for w in known:
        evidence *= wordSet[w]

    scores = []
    for label in range(8):
        likelihood = emotionSet[label]
        for w in known:
            likelihood *= conditionSet[w][label]
        scores.append(likelihood / evidence)

    bestLabel, bestScore = -1, 0.0
    for label, score in enumerate(scores):
        if score > bestScore:
            bestLabel, bestScore = label, score
    return (bestLabel, knownDup)

In [13]:
# Shared text-normalization resources read by getQueryWordList().
stops = set(stopwords.words('english'))
wnl = WordNetLemmatizer()
tt = TweetTokenizer()
# Raw string: same pattern as before, without invalid string escape sequences.
special = re.compile(r'[0-9,\,,\:,\/,\=,\&,\;,\%,\$,\@,\#,\%,\^,\*,\(,\),\{,\},\[,\],\|,\>,\<,\-,\!,\?,\.\'\"]')
def getQueryWordList(word):
    """Tokenize a tweet and return its normalized words.

    Each token is stripped of surrounding punctuation, lemmatized and
    lower-cased. It is kept when it has more than one character, is not a
    stop word and does not match ``special``.

    Returns ``(result, resultDup)``: the kept words derived from the distinct
    tokens, and the kept words derived from all tokens in tweet order.
    """
    def normalize(tokens):
        kept = []
        for token in tokens:
            cleaned = wnl.lemmatize(token.strip(string.punctuation)).lower()
            if special.search(cleaned) is None and cleaned not in stops and len(cleaned) > 1:
                kept.append(cleaned)
        return kept

    tokens = tt.tokenize(word)
    return (normalize(list(set(tokens))), normalize(tokens))

In [14]:
#************************************ MAIN PROGRAM ************************************

# PHASE I - Initialize

# Initialize APIs
keys = getKeys(CREDFILE)  # credentials decoded from the JSON file named by CREDFILE
tAPI = authorizeTwitter(keys)  # Twitter client passed to processQuery()
gAPI = authorizeGoogle(keys)  # Google Maps client; not referenced in the visible cells

In [15]:
# Compute the training statistics once; getEmotions() reads these globals.
(conditionSet, wordSet, emotionSet) = processTrainData()
# Maps the class index returned by classify() to the label used as a dictionary key in
# getEmotions(). The "bordom" spelling matches the keys there and the Select options in the GUI.
emotionVectorReverse = {0: "sadness", 1: "neutral", 2: "surprise", 3: "happiness", 4: "relief", 5: "fun", 6: "anger", 7: "bordom"}

In [220]:
def getEmotions(tweetList):
    """Classify every tweet in ``tweetList`` and aggregate the outcome.

    Returns ``(emotions, emotionVocab, emotionTweets)``: the number of tweets
    per emotion label, the list of matched words per label, and a mapping from
    tweet text to its label. Tweets for which classify() returns ``-1`` are
    left out of all three.
    """
    labels = ("sadness", "neutral", "surprise", "happiness", "relief", "fun", "anger", "bordom")
    emotions = {label: 0 for label in labels}
    emotionVocab = {label: [] for label in labels}
    emotionTweets = dict()

    for tweet in tweetList:
        (uniqueWords, allWords) = getQueryWordList(tweet)
        (labelIndex, matchedWords) = classify(uniqueWords, allWords, conditionSet, wordSet, emotionSet)
        if labelIndex == -1:
            continue
        label = emotionVectorReverse[labelIndex]
        emotions[label] += 1
        emotionTweets[tweet] = label
        emotionVocab[label].extend(matchedWords)
    return (emotions, emotionVocab, emotionTweets)

In [331]:
def printWordClouds(wordList, name):
    """Render ``wordList`` as a word cloud and save it as ``data/<name>.png``.

    Parameters
    ----------
    wordList : str
        Whitespace-separated words handed to ``WordCloud.generate``.
    name : str
        File name (without extension) inside the ``data/`` directory.
    """
    wordcloud = WordCloud(background_color='white',
                          width=1000,
                          height=600,
                          margin=0
                          ).generate(wordList)
    # Use a dedicated figure per image and always close it. Drawing on pyplot's
    # implicit current figure stacked every cloud onto the same axes and never
    # released the figure.
    fig, ax = plt.subplots()
    try:
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis("off")
        fig.savefig("data/" + name + ".png")
    finally:
        plt.close(fig)
    
def getWordClouds(emotionVocab):
    """Render one word cloud per non-empty emotion plus one for all words.

    Parameters
    ----------
    emotionVocab : dict
        Maps an emotion label to the list of words collected for it.

    Returns
    -------
    dict
        Maps each emotion that has words, and the key "all", to the image
        name ``<timestamp>_<label>``. Each rendered image is saved by
        printWordClouds() under exactly that name. The "all" entry is always
        present, but its image is only rendered when at least one word exists.
    """
    stamp = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime(time.time()))
    possible = dict()
    allWords = ""
    for (key, value) in emotionVocab.items():
        if len(value) != 0:
            possible[key] = stamp + "_" + key
            joined = "".join(" " + word for word in value)
            # Save under the same name that is returned to the caller, so the
            # returned name refers to the file that was written.
            printWordClouds(joined, possible[key])
            allWords += (joined + " ")
    possible["all"] = stamp + "_" + "all"
    # Skip rendering when there are no words at all; only call the renderer
    # with actual text.
    if allWords.strip():
        printWordClouds(allWords, possible["all"])
    return possible

In [328]:
# Run the pipeline once for the query "yellowstone": harvest, keep the tweet text,
# classify, and render the word clouds.
feeds = processQuery("yellowstone", tAPI)
data = extractData(feeds)["text"]
(emotionData, emotionVocab, emotionTweets) = getEmotions(data)
possibleImages = getWordClouds(emotionVocab)

In [297]:
# function of plotting pie chart
def plotPieChart(dataSet):
    """Build a Bokeh pie chart of the sentiment counts in ``dataSet``.

    ``dataSet`` maps a sentiment label to its count. Each wedge's angle is
    its share of the total, and the hover tooltip shows the share in percent.
    Returns the figure without showing it.
    """
    frame = pd.Series(dataSet).reset_index(name='value').rename(columns={'index': 'sentiment'})
    frame['angle'] = frame['value'] / frame['value'].sum() * 2 * pi
    frame['color'] = Category20c[len(dataSet)]
    frame['percent'] = frame['value'] / sum(dataSet.values()) * 100

    chart = figure(
        plot_height=350,
        title="Sentiment Distribution",
        toolbar_location=None,
        tools="hover",
        tooltips="@sentiment: @percent{0.2f} %",
        x_range=(-0.5, 1.0),
    )
    chart.wedge(
        x=0, y=1, radius=0.4,
        start_angle=cumsum('angle', include_zero=True),
        end_angle=cumsum('angle'),
        line_color="white", fill_color='color', legend='sentiment',
        source=frame,
    )

    # Hide axes and grid so only the pie is visible.
    chart.axis.axis_label = None
    chart.axis.visible = False
    chart.grid.grid_line_color = None
    return chart

In [315]:
def plotTweets(tweets, condition):
    """Build a DataTable listing tweets and their emotion.

    ``tweets`` maps tweet text to its emotion label. Unless ``condition`` is
    "all", only tweets whose label equals ``condition`` are listed.
    """
    if condition == "all":
        shown = tweets
    else:
        shown = {text: emotion for (text, emotion) in tweets.items() if emotion == condition}

    frame = pd.DataFrame.from_records(
        {"Tweet": list(shown.keys()), "Emotion": list(shown.values())},
        columns=["Tweet", "Emotion"],
    )
    tableColumns = [TableColumn(field=name, title=name) for name in ("Tweet", "Emotion")]
    return DataTable(source=ColumnDataSource(frame), columns=tableColumns,
                     width=APP_WIDTH, height=APP_HEIGHT)

In [48]:
# function of plotting informative words
def plotTagCloud(dataset):
    """Build a figure with two crossing axis lines and four labelled end points.

    ``dataset`` is accepted but not used. Returns the figure without showing it.
    """
    canvas = figure(plot_width=750, plot_height=500, toolbar_sticky=True)

    # Horizontal and vertical line through the origin.
    for (xs, ys) in (([-6, 6], [0, 0]), ([0, 0], [-6, 6])):
        canvas.line(xs, ys, line_width=4)

    endpoints = ColumnDataSource(data=dict(
        x=[-6, 6, 0, 0],
        y=[0, 0, 6, -6],
        names=['unpleasant', 'pleasant', 'positive', 'negtive'],
    ))
    canvas.scatter(x='x', y='y', size=8, source=endpoints)
    canvas.add_layout(LabelSet(x='x', y='y', text='names', level='glyph',
                               x_offset=2, y_offset=2, source=endpoints,
                               render_mode='canvas'))

    # Hide axes and grid.
    canvas.axis.axis_label = None
    canvas.axis.visible = False
    canvas.grid.grid_line_color = None
    return canvas

In [330]:
def plotPanel(dataset, imageNames, condition, tweets):
    """Assemble the three result tabs: summary, tweets table and word cloud.

    Parameters
    ----------
    dataset : dict
        Maps an emotion label to its tweet count.
    imageNames : dict
        Maps an emotion label (and "all") to a word-cloud image name. The
        "all" entry is required and is the fallback when ``condition`` has
        no entry of its own.
    condition : str
        Selected emotion, or "all".
    tweets : dict
        Maps tweet text to its emotion label.

    Returns
    -------
    Tabs
        A Bokeh ``Tabs`` widget holding the three panels.
    """
    # Content for Panel 1: the label with the largest positive count
    # (empty string when every count is zero).
    most = ""
    maxi = 0
    for (key, value) in dataset.items():
        if value > maxi:
            most = key
            maxi = value
    # Headings are closed with </h3>. The former "<h3>...<h3>" opened a second
    # heading instead of closing the first.
    header1 = Div(text='<div align="Left" style="display:block">'
                       '<h3>People\'s emotion of this spot</h3>'
                       '<span style="display:block">Most people feel </span>'
                       '<br/>'
                       '<span style="color: blue">' + most + '</span>'
                       '</div>', width=700)

    pieChart = row(plotPieChart(dataset), height=600, width=APP_WIDTH)

    # Content for Panel 2
    header2 = Div(text='<div align="Left" style="display:block">'
                       '<h3>Harvested tweets and sentiments</h3><br></div>', width=700)
    select = imageNames["all"]
    if condition in imageNames.keys():
        select = imageNames[condition]
    dataTable = row(plotTweets(tweets, condition), height=600, width=APP_WIDTH)

    # Content for Panel 3
    header3 = Div(text='<div align="Left" style="display:block">'
                       '<h3>Word cloud for selected sentiment</h3><br></div>', width=700)
    # NOTE(review): the image is fetched from a hard-coded local notebook
    # server address; confirm it matches where the notebook is served.
    imgUrl = "http://localhost:8888/files/data/" + select + ".png"
    div = Div(text="<div style='margin:-10px'>"
                   "<img src=\"" + imgUrl + "\">"
                   "</div>", width=1000)

    # Layout Panels
    tab1 = Panel(child=column([header1, pieChart], height=APP_HEIGHT, width=APP_WIDTH), title='General Analyze Result')
    tab2 = Panel(child=column([header2, dataTable], height=APP_HEIGHT, width=APP_WIDTH), title='Tweets')
    tab3 = Panel(child=column([header3, div], height=APP_HEIGHT, width=APP_WIDTH), title='Word Cloud')
    return Tabs(tabs=[tab1, tab2, tab3])

In [339]:
# Handle the modification of HTML content
def modify_doc(doc):
    """Populate ``doc`` with the TravelTwitter GUI.

    The root is a ``row`` of three children: the input menu (index 0), a
    separator (index 1) and the content area (index 2). The content area
    starts with a welcome message and is replaced by status messages, error
    messages or the result tabs when one of the menu buttons is clicked.
    """

    loadingFrame = ('<iframe src="https://www.alarisworld.com/images/loading.gif" '
                    'width="200" height="200" frameBorder="0"></iframe>')
    statusTitles = {'HARVEST': 'Getting Tweets...',
                    'CLASSIFY': 'Classifying...',
                    'PLOT': 'Plotting...'}
    errorTitles = {'SEARCH': 'Error: No Results Found',
                   'INPUT': 'Error: Check your input'}

    def centered(html, width=APP_WIDTH):
        """Return a column with a 50px spacer above a centered HTML block."""
        spacer = Div(text='', sizing_mode='scale_width', height=50)
        content = Div(text='<div align="center" style="display:block">' + html + '</div>',
                      width=width)
        return column([spacer, content])

    # Create the main plot
    def dashboard(mode, data, imageName, condition, emotionTweets):
        """Return the welcome screen ('initial') or the result tabs ('results')."""
        if mode == 'initial':
            return centered('<h2>Welcome to TravelTwitter!</h2>'
                            '<h3>Author: Tianyu Qi, Chang Liu, Bohao Li</h3>', width=700)
        if mode == 'results':
            return plotPanel(data, imageName, condition, emotionTweets)
        raise ValueError('unknown dashboard mode: %r' % (mode,))

    # Handle Errors
    def error(code):
        """Return the error screen for ``code`` ('SEARCH' or 'INPUT')."""
        return centered('<h2>' + errorTitles[code] + '</h2>')

    def status(code):
        """Return the progress screen for ``code`` ('HARVEST', 'CLASSIFY' or 'PLOT')."""
        return centered('<h3>' + statusTitles[code] + '</h3><br><br>' + loadingFrame)

    # Update the plot
    def update():
        """Button callback: harvest, classify and display results for the typed location."""
        # The menu children are addressed by position: index 1 is the location
        # TextInput and index 4 is the sentiment Select (see buildMenu).
        menu = GUI.children[0]
        locName = menu.children[1].value.strip()
        condition = menu.children[4].value

        # Whitespace-only input is treated like empty input.
        if locName == '':
            GUI.children[2] = error('INPUT')
            return

        # NOTE(review): the sleeps below run inside the callback; whether the
        # intermediate status screens reach the browser before it returns has
        # not been verified. Confirm they are still wanted.
        GUI.children[2] = status('HARVEST')
        time.sleep(2)
        feeds = processQuery(locName, tAPI)
        data = extractData(feeds)["text"]
        if len(data) == 0:
            GUI.children[2] = error('SEARCH')
            return

        GUI.children[2] = status('CLASSIFY')
        time.sleep(1)
        (emotionData, emotionVocab, emotionTweets) = getEmotions(data)
        # With no classified tweet there are no words to render and nothing
        # to chart; report it instead of continuing.
        if not emotionTweets:
            GUI.children[2] = error('SEARCH')
            return
        possibleImages = getWordClouds(emotionVocab)

        # Display Results
        GUI.children[2] = status('PLOT')
        time.sleep(2)
        GUI.children[2] = dashboard('results', emotionData, possibleImages, condition, emotionTweets)
        # Reset Menu
        GUI.children[0] = buildMenu()

    # Construct Menu
    def buildMenu():
        """Return the input menu; both buttons trigger update()."""
        # Buttons
        submit = Button(label='Submit', button_type='primary')
        submit.on_click(update)
        conditionSubmit = Button(label='Condition', button_type='primary')
        conditionSubmit.on_click(update)

        # Location Input
        inputLabel = Div(text='<h3>Location Input</h3>', height=20)
        spotName = TextInput(value="", title='', sizing_mode='scale_width')

        # Condition Input
        conditionLabel = Div(text='<h3>Condition Input</h3>', height=20)
        select = Select(title="Sentiment", value="all",
                        options=["all", "sadness", "neutral", "surprise", "happiness",
                                 "relief", "fun", "anger", "bordom"])

        # Input Control - update() relies on this order of children.
        return widgetbox([inputLabel, spotName, submit, conditionLabel, select, conditionSubmit],
                         width=200)

    seperator = Div(text='', sizing_mode='scale_height', width=75)
    GUI = row([buildMenu(), seperator, dashboard('initial', None, None, None, None)],
              width=900, height=APP_HEIGHT)
    doc.add_root(GUI)
# Wrap modify_doc in a Bokeh Application so it can be passed to show().
handler = FunctionHandler(modify_doc)
app = Application(handler)
# NOTE(review): `doc` is not referenced again in the visible cells; confirm this call is still needed.
doc = app.create_document()

In [341]:
# Reset Bokeh's output settings, direct output to the notebook, then embed the app.
reset_output()
output_notebook()
# NOTE(review): notebook_url is hard-coded; presumably it must match the address the
# notebook is served from. Confirm before running elsewhere.
show(app, notebook_url="localhost:8888", notebook_handle=True)