In [69]:
import re
import numpy as np
import pandas as pd

from google.cloud import bigquery

from bert_serving.client import BertClient
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

from bokeh.plotting import figure, show, output_file
from bokeh.models import HoverTool,ColumnDataSource
import colorcet as cc

In [3]:
warriors_entities = [
    "Golden State","Warriors","Steve Kerr","Kevin Durant","Stephen Curry","DeMarcus Cousins",
    "Klay Thompson","Draymond Green","Andre Iguodala","Andrew Bogut","Damion Lee","Jordan Bell",
    "Shaun Livingston","Kevon Looney","Jonas Jerebko","Quinn Cook","Alfonzo McKinnie","Jacob Evans",
    "Damian Jones","Marcus Derrickson"
]

raptors_entities = [
    "Toronto","Raptors","Nick Nurse","Kawhi Leonard","Kyle Lowry","Jeremy Lin","Fred VanVleet",
    "Marc Gasol","Pascal Siakam","Danny Green","Serge Ibaka","OG Anunoby","Norman Powell",
    "Patrick McCaw""Chris Boucher","Jodie Meeks","Eric Moreland","Malcolm Miller","Jordan Loyd",
    "Nav Bhatia",
]

league_entities = [
    "NBA finals","NBA"
    ]

hashtags = [
    "#DubNation","#WeTheNorth","#Basketball","#Sports","#NBAFinals","#Warriors","#Raptors",
    "#GoldenState","#ESPN","#BBall","#Dunk","#Basket","#StephCurry","#KevinDurant",
    "#NBAbasketball","#GoldenStateWarriors","#Curry","#Hoops","#Player","#Game","#NBAhistory"
]

### Generate Encodings

Since we'll be getting a mean of the tweets' words, let's clean away stopwords to not be overwhelmed by those.

In [213]:
stopwords =    {"i", "me", "my", "myself", "we", "our", "ours", "ourselves", \
                "you", "your", "yours", "yourself", "yourselves", "he", "him", \
                "his", "himself", "she", "her", "hers", "herself", "it", "its", \
                "itself", "they", "them", "their", "theirs", "themselves", "what", \
                "which", "who", "whom", "this", "that", "these", "those", "am", "is", \
                "are", "was", "were", "be", "been", "being", "have", "has", "had", \
                "having", "do", "does", "did", "doing", "a", "an", "the", "and", \
                "but", "if", "or", "because", "as", "until", "while", "of", "at", \
                "by", "for", "with", "about", "against", "between", "into", \
                "through", "during", "before", "after", "above", "below", "to", \
                "from", "up", "down", "in", "out", "on", "off", "over", "under", \
                "again", "further", "then", "once", "here", "there", "when", "where", \
                "why", "how", "all", "any", "both", "each", "few", "more", "most", \
                "other", "some", "such", "no", "nor", "not", "only", "own", "same", \
                "so", "than", "s", "t", "can", "will", "just", "should", "", "fuck", "shit",
                "pussy", "bitch", "asshole", "bastard","ck"}

def clean_phrases(uncleaned_tweet, stopwords):
    cleaned_tokens = []
    for word in re.finditer(r'([a-z]+)\W', uncleaned_tweet.lower()):
        if word.group(1) not in stopwords:
            cleaned_tokens.append(word.group(1))
    return " ".join(cleaned_tokens)

In [214]:
# Initialize the BigQuery Client
client = bigquery.Client()

In [215]:
# Initialize the BERT as Service client
bc = BertClient(ip='127.0.0.1', port=5555)

In [217]:
# Perform a query.
QUERY = (
    """
    SELECT
        text, user.location
    FROM
        `myspringml2.nba_finals.nba_tweets_g3`
    WHERE
        text IS NOT NULL AND
        text NOT LIKE 'RT %'
    LIMIT 10000""")
query_job = client.query(QUERY)  # API request
rows = query_job.result()  # Waits for query to finish

In [218]:
raw_tweets = []
cleaned_tweets = []
compound_sentiments = []

analyzer = SentimentIntensityAnalyzer()

for row in rows:
    this_tweet_text = row.text
    cleaned_string = clean_phrases(this_tweet_text, stopwords)
    if cleaned_string:
        # Only add to the lists if we've successfully gotten a clean string
        raw_tweets.append(row.text)
        cleaned_tweets.append(cleaned_string)
        # Add sentiment score as well
        vs = analyzer.polarity_scores(this_tweet_text)
        compound_sentiments.append(vs['compound'])

In [219]:
# Get the encodings (the BERT server is set to REDUCE_MEAN so we will get an average
# of the encodings of each word.)
tweet_encodings = bc.encode(cleaned_tweets)

here is what you can do:
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)


### Clustering

In [220]:
N_CLUSTERS = 30

km = KMeans(n_clusters=N_CLUSTERS, max_iter=2400)

In [227]:
clust_labels = km.fit_predict(tweet_encodings)
seg_distances_to_all_ks = km.transform(tweet_encodings)
seg_distance_to_nearest_k = []
for i, label in enumerate(clust_labels):
    seg_distance_to_nearest_k.append(seg_distances_to_all_ks[i,label])

In [228]:
order_of_proximity_to_clust_centers = np.argsort(np.array(seg_distance_to_nearest_k))

### Combining to Pandas Dataframe and Subsetting to the top 10% most indicative of their clusters

In [229]:
d = {
    'raw_tweets':raw_tweets,
    'clust_labels':clust_labels,
    'proximity_order':order_of_proximity_to_clust_centers,
    'tweet_encodings_idx':range(0,len(tweet_encodings)),
    'compound_sentiment':compound_sentiments
}

combined_df = pd.DataFrame(d)

In [230]:
# Filtering for to 10%

filter_cutoff = int(len(clust_labels) * .1) + 1

In [231]:
filtered_df = combined_df[combined_df.proximity_order <= filter_cutoff]

In [232]:
filtered_df.head()

Unnamed: 0,raw_tweets,clust_labels,proximity_order,tweet_encodings_idx,compound_sentiment
15,Le propriétaire des Cavaliers Dan Gilbert se r...,14,41,15,0.0
17,“I don’t feel bad for Steph cause LeBron went ...,20,428,17,-0.5423
21,@BIGBOYCHILL Lmaoooo! I really love Toronto so...,20,386,21,0.6989
26,Bucket. Bucket.,12,577,26,0.0
28,"LeBron doesn’t want to be coached, so stop wit...",21,579,28,-0.3749


### Plotting on 2D Plot

First we reduce the dimensions to 2 using a process called TSNE.

In [233]:
selected_encodings = tweet_encodings[filtered_df.tweet_encodings_idx]

reduced_dimensions_2 = TSNE(n_components=2).fit_transform(selected_encodings)
filtered_df.loc[:,'x'] = [i[0] for i in reduced_dimensions_2];
filtered_df.loc[:,'y'] = [i[1] for i in reduced_dimensions_2];

reduced_dimensions_1 = TSNE(n_components=1).fit_transform(selected_encodings)
filtered_df.loc[:,'z'] = [i[0] for i in reduced_dimensions_1];

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the document

In [234]:
filtered_df

Unnamed: 0,raw_tweets,clust_labels,proximity_order,tweet_encodings_idx,compound_sentiment,x,y,z
15,Le propriétaire des Cavaliers Dan Gilbert se r...,14,41,15,0.0000,41.864017,11.296360,33.460182
17,“I don’t feel bad for Steph cause LeBron went ...,20,428,17,-0.5423,-11.868267,16.270046,-12.110106
21,@BIGBOYCHILL Lmaoooo! I really love Toronto so...,20,386,21,0.6989,3.265311,9.574461,16.489031
26,Bucket. Bucket.,12,577,26,0.0000,10.330570,29.069399,9.004966
28,"LeBron doesn’t want to be coached, so stop wit...",21,579,28,-0.3749,-11.052497,17.373768,-11.762106
43,@BR_NBA @NBA A G,9,804,43,0.0000,-3.549524,-27.288439,-5.594676
46,😎 Raptors point guard Kyle Lowry pushed by a f...,11,229,46,0.6486,-16.665897,-11.774182,-7.903577
49,I like Canada.\nI like Stamp Collecting.\nI li...,25,183,49,0.8481,-11.417602,3.187245,-13.488012
93,I hate Steph Curry but big respect to him for ...,23,612,93,0.9468,-16.924686,10.944407,-14.957760
109,#NBAFinals RT @RaptorsNationCP: Nick Nurse is ...,5,537,109,-0.2960,4.828117,-7.938273,17.359179


In [235]:
# Set up our colors
nba_category_palette = cc.glasbey[:N_CLUSTERS]
fills = [nba_category_palette[clust] for clust in filtered_df.clust_labels]

### Plot 1: 2D Topic Clustering

In [242]:
output_file('tweet_clusters_2dtopic.html')

source = ColumnDataSource(
        data=dict(x=filtered_df['x'],y=filtered_df['y'],
            tweet = filtered_df['raw_tweets'],
            fills = fills))

hover = HoverTool(tooltips=[("Text","@tweet")])

tsne_plot_a = figure(tools=[hover])
tsne_plot_a.scatter('x','y',source=source,fill_color='fills',fill_alpha=0.5,radius=.8,line_color=None)

show(tsne_plot_a)

In [243]:
output_file('tweet_clusters_1dsent_1dtopic.html')

source = ColumnDataSource(
        data=dict(x=filtered_df['z'],y=filtered_df['compound_sentiment'],
            tweet = filtered_df['raw_tweets'],
            fills = fills))

hover = HoverTool(tooltips=[("Text","@tweet")])

tsne_plot_b = figure(tools=[hover])
tsne_plot_b.scatter('x','y',source=source,fill_color='fills',fill_alpha=0.5,radius=.6,line_color=None)

show(tsne_plot_b)

In [209]:
from bokeh.embed import file_html
from bokeh.resources import CDN

In [244]:
html_a = file_html(tsne_plot_a, CDN)
print(html_a)





<!DOCTYPE html>
<html lang="en">
  
  <head>
    
      <meta charset="utf-8">
      <title>Bokeh Application</title>
      
      
        
          
        <link rel="stylesheet" href="https://cdn.pydata.org/bokeh/release/bokeh-1.0.2.min.css" type="text/css" />
        
        
          
        <script type="text/javascript" src="https://cdn.pydata.org/bokeh/release/bokeh-1.0.2.min.js"></script>
        <script type="text/javascript">
            Bokeh.set_log_level("info");
        </script>
        
      
      
    
  </head>
  
  
  <body>
    
      
        
          
          
            
              <div class="bk-root" id="61e3f4f4-01b7-4ac8-ab7b-141565779d67"></div>
            
          
        
      
      
        <script type="application/json" id="4647">
          {"7298d173-bfb9-4098-af58-c113421ba638":{"roots":{"references":[{"attributes":{},"id":"4435","type":"BasicTickFormatter"},{"attributes":{"callback":null,"tooltips":[["Text","@tweet"]]},"id":

In [245]:
html_b = file_html(tsne_plot_b, CDN)
print(html_b)





<!DOCTYPE html>
<html lang="en">
  
  <head>
    
      <meta charset="utf-8">
      <title>Bokeh Application</title>
      
      
        
          
        <link rel="stylesheet" href="https://cdn.pydata.org/bokeh/release/bokeh-1.0.2.min.css" type="text/css" />
        
        
          
        <script type="text/javascript" src="https://cdn.pydata.org/bokeh/release/bokeh-1.0.2.min.js"></script>
        <script type="text/javascript">
            Bokeh.set_log_level("info");
        </script>
        
      
      
    
  </head>
  
  
  <body>
    
      
        
          
          
            
              <div class="bk-root" id="6143315f-ccca-4a2b-b6d6-a81022b743d3"></div>
            
          
        
      
      
        <script type="application/json" id="4704">
          {"5b8731d5-6322-47f3-b243-4c7e561532c1":{"roots":{"references":[{"attributes":{},"id":"4530","type":"BasicTickFormatter"},{"attributes":{},"id":"4507","type":"LinearScale"},{"attributes":{"f