In [None]:
! pip install umap-learn

In [2]:
import pandas as pd
from umap import UMAP

In [None]:
SQL = "SELECT * from `questrom.datasets.airline-intents` where intent in ('atis_airfare', 'atis_ground_service', 'atis_airline', 'atis_abbreviation')"
msgs = pd.read_gbq(SQL, "questrom")

In [None]:
msgs.head(3)

In [3]:
# vectorize the data
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer



In [None]:
# vectorize the data
tf = TfidfVectorizer()
tf.fit(msgs.text)

dtm = tf.transform(msgs.text)




In [None]:
type(dtm)

In [None]:
dtm.shape

In [None]:
# umap for DR

umap = UMAP(random_state=820)
embeds = umap.fit_transform(dtm)


In [None]:
type(embeds)

In [None]:
embeds.shape

In [None]:
msgs.shape

In [None]:
# scatterplot for viz
import seaborn as sns

In [None]:
# plot it up
sns.scatterplot(x=embeds[:, 0], y=embeds[:, 1], hue=msgs.intent)


In [7]:
##  there is a table on big query
##  questrom.datasets.bruins_twitter
##
##  get the records where the hour is 0,1,2,3
##  this is not a select *, you have to filter records
##  - TRICKY: apply afinn sentiment to each record
##  - ensure that the data sorted by status_id
##  - plot the sentiment score over the records (this is a timeseries - like view)
##  - calculate the average sentiment by hour
##
##

SQL = "SELECT * from `questrom.datasets.bruins_twitter`"
tweets = pd.read_gbq(SQL, "ba-820-business-analytics")

In [8]:
# a quick look
tweets.sample(3)

Unnamed: 0,status_id,created_at,text,source,hour,minute,day
2720,1204183214610436096,2019-12-09T23:37:05Z,Decade In Review: 2013 was about #BostonStrong...,Twitter for iPhone,23,37,9
7522,1204928230572068864,2019-12-12T00:57:31Z,20 minutes in the books.\n\n#NHLBruins https:/...,Twitter for Android,0,57,12
8622,1204191006079541253,2019-12-10T00:08:03Z,To the rink.\n\n#NHLBruins https://t.co/M1v6rM...,Twitter for iPhone,0,8,10


In [9]:
# info of the dataset
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10724 entries, 0 to 10723
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   status_id   10724 non-null  int64 
 1   created_at  10724 non-null  object
 2   text        10724 non-null  object
 3   source      10724 non-null  object
 4   hour        10724 non-null  int64 
 5   minute      10724 non-null  int64 
 6   day         10724 non-null  int64 
dtypes: int64(4), object(3)
memory usage: 586.6+ KB


In [None]:
! pip install afinn

In [10]:
# apply the sentiment over all of the records
from afinn import Afinn

afinn = Afinn()

# function
def sentiment(text):
  return afinn.score(text)



In [11]:
# apply the sentiment score
tweets['sent'] = tweets.text.apply(sentiment)

In [13]:
tweets.head(10)

Unnamed: 0,status_id,created_at,text,source,hour,minute,day,sent
0,1204113478518288385,2019-12-09T18:59:59Z,Cleared Waivers\nSteven Kampfer (D) | Boston B...,CapFriendly Transactions,18,59,9,1.0
1,1204115033443241992,2019-12-09T19:06:10Z,#NHLBruins center Patrice Bergeron is set to r...,Twitter Media Studio,19,6,9,0.0
2,1204115626597462018,2019-12-09T19:08:31Z,IR → NHL\nPatrice Bergeron (C) | Boston Bruins...,CapFriendly Transactions,19,8,9,0.0
3,1204116299930120193,2019-12-09T19:11:11Z,Steven Kampfer: Has been reassigned to the AHL...,Left Wing Lock News Feed,19,11,9,0.0
4,1204116764931641345,2019-12-09T19:13:02Z,Bruins at Senators 12/9/19 - #NHL Picks &amp; ...,Buffer,19,13,9,0.0
5,1204118852893847552,2019-12-09T19:21:20Z,#NHLBruins center Patrice Bergeron speaks on h...,Twitter Media Studio,19,21,9,-2.0
6,1204129128594726922,2019-12-09T20:02:10Z,NOW AIRING - @BigBadBruinsPod w/ @iglen31 and ...,Radio.co now playing,20,2,9,4.0
7,1204136907204104193,2019-12-09T20:33:05Z,Refs Dan O'Halloran and Kelly Sutherland work ...,Postcron App,20,33,9,0.0
8,1204141617667596288,2019-12-09T20:51:48Z,Patrice Bergeron returns to the lineup tonight...,Twitter Media Studio,20,51,9,0.0
9,1204157277495865344,2019-12-09T21:54:01Z,Las Vegas Odds &amp; #NHLPicks and Parlays by ...,Sprout Social,21,54,9,0.0


In [14]:
# describe
tweets.sent.describe()

count    10724.000000
mean         0.834297
std          2.927782
min        -20.000000
25%          0.000000
50%          0.000000
75%          2.000000
max         26.000000
Name: sent, dtype: float64

In [15]:
# plot sentiment over time
tweets.sort_values("status_id", ascending=True, inplace=True)

# properly handle the datetime
tweets['created'] = pd.to_datetime(tweets.created_at)

In [None]:
# plot
sns.lineplot(x=tweets.index, y=tweets.sent)


In [None]:
sns.lineplot(x=tweets.created, y=tweets.sent)

In [None]:
# aggregate by hour
tweets.groupby("hour").agg({'sent':['size', 'mean']})