In [8]:
# LEARNING GOALS
#
#                 - text as a datasource
#                 - cleaning text
#                 - basic eda
#                 - Doc Term Matrix representation by hand
#                 - The intuition behind working with text before jumping into tools that abstract this away
#                 - how text can be used in ML


In [9]:
# some helpful resources:
# https://www.w3schools.com/python/python_regex.asp
# https://docs.python.org/3/library/re.html
# https://www.debuggex.com/cheatsheet/regex/python
# https://www.shortcutfoo.com/app/dojos/python-regex/cheatsheet

In [10]:
# installs
# ! pip install newspaper3k
# ! pip install -U spacy
# ! pip install wordcloud
# ! pip install emoji
# ! pip install nltk
# ! pip install scikit-plot

In [11]:
# imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scikitplot as skplot

# some "fun" new packages
from wordcloud import WordCloud
import emoji

import re

# new imports for text specific tasks
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,TfidfVectorizer  
import nltk


In [63]:
# Toy corpus: eight short "documents" that mix upper/lower case, punctuation,
# an emoji, a URL, a @mention/#hashtag, digits and leading/trailing whitespace.
a = ['I like turtles!',
      'You like hockey and golf ',
      'Turtles and hockey ftw',
     'Python is very easy to learn. 🐍',
      'A great resource is www.spacy.io',
      ' Today is the Feb 22, 2021 !           ',
     '@username #hashtag https://www.text.com',
      'BA820 ']
# one row per document, held in a single `text` column
df = pd.DataFrame({'text':a})
df

Unnamed: 0,text
0,I like turtles!
1,You like hockey and golf
2,Turtles and hockey ftw
3,Python is very easy to learn. 🐍
4,A great resource is www.spacy.io
5,"Today is the Feb 22, 2021 !"
6,@username #hashtag https://www.text.com
7,BA820


In [13]:
## QUICK QUESTION
##        What do you see about the data being brought in?

In [64]:
## we can always get the values back
df.text.values.tolist()

['I like turtles!',
 'You like hockey and golf ',
 'Turtles and hockey ftw',
 'Python is very easy to learn. 🐍',
 'A great resource is www.spacy.io',
 ' Today is the Feb 22, 2021 !           ',
 '@username #hashtag https://www.text.com',
 'BA820 ']

In [66]:
# quick review of some of the string functionality
# we saw in 760

# capitalize or change case
# upper, lower, strip
# NOTE: neither result is assigned back to df, and a notebook cell only
# displays its last expression -- so the lower() result is discarded and the
# output shown is the strip() result (original casing, outer whitespace removed)
df.text.str.lower()
df.text.str.strip()

0                            I like turtles!
1                   You like hockey and golf
2                     Turtles and hockey ftw
3            Python is very easy to learn. 🐍
4           A great resource is www.spacy.io
5                Today is the Feb 22, 2021 !
6    @username #hashtag https://www.text.com
7                                      BA820
Name: text, dtype: object

In [68]:
# we can detect which documents contain a pattern:
# contains() returns one True/False per row, so sum() counts the matching rows
# (count() would only count the non-missing rows, i.e. every document)
df.text.str.contains("turtle").sum()
# and we can swap out a literal substring
# (regex=False -> "a" is treated as plain text, not as a pattern)
df.text.str.replace("a","ZZZ", regex=False)

0                                  I like turtles!
1                      You like hockey ZZZnd golf 
2                         Turtles ZZZnd hockey ftw
3              Python is very eZZZsy to leZZZrn. 🐍
4             A greZZZt resource is www.spZZZcy.io
5         TodZZZy is the Feb 22, 2021 !           
6    @usernZZZme #hZZZshtZZZg https://www.text.com
7                                           BA820 
Name: text, dtype: object

In [17]:
# remember python is case sensitive!


In [18]:
# we can replace anything that matches a pattern
# but we will come back to patterns


In [69]:
# we can look at the length
df.text.str.len()

0    15
1    25
2    22
3    31
4    32
5    39
6    39
7     6
Name: text, dtype: int64

In [20]:
#### NOTE:
##      but look at above, what do you notice about the lengths calculated?


In [21]:
# lets look at the values directly again for the last entry



In [73]:
# lets count letters and digits in each document
# (no quotes inside the brackets: within [...] a ' is just one more character
#  to match, so "['a-zA-Z0-9']" would also count apostrophes)
df.text.str.count("[a-zA-Z0-9]")

0    12
1    20
2    19
3    23
4    26
5    19
6    30
7     5
Name: text, dtype: int64

In [23]:
## regex
## https://www.regular-expressions.info/quickstart.html
##
## https://regex101.com/     <------------- fantastic resource
##
## [a-z] will match a single letter lowercase a to z
## [A-Z] will match a single letter uppercase A to Z
## [a-zA-Z0-9] will match a single character that is alphanumeric
## ^ matches a pattern at the start
## $ matches a pattern at the end
## + will match a pattern one or more times
## * will match 0 or more
## .* will match everything on a line (dot is any character except a newline, by default)
## {3} match pattern exactly 3 times
## {2,4} match a pattern 2 to 4 times
## {3,} match a pattern 3 or more times (no space inside the braces, or python treats it as literal text)
## | allows us to specify "or"
## so much more including special patterns and shortcuts
## \d for a digit
## \w for word characters
## \s for whitespace

In [76]:
# only print out entries if the pattern matches
# "tu+|BA" reads: a "t" followed by one or more "u", OR the literal text "BA".
# contains() gives a True/False mask per row, which we use to filter the column.
# Matching is case sensitive, which is why "Turtles and hockey ftw" is not returned.
FIND = df.text.str.contains("tu+|BA")
df.text[FIND]

0    I like turtles!
7             BA820 
Name: text, dtype: object

In [25]:
# again, case sensitive


In [26]:
# we can use "OR" logic


In [27]:
# matches


In [28]:
# more matches


In [29]:
# special characters anywhere - digits


In [30]:
# extract username or hashtag
# uses not whitespace character, repeating 1+


In [31]:
# you may get an error around capture groups
# a group is in parentheses


> Regular expressions and searching text can be a superpower when working with text.  If we have a large corpus, we can iterate over the documents and scan/search via regular expressions to extract our datasets!

In [32]:
## Thought Exercise:
##    Our datasets that we typically see take the shape of:
##    Rows =    Observations
##    Columns = Attributes about those Observations
## 
##    How can we map this to text?
##
##    Rows =    A document (the source, we will talk about this)
##    Columns = The words in the document
##   
##    Above can be referred to as a Document Term Matrix, or Document Feature Matrix
##


In [77]:
# lets reset the dataframe

df = pd.DataFrame({'doc':a})
df

Unnamed: 0,doc
0,I like turtles!
1,You like hockey and golf
2,Turtles and hockey ftw
3,Python is very easy to learn. 🐍
4,A great resource is www.spacy.io
5,"Today is the Feb 22, 2021 !"
6,@username #hashtag https://www.text.com
7,BA820


In [78]:
# naive tokenizer: split() with no arguments breaks each document on runs of
# whitespace, so case is kept and punctuation stays glued to words ("turtles!")
df['tokens'] = df.doc.str.split()
df.head(2)

Unnamed: 0,doc,tokens
0,I like turtles!,"[I, like, turtles!]"
1,You like hockey and golf,"[You, like, hockey, and, golf]"


In [79]:
# if we really wanted to (or had to), we 
# have the python chops to make this a doc/term matrix

# step 0, just the tokens but keep as a dataframe
# (double brackets select a one-column DataFrame rather than a Series)
tdf = df[['tokens']]

# step 1: melt it via explode -- every list element becomes its own row and
# the original row label (the document number) is repeated in the index
tdf_long = tdf.explode("tokens")
tdf_long

Unnamed: 0,tokens
0,I
0,like
0,turtles!
1,You
1,like
1,hockey
1,and
1,golf
2,Turtles
2,and


In [80]:
# step 2: back to wide for a dtm
# tag every token occurrence with a 1 so there is something to aggregate
tdf_long["value"] = 1

# rows = the repeated document labels left in the index by explode,
# columns = distinct tokens, cells = number of non-zero tags for that pair
dtm = pd.pivot_table(
    tdf_long,
    index=tdf_long.index,
    columns="tokens",
    values="value",
    aggfunc=np.count_nonzero,
)





In [83]:
dtm

tokens,!,#hashtag,2021,"22,",@username,A,BA820,Feb,I,Python,...,is,learn.,like,resource,the,to,turtles!,very,www.spacy.io,🐍
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
# lets review what we have
# any missing cell in the matrix is replaced with 0 so every cell holds a count
dtm = dtm.fillna(0)
dtm

tokens,!,#hashtag,2021,"22,",@username,A,BA820,Feb,I,Python,...,is,learn.,like,resource,the,to,turtles!,very,www.spacy.io,🐍
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
## Quick thought exercise:
##      What do you notice about our tokenized dataset
##      What about the values?  What would you change?
##   



In [38]:
################ YOUR TURN
##  from the topics table on big query (questrom.datasets.topics), 
##  bring in just the text column via select
##  Make the text lowercase
##  Tricky!! remove punctuation if you can (keep just letters and numbers)
##  get the text into a long form where each token is a row in the dataframe
##  

In [107]:
# read every column of the topics table from BigQuery
YOUR_BILLING_PROJECT = "ba820-329602"   # passed to read_gbq as the project id
SQL = "SELECT * from `questrom.datasets.topics`"
topics = pd.read_gbq(SQL, project_id=YOUR_BILLING_PROJECT)

# keep an untouched copy, since `topics` itself is modified by the cleaning cells
topic_ml = topics.copy()

In [99]:
topics

Unnamed: 0,id,text,topic
0,1,I'm going to be out of the country for about a...,Shipping
1,2,I was wondering if you'd be able to overnight ...,Shipping
2,3,The Swingline electronic stapler (472555) look...,Shipping
3,4,I think this cosmetic bag would work great for...,Shipping
4,5,I'm going to be out of the state for about a w...,Shipping
...,...,...,...
4995,3469,Do you provide tracking info with all your ord...,Shipping
4996,3474,Do you provide tracking info with all your ord...,Shipping
4997,3486,Do you provide tracking info with all your ord...,Shipping
4998,3494,Do you provide tracking info with all your ord...,Shipping


In [100]:
# NOTE: this only displays a lowercased copy -- the result is not assigned
# back, so topics.text keeps its original casing in the cells that follow
topics.text.str.lower()

0       i'm going to be out of the country for about a...
1       i was wondering if you'd be able to overnight ...
2       the swingline electronic stapler (472555) look...
3       i think this cosmetic bag would work great for...
4       i'm going to be out of the state for about a w...
                              ...                        
4995    do you provide tracking info with all your ord...
4996    do you provide tracking info with all your ord...
4997    do you provide tracking info with all your ord...
4998    do you provide tracking info with all your ord...
4999    do you provide tracking info with all your ord...
Name: text, Length: 5000, dtype: object

In [90]:
# drop every character that is not a word character (\w) or whitespace (\s).
# - raw string, so the backslashes reach the regex engine untouched
# - regex=True is spelled out: relying on the implicit default triggers a
#   warning here, and newer pandas versions treat the pattern as literal text
topics["text"] = topics['text'].str.replace(r'[^\w\s]', '', regex=True)

  topics["text"] = topics['text'].str.replace('[^\w\s]','')


In [101]:
def remove_punct(text):
  """Return `text` with every ASCII punctuation character removed.

  "Punctuation" means the characters listed in `string.punctuation`;
  letters, digits, whitespace and anything else (e.g. emoji) are kept.

  text: the string to clean; it is walked one character at a time.
  returns: a new string built from the characters that were kept.
  """
  import string
  # build the lookup set once per call rather than once per character
  punct = set(string.punctuation)
  return ''.join(ch for ch in text if ch not in punct)

topics['text'] = topics.text.apply(remove_punct)

In [102]:
# split each document on whitespace into a list of tokens, stored per row
topics['tokens'] = topics.text.str.split()
topics.head(2)

Unnamed: 0,id,text,topic,tokens
0,1,Im going to be out of the country for about a ...,Shipping,"[Im, going, to, be, out, of, the, country, for..."
1,2,I was wondering if youd be able to overnight a...,Shipping,"[I, was, wondering, if, youd, be, able, to, ov..."


In [103]:
# step 0: keep only the token lists, as a one-column dataframe
tdf = topics[['tokens']]

# step 1: melt it via explode -- one row per token, with the source row's
# label repeated in the index
tdf_long = tdf.explode("tokens")
tdf_long

Unnamed: 0,tokens
0,Im
0,going
0,to
0,be
0,out
...,...
4999,be
4999,eligible
4999,for
4999,overnight


In [104]:
# tag every token occurrence with a 1 so there is something to aggregate
tdf_long["value"] = 1

# long -> wide: rows = the repeated document labels in the index,
# columns = distinct tokens, cells = number of non-zero tags for that pair
dtm = pd.pivot_table(
    tdf_long,
    index=tdf_long.index,
    columns="tokens",
    values="value",
    aggfunc=np.count_nonzero,
)


In [105]:
dtm

tokens,000293,000907,002458,002694,002698,003456,004945,005789,006107,008082,...,young,youor,your,yours,youth,zip,zipped,zipper,zipperdetail,zippered
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,,,,,,,,,,,...,,,1.0,,,,,,,
4996,,,,,,,,,,,...,,,1.0,,,,,,,
4997,,,,,,,,,,,...,,,1.0,,,,,,,
4998,,,,,,,,,,,...,,,1.0,,,,,,,


In [96]:
# document/token pairs that never occur are NaN after the pivot; make them 0
dtm = dtm.fillna(0)
dtm

tokens,000293,000907,002458,002694,002698,003456,004945,005789,006107,008082,...,young,youor,your,yours,youth,zip,zipped,zipper,zipperdetail,zippered
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# just highlighting what is possible, you don't need to do this
# keep just the numbers and letters
# just highlighting that depending on your use cases, you can 
# roll your own functions to clean text
# pandas makes it easy to `apply` these to our text column!


In [40]:
#################################### Lets predict the category!
##
## we now have a dataset that can be used to fit a ML model.  
## the quality of the models and how we think about ML tasks is all about the data
## let's start with this framing for intuition
##
##


In [41]:
## get the topics data again

# topics = pd.read_gbq("SELECT * from `questrom.datasets.topics`", "questrom")
# topics.shape

In [42]:
# what do we have


In [43]:
# what do we have for a distro on topics?


In [44]:
# imports -- violating my rule of thumb, but lets put that aside for emphasis

# from sklearn.tree import DecisionTreeClassifier
# from sklearn.model_selection import train_test_split
# from sklearn import metrics


In [45]:
# remember, we have the topics data


In [46]:
# we can tokenize our data with sklearn pipelines
# above highlights we have full control, but there are frameworks that aim to abstract this for us
# abstractions have their own overhead costs, but lets build on top of sklearn to soften the impact

# cv = CountVectorizer()
# cv.fit(topics.text)

In [47]:
# we can easily have done fit_transform, but lets explore what was learned about our corpus

# get the vocabulary and their term:numeric id map
# this is a common representation for downstream word embedding tasks


In [48]:
# length


In [49]:
## make this a numeric matrix of document by term (dtm)


In [50]:
# confirm the shape is what we expect


In [51]:
# missing data are zeros


In [52]:
# make this a dataframe to help with our mental model

# dtm_df = pd.DataFrame(dtm.toarray(), columns=cv.get_feature_names())
# dtm_df.columns

In [53]:
# lets build the datasets for the model

# X = dtm_df.copy()
# y = topics.topic

In [54]:
# confirm we have the same thing


In [55]:
# split the data

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=820)

In [56]:
# fit the model

# tree = DecisionTreeClassifier(max_depth=5, min_samples_split=30, min_samples_leaf=15)
# tree.fit(X_train, y_train)


In [57]:
# fit metrics on test

# preds = tree.predict(X_test)
# ctable = metrics.classification_report(y_test, preds)
# print(ctable)

In [58]:
# confusion matrix from skplot
# we can see where the model isn't sure

# skplot.metrics.plot_confusion_matrix(y_test, preds, 
#                                      figsize=(7,4), 
#                                      x_tick_rotation=90 )



In [59]:
# accuracy score   <----- confirming the classification report


In [60]:
#################################### REVIEW
##
## - normal text form -> a DTM
## - we saw that tokenizing, and the logic we apply, matters (case, punctuation)
##     we will see even more examples
## - if we had to, we can parse text into a format for machine learning
## - nothing stopping us from passing in a count-based dtm into a ML model!
## 

In [61]:
############################################################
########################################### Team Challenge
############################################################
# 
## Work in Project Groups
# 
# - tokenize the dataset on Big Query from 
# URL link: https://console.cloud.google.com/bigquery?project=questrom&d=SMSspam&

## review the slides at the end of this module
## predict spam
## objective =  based on f1
## only input is text, but you can derive features
## limited time, but how do you maximize your time (and the model?)
## HINTS:
##        start small, simple models
##        iterate and see how you do against the leaderboard
##        code above helps you with the core mechanics


In [62]:
# get the datasets - select * is fine, but there are two datasets and an example submission to review!


! head myteam-submission.csv