## In your own words: Computing customer similarity using website text data
#### Workshop developed for DSS Austin '19
#### By: Ben Batorsky [github](https://github.com/bpben)


### Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import re

In [2]:
# optional, can run exercise code without these
import Stemmer
stemmer = Stemmer.Stemmer('english')
import spacy
nlp = spacy.load('en_core_web_sm')
from gensim.models.phrases import Phrases

In [6]:
# directory (relative path) that holds the workshop data files
DATA_PATH = './data/'

### Data ingestion
The data for the workshop is the text of a random sample of business websites.

In [7]:
# load the pickled website dataset and keep its 'content' column (the page text) on its own
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source
full_data = pd.read_pickle(DATA_PATH+'website_text.pkl')
w_content = full_data['content']

In [369]:
#full_data.content.iloc[0]

### Preprocessing


In [11]:
# capitalization: token counts before and after lowercasing
text = "We're Cowboys fans, but we're not cowboys"
for variant in (text, text.lower()):
    print(Counter(variant.split()))

Counter({"We're": 1, 'Cowboys': 1, 'fans,': 1, 'but': 1, "we're": 1, 'not': 1, 'cowboys': 1})
Counter({"we're": 2, 'cowboys': 2, 'fans,': 1, 'but': 1, 'not': 1})


In [13]:
# punctuation: remove every character that is not a letter, a digit or a space
text = "We're Cowboys fans, but we're not cowboys"
strip_punct = '[^A-Za-z0-9 ]'
print(text, re.sub(strip_punct, '', text), sep='\n')

We're Cowboys fans, but we're not cowboys
Were Cowboys fans but were not cowboys


In [14]:
# numbers: remove every digit (separators such as '-' are left behind)
text = 'Call 867-5309'
strip_num = '[0-9]'
print(text, re.sub(strip_num, '', text), sep='\n')

Call 867-5309
Call -


In [16]:
# urls
# regex from textacy: https://github.com/chartbeat-labs/textacy
# Matches a domain followed by one or more '/' and a short (2-12 character) path;
# because the '/' is required, a bare domain with no path is not matched.
SHORT_URL_REGEX = re.compile(
    # start of string, or a position not preceded by a word character, '/' or '.'
    r"(?:^|(?<![\w/.]))"
    # optional scheme
    r"(?:(?:https?://)?)"
    # domain
    r"(?:\w-?)*?\w+(?:\.[a-z]{2,12}){1,3}"
    # at least one slash between the domain and the path
    r"/+"
    # hash
    r"[^\s.,?!'\"|+]{2,12}"
    # end of string, or a next character that is not a word character or one of ? ! + & /
    r"(?:$|(?![\w?!+&/]))",
    flags=re.IGNORECASE)
text = 'Check out this conference: https://datascience.salon/austin/'
print(text)
# replace every match with the empty string
print(SHORT_URL_REGEX.sub('', text))

Check out this conference: https://datascience.salon/austin/
Check out this conference: 


#### Exercise: Write your preprocessing script
Combine some of the regular expressions above (or write your own!) to process the text data.

In [17]:
def preprocess(text):
    """Clean one document: strip URLs, then digits, then punctuation, and lowercase the result.

    Uses the module-level ``SHORT_URL_REGEX``, ``strip_num`` and ``strip_punct``
    patterns; every match is replaced with the empty string.
    """
    cleaned = SHORT_URL_REGEX.sub('', text)
    # digits first, then anything that is not a letter, digit or space
    for pattern in (strip_num, strip_punct):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.lower()

In [18]:
# run the cleaning function over every document's text
w_processed = w_content.apply(preprocess)

### From text to vectors


In [19]:
# display the default parameter settings of CountVectorizer
CountVectorizer()

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [26]:
# n-grams: vocabulary learned for ngram_range (1, 1), (1, 2) and (1, 3)
# note: 1-letter words are dropped by default
text = ['I have a lovely bunch of coconuts']
for n in (1, 2, 3):
    vec = CountVectorizer(ngram_range=(1, n)).fit(text)
    vocab = list(vec.vocabulary_)
    print('1 to {}-grams: '.format(n), vocab)

1 to 1-grams:  ['have', 'lovely', 'bunch', 'of', 'coconuts']
1 to 2-grams:  ['have', 'lovely', 'bunch', 'of', 'coconuts', 'have lovely', 'lovely bunch', 'bunch of', 'of coconuts']
1 to 3-grams:  ['have', 'lovely', 'bunch', 'of', 'coconuts', 'have lovely', 'lovely bunch', 'bunch of', 'of coconuts', 'have lovely bunch', 'lovely bunch of', 'bunch of coconuts']


In [370]:
# choosing cutoffs: nine "coconuts" documents plus a single "pears" document
texts = ['I have a lovely bunch of coconuts'] * 9 + ['I have a lovely bunch of pears']
for label, cutoff in (('Default (no minimum): ', {}),
                      ('Appear in >=20% of docs: ', {'min_df': .2}),
                      ('Appear in <=10% of docs: ', {'max_df': .1})):
    vec = CountVectorizer(**cutoff).fit(texts)
    print(label, list(vec.vocabulary_))

Default (no minimum):  ['have', 'lovely', 'bunch', 'of', 'coconuts', 'pears']
Appear in >=20% of docs:  ['have', 'lovely', 'bunch', 'of', 'coconuts']
Appear in <=10% of docs:  ['pears']


In [28]:
# stemming (PyStemmer) versus lemmatizing (spaCy) for a few words
words = ['ponies', 'operation', 'are']
for w in words:
    stem = stemmer.stemWord(w)
    print('Stem of {}: {}'.format(w, stem))
    # the lemma is taken from the first token of the parsed word
    lemma = nlp(w)[0].lemma_
    print('Lemma of {}: {}'.format(w, lemma))
    print()

Stem of ponies: poni
Lemma of ponies: pony

Stem of operation: oper
Lemma of operation: operation

Stem of are: are
Lemma of are: be



In [29]:
# entities with spaCy: print each detected entity with its label
text = "I'm a Cowboys fan, but I'm not a cowboy"
parsed = nlp(text)
ents = parsed.ents
for e in ents:
    print(e, e.label_)

Cowboys ORG


In [59]:
# entities with gensim - co-location
# This example is a bit odd: Likely not dealing with a bunch of duplicates
# Worth noting that, all likelihood being equal, gensim picks the first in a series of bigrams
texts = ['have a lovely bunch of coconuts'] * 4 + ['have a lovely bunch of pears'] * 2
split_texts = [x.split() for x in texts]
phrases = Phrases(split_texts, min_count=1, threshold=1)
phrases_m2 = Phrases(split_texts, min_count=2, threshold=1)
# scores are based on https://radimrehurek.com/gensim/models/phrases.html#gensim.models.phrases.original_scorer
# higher = more likely to be a bigram
# score the bigrams of the last ("pears") document under each model
last_doc = [split_texts[-1]]
print('with min count = 1:')
print(list(phrases.export_phrases(last_doc)))
print('with min count = 2')
print(list(phrases_m2.export_phrases(last_doc)))

with min count = 1:
[(b'have a', 1.8055555555555556), (b'lovely bunch', 1.8055555555555556), (b'of pears', 1.0833333333333333)]
with min count = 2
[(b'have a', 1.4444444444444444), (b'lovely bunch', 1.4444444444444444)]


In [60]:
# creating count vectors
text = "We are Cowboys fans, but we are not cowboys"
text_tagged = "We are CowboysORG fans, but we are not cowboys"
# both phrases lowercased: first without the entity tag, then with it
texts = [text.lower(), text_tagged.lower()]
# utility to display vectorizer
def display_vec(vec, data):
    """Return the vectorized documents as a DataFrame with one column per vocabulary term.

    Parameters
    ----------
    vec : fitted vectorizer exposing ``get_feature_names_out()`` or ``get_feature_names()``
    data : matrix-like object with a ``toarray()`` method (e.g. the vectorizer's output)

    Returns
    -------
    pandas.DataFrame with one row per document and one column per feature name.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # when it exists and fall back to the old name on older releases
    if hasattr(vec, 'get_feature_names_out'):
        columns = vec.get_feature_names_out()
    else:
        columns = vec.get_feature_names()
    return pd.DataFrame(data.toarray(), columns=columns)
# count vector: how many times each term occurs per document
vec = CountVectorizer()
print('count vectors \n', display_vec(vec, vec.fit_transform(texts)))
# binary count vector: 1 if the term occurs at all, otherwise 0
b_vec = CountVectorizer(binary=True)
data = b_vec.fit_transform(texts)
print('binary vectors \n', display_vec(b_vec, data))

count vectors 
    are  but  cowboys  cowboysorg  fans  not  we
0    2    1        2           0     1    1   2
1    2    1        1           1     1    1   2
binary vectors 
    are  but  cowboys  cowboysorg  fans  not  we
0    1    1        1           0     1    1   1
1    1    1        1           1     1    1   1


#### Exercise: Remove the stopwords from the above texts
Use what we explored above to remove the stopwords from the count vectors of the following texts

In [61]:
texts = [
    'we are cowboys fans, but we are not cowboys',
    'we are cowboysorg fans, but we are not cowboys',
]

# the built-in English stopword list removes words such as "we", "are", "but" and "not"
nostop_vec = CountVectorizer(stop_words='english').fit(texts)
data = nostop_vec.transform(texts)
print('count vectors without stopwords \n', display_vec(nostop_vec, data))

count vectors without stopwords 
    cowboys  cowboysorg  fans
0        2           0     1
1        1           1     1


#### Exercise: Set limits on the vocabulary to remove potentially irrelevant words
With the following set of texts, set a limit to remove unimportant words like "Patriots"

In [62]:
texts = ['We are Cowboys fans',
        'We are cowboys',
        'We are Patriots fans']
texts_lower = [t.lower() for t in texts]

# an integer min_df is a document count: keep only terms found in at least two documents
limit_vec = CountVectorizer(min_df=2)
# fit on the lowercased copy prepared above (it was computed but never used);
# the result is the same because CountVectorizer lowercases by default
data = limit_vec.fit_transform(texts_lower)
print('Count vector with vocab limit \n', display_vec(limit_vec, data))

Count vector with vocab limit 
    are  cowboys  fans  we
0    1        1     1   1
1    1        1     0   1
2    1        0     1   1


In [63]:
# TF-IDF weighting
# display the default parameter settings of TfidfVectorizer
TfidfVectorizer()

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [64]:
# two short documents that differ only in the team name
texts = ['We are Cowboys fans',
         'We are Patriots fans']

In [65]:
# calculate term frequency: raw counts of each term in each document
vec = CountVectorizer().fit(texts)
count_vectors = vec.transform(texts)
count_df = display_vec(vec, count_vectors)
print(count_df)

   are  cowboys  fans  patriots  we
0    1        1     1         0   1
1    1        0     1         1   1


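Formula for inverse document frequency weight, where $N$ is the number of documents and $df(t)$ is the number of documents that contain term $t$:

$$\log\left(\frac{N}{df(t)}\right) + 1$$

"smooth" option ensures no zero-divisions:

$$\log\left(\frac{N+1}{df(t)+1}\right) + 1$$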

In [67]:
# get inverse document frequency (smoothed form: log((N + 1) / (df(t) + 1)) + 1)
# N is taken from the data instead of being hard-coded, and document frequency
# counts the documents containing a term rather than summing its occurrences
n_docs = count_df.shape[0]
doc_freq = (count_df > 0).sum()
df = np.log((1 + n_docs) / (1 + doc_freq)) + 1
print(df)

are         1.000000
cowboys     1.405465
fans        1.000000
patriots    1.405465
we          1.000000
dtype: float64


In [68]:
def _l2_normalize(row):
    """Scale a row so that its Euclidean length is 1."""
    return row/np.sqrt(row.dot(row))

# calculate tfidf: term counts weighted by the idf of each term
tfidf_df = count_df*df
# normalize each document (row) to unit length
print(tfidf_df.apply(_l2_normalize, axis=1))

        are   cowboys      fans  patriots        we
0  0.448321  0.630099  0.448321  0.000000  0.448321
1  0.448321  0.000000  0.448321  0.630099  0.448321


In [69]:
# now with scikit-learn
# TfidfVectorizer with default settings; its printed result matches the manual calculation above
tfidf = TfidfVectorizer()
data = tfidf.fit_transform(texts)
print(display_vec(tfidf, data))

        are   cowboys      fans  patriots        we
0  0.448321  0.630099  0.448321  0.000000  0.448321
1  0.448321  0.000000  0.448321  0.630099  0.448321


#### Exercise: Turn text to vectors
Using what we've gone through above, create your own count vectorizer and TFIDF vectorizer.  Apply these vectorizers to the data, and store the result.

In [70]:
# settings shared by both vectorizers: float min_df/max_df are proportions of documents,
# so terms must appear in at least 0.5% and at most 30% of documents; English stopwords are dropped
vector_params = {'min_df': .005, 'max_df': .3, 'stop_words':'english'}

In [371]:
# build both vectorizers from the same parameter set
count_vectorizer, tfidf_vectorizer = (
    vectorizer_cls(**vector_params)
    for vectorizer_cls in (CountVectorizer, TfidfVectorizer)
)

In [372]:
# fit each vectorizer on the preprocessed text and keep the resulting document-term matrices
count_vecs = count_vectorizer.fit_transform(w_processed)
tfidf_vecs = tfidf_vectorizer.fit_transform(w_processed)

### Matrix factorization and topic modelling

#### Latent Semantic Indexing
In scikit-learn this is implemented as TruncatedSVD, a version of SVD in which only the top k components (those with the largest singular values) are retained


In [73]:
# display the default parameter settings of TruncatedSVD
TruncatedSVD()

TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=None, tol=0.0)

In [175]:
# couple examples website text
# choose some from pretty opposite industries
random_state = 9
example_inds = ['Health and Fitness', 'Home & Home Improvement']
# two seeded random rows per industry, collected as index labels
example_idxs = [
    idx
    for industry in example_inds
    for idx in full_data[full_data.type==industry]
    .sample(n=2, random_state=random_state)
    .index.tolist()
]
example_texts = w_content.loc[example_idxs]
example_texts

812    Basketball Training in Greenville, South Carol...
558    Personal Trainer | Nutrition Coach | Bolton, M...
610    Siding company in Millersburg, OH | Holmes Sid...
946    Glass solutions in Devils Lake, ND  | The Glas...
Name: content, dtype: object

In [373]:
# LSI requires tfidf-weighted vectors, use from above
tfidf_example = tfidf_vectorizer.transform(example_texts)
# create display of examples
display_example = display_vec(tfidf_vectorizer, tfidf_example)
# for clarity, keep only the vocabulary columns that occur in at least one example
occurs = (display_example.sum(axis=0)>0).values
display_example.loc[:, occurs]

Unnamed: 0,accepted,action,added,advanced,advantage,afternoon,allowed,answer,application,area,...,websites,weeks,weight,wellness,windows,working,workmanship,worth,year,young
0,0.0,0.0,0.0,0.034436,0.036078,0.0,0.0,0.0,0.0,0.042093,...,0.04141,0.037496,0.0,0.0,0.0,0.02398,0.0,0.036615,0.0,0.034227
1,0.0,0.06906,0.0,0.0,0.0,0.082503,0.08393,0.0,0.129506,0.0,...,0.0,0.0,0.131218,0.338251,0.0,0.0,0.0,0.0,0.0,0.0
2,0.041274,0.0,0.036495,0.0,0.0,0.0,0.0,0.067037,0.0,0.0,...,0.0,0.0,0.0,0.0,0.107877,0.0,0.079786,0.0,0.02515,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071538,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [374]:
def display_components(model, word_features, top_display=5):
    """Print the highest-weighted words of every component of a fitted model.

    Parameters
    ----------
    model : object with a ``components_`` array (one row of word weights per component)
    word_features : sequence mapping a column position to its word
    top_display : number of words printed per component

    Prints a "Topic <n>:" header followed by the words, space-separated,
    in order of decreasing weight.
    """
    for component_no, weights in enumerate(model.components_):
        print("Topic %d:" % (component_no))
        # argsort is ascending, so reverse it to rank the largest weights first
        ranked = weights.argsort()[::-1]
        print(" ".join(word_features[j] for j in ranked[:top_display]))

In [375]:
# specify number of components
n_components = 4
svd = TruncatedSVD(n_components=n_components, random_state=random_state)
svd_example = svd.fit_transform(tfidf_example)
# get_feature_names() was removed in scikit-learn 1.2: use get_feature_names_out()
# when the vectorizer has it, otherwise fall back to the old method
get_names = (getattr(tfidf_vectorizer, 'get_feature_names_out', None)
             or tfidf_vectorizer.get_feature_names)
display_components(svd, get_names())

Topic 0:
training basketball coaching glass skills
Topic 1:
siding glass contractors lake products
Topic 2:
siding contractors products installation gutters
Topic 3:
basketball skills training camps combine


In [376]:
# for display: label each row with the first 50 characters of its text
row_labels = [doc[:50] for doc in example_texts]
pd.DataFrame(svd_example, index=row_labels)

Unnamed: 0,0,1,2,3
"Basketball Training in Greenville, South Carolina",0.712078,-0.203583,-0.073317,0.667925
"Personal Trainer | Nutrition Coach | Bolton, MA A",0.634499,-0.444886,0.10218,-0.623736
"Siding company in Millersburg, OH | Holmes Siding",0.320416,0.634437,0.702907,-0.0273
"Glass solutions in Devils Lake, ND | The Glass Sh",0.358093,0.625431,-0.66421,-0.198572


#### Exercise: Create LSI vectors
Using the TFIDF vectors from above, create LSI vectors for the website text data.

In [377]:
# likely better to use more than 4 components
n_components = 10
# seed the randomized SVD (as in the 4-document example) so the topics are reproducible
lsi = TruncatedSVD(n_components=n_components, random_state=random_state)
lsi_vecs = lsi.fit_transform(tfidf_vecs)

In [378]:
# display the summary of the TF-IDF matrix that LSI was fitted on
tfidf_vecs

<1000x4591 sparse matrix of type '<class 'numpy.float64'>'
	with 100650 stored elements in Compressed Sparse Row format>

In [379]:
# get_feature_names() was removed in scikit-learn 1.2: use get_feature_names_out()
# when the vectorizer has it, otherwise fall back to the old method
get_names = (getattr(tfidf_vectorizer, 'get_feature_names_out', None)
             or tfidf_vectorizer.get_feature_names)
display_components(lsi, get_names())

Topic 0:
care products read pm health
Topic 1:
cleaning repair carpet commercial residential
Topic 2:
insurance auto coverage cleaning repair
Topic 3:
insurance menu food pizza catering
Topic 4:
cleaning care insurance health carpet
Topic 5:
repair auto care massage car
Topic 6:
repair auto cleaning car computer
Topic 7:
care dental law patient dr
Topic 8:
spa law hair estate salon
Topic 9:
massage pest therapy lawn control


#### Non-negative matrix factorization

In [380]:
# display the default parameter settings of NMF
NMF()

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=None, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [381]:
# reminder of the example documents used for the small demonstration
print(example_texts)

812    Basketball Training in Greenville, South Carol...
558    Personal Trainer | Nutrition Coach | Bolton, M...
610    Siding company in Millersburg, OH | Holmes Sid...
946    Glass solutions in Devils Lake, ND  | The Glas...
Name: content, dtype: object


In [382]:
# NMF also requires tfidf-weighted vectors
# re-create the example vectors with the already-fitted vectorizer (transform only, no refit)
tfidf_example = tfidf_vectorizer.transform(example_texts)

In [383]:
# specify number of components
# with NMF, n_components must be <= number of documents
n_components = 4
# seed the factorization so the example topics are reproducible
nmf = NMF(n_components=n_components, random_state=random_state)
nmf_example = nmf.fit_transform(tfidf_example)
# get_feature_names() was removed in scikit-learn 1.2: use get_feature_names_out()
# when the vectorizer has it, otherwise fall back to the old method
get_names = (getattr(tfidf_vectorizer, 'get_feature_names_out', None)
             or tfidf_vectorizer.get_feature_names)
display_components(nmf, get_names())

Topic 0:
training basketball skills camps combine
Topic 1:
glass lake nd shop mail
Topic 2:
siding contractors products installation gutters
Topic 3:
coaching wellness choices personal foods


In [384]:
# label each row with the first 50 characters of its text
row_labels = [doc[:50] for doc in example_texts]
pd.DataFrame(nmf_example, index=row_labels)

Unnamed: 0,0,1,2,3
"Basketball Training in Greenville, South Carolina",0.772661,1.588835e-09,1.367966e-09,0.0
"Personal Trainer | Nutrition Coach | Bolton, MA A",4.3e-05,0.0,0.0,1.12806
"Siding company in Millersburg, OH | Holmes Siding",0.0,1.714173e-09,1.073865,0.0
"Glass solutions in Devils Lake, ND | The Glass Sh",0.0,0.9722972,0.0,2.395137e-16


#### Exercise: Create NMF vectors
Using the TFIDF vectors, create NMF vectors for the website text data.

In [385]:
# likely better to use more than 4 components
n_components = 10
# seed the factorization so the topics are reproducible between runs
nmf = NMF(n_components=n_components, random_state=random_state)
nmf_vecs = nmf.fit_transform(tfidf_vecs)

In [386]:
# get_feature_names() was removed in scikit-learn 1.2: use get_feature_names_out()
# when the vectorizer has it, otherwise fall back to the old method
get_names = (getattr(tfidf_vectorizer, 'get_feature_names_out', None)
             or tfidf_vectorizer.get_feature_names)
display_components(nmf, get_names())

Topic 0:
marketing media solutions digital consulting
Topic 1:
cleaning carpet commercial clean janitorial
Topic 2:
insurance coverage agency auto quote
Topic 3:
menu food pm pizza catering
Topic 4:
massage spa hair salon skin
Topic 5:
repair auto car repairs equipment
Topic 6:
design products custom lawn construction
Topic 7:
care health dental dr patient
Topic 8:
law estate real legal attorney
Topic 9:
training fitness preschool life coaching


### Computing similarity


In [387]:
# taking inventory of the vectors we have, keyed by technique
vector_sets = dict(count=count_vecs,
                   tfidf=tfidf_vecs,
                   lsi=lsi_vecs,
                   nmf=nmf_vecs)
for name in vector_sets:
    print(name, 'shape:',  vector_sets[name].shape)

count shape: (1000, 4591)
tfidf shape: (1000, 4591)
lsi shape: (1000, 10)
nmf shape: (1000, 10)


In [388]:
# cosine similarity
# looking at our examples from above
print(example_texts)
print('tfidf shape:', tfidf_example.shape)
# pairwise similarity between every pair of example documents
example_sim = cosine_similarity(tfidf_example)
# truncate descriptions to 20 characters for use as row and column labels
trunc_example_texts = [doc[:20] for doc in example_texts.values]
pd.DataFrame(example_sim, index=trunc_example_texts, columns=trunc_example_texts)

812    Basketball Training in Greenville, South Carol...
558    Personal Trainer | Nutrition Coach | Bolton, M...
610    Siding company in Millersburg, OH | Holmes Sid...
946    Glass solutions in Devils Lake, ND  | The Glas...
Name: content, dtype: object
tfidf shape: (4, 4591)


Unnamed: 0,Basketball Training,Personal Trainer | N,Siding company in Mi,Glass solutions in D
Basketball Training,1.0,0.118283,0.029231,0.04373
Personal Trainer | N,0.118283,1.0,0.009903,0.004952
Siding company in Mi,0.029231,0.009903,1.0,0.050079
Glass solutions in D,0.04373,0.004952,0.050079,1.0


In [389]:
# have industry category for subset of businesses
# most frequent values of `type`; dropna=False keeps the missing (NaN) group in the counts
full_data.type.value_counts(dropna=False).head()

NaN                        77
Professional Services      53
Health and Fitness         49
Construction               40
Home & Home Improvement    40
Name: type, dtype: int64

#### Exercise: Which of the four techniques appears to work best?
For this more open-ended question, here's a suggestion for a workflow:

1) Take inventory of available vectorized data

2) Assess sources for "ground truth"

3) Determine a metric of performance for the techniques

4) Analyze and compare

In [395]:
# For each vector type: average the pairwise cosine similarities by industry and print
# where the chosen industry's similarity to itself ranks among its similarities to all industries
vector_sims = {}
ind = 'Home & Home Improvement'

for m in vector_sets:
    # compute cosine similarity
    vector_sim = cosine_similarity(vector_sets[m])
    # remove self-comparison, would automatically up-weight self category
    # (np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0)
    np.fill_diagonal(vector_sim, np.nan)
    vector_sims[m] = vector_sim
    s_df = pd.DataFrame(vector_sims[m],
            index=full_data.type,
            columns=full_data.type)
    # mean similarity per (row industry, column industry); the columns are grouped via a
    # transpose because groupby(axis=1) is deprecated in recent pandas
    t = s_df.T.groupby(level=0).mean().T.groupby(level=0).mean()
    # rank() is ascending, so a larger number means the industry is more similar to itself
    print(m, t.loc[ind].rank().loc[ind])

count 84.0
tfidf 85.0
lsi 82.0
nmf 79.0


In [398]:
# look at a case here: two sites from the same industry whose similarity is exactly zero
# Build the similarity frame explicitly from the NMF results (the last entry of vector_sets)
# rather than relying on whichever `s_df` the loop above happened to leave behind
s_df = pd.DataFrame(vector_sims['nmf'],
        index=full_data.type,
        columns=full_data.type)
# row positions of the documents belonging to the chosen industry
same_ind = np.where(s_df.index==ind)[0]
# (row, column) positions, within the industry block, of the zero-similarity pairs
zero_similarity = np.where(s_df.loc[ind, ind]==0)
if len(zero_similarity[0]) == 0:
    raise ValueError('no zero-similarity pair found within industry: {}'.format(ind))
# map the first such pair back to positions in the full dataset
same_x = same_ind[zero_similarity[0][0]]
same_y = same_ind[zero_similarity[1][0]]

In [399]:
# text of the two documents picked out above
pair_positions = [same_x, same_y]
full_data.content.iloc[pair_positions]

25     High-quality masonry work in Hilton, NY  | Ups...
237    Ames Clean Company Home About Us Services Clea...
Name: content, dtype: object