# In your own words: Computing customer similarity using website text data
### Workshop developed for DSS Austin '19
### By: Ben Batorsky [github](https://github.com/bpben)


### Import necessary libraries

In [69]:
%matplotlib inline
import pandas as pd
import numpy as np
import spacy 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from collections import Counter
import re

In [331]:
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
import Stemmer
stemmer = Stemmer.Stemmer('english')

In [6]:
import spacy
nlp = spacy.load('en_core_web_md')

In [7]:
from gensim.models.phrases import Phrases

In [8]:
DATA_PATH = './data/'

### Data ingestion
The data for this workshop is website text from a random sample of businesses.

In [9]:
# NOTE(review): machine-specific absolute path -- this cell will only run where that
# directory exists; the sampled workshop data is re-read from DATA_PATH further down
w = pd.read_pickle('/Users/benjaminbatorsky/Documents/segmentation/data/website_parsed.pkl.gz')

In [347]:
from th_data_utilities import *

In [349]:
a = global_data_loader.read_sql('select * from views.v_get_accounts')

In [368]:
w_a = w.merge(a[['account_id', 'type']], on='account_id')

In [369]:
# Recode the catch-all 'Other*' category as missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
w_a.loc[w_a['type'] == 'Other*', 'type'] = np.nan

In [370]:
# Fixed seed: later cells pick example rows by index label (e.g. 723, 826),
# so the 1000-row sample has to be the same on every run.
w_a_sample = (
    w_a[['content', 'type']]
    .sample(1000, random_state=0)
    .reset_index(drop=True)
)

In [371]:
w_a_sample.to_pickle(DATA_PATH+'website_text.pkl')

In [374]:
full_data = pd.read_pickle(DATA_PATH+'website_text.pkl')

In [375]:
w_content = full_data['content']

### Preprocessing


In [14]:
# capitalization
text = 'Bill paid with a dollar bill'
print(Counter(text.split()))
print(Counter(text.lower().split()))

Counter({'Bill': 1, 'paid': 1, 'with': 1, 'a': 1, 'dollar': 1, 'bill': 1})
Counter({'bill': 2, 'paid': 1, 'with': 1, 'a': 1, 'dollar': 1})


In [16]:
# punctuation
#nlp('!').similarity(nlp('.'))
text = 'Happy birthday!'
strip_punct = '[^A-Za-z0-9 ]'
print(text)
print(re.sub(strip_punct, '', text))

Happy birthday!
Happy birthday


In [17]:
# numbers
text = 'Call 867-5309'
strip_num = '[0-9]'
print(text)
print(re.sub(strip_num, '', text))

Call 867-5309
Call -


In [18]:
# urls
# Pattern for "short" URLs: an optional http(s) scheme, a domain with one to
# three dot-separated suffixes, at least one slash, then a 2-12 character path
# token. A domain with nothing after the slash does not match.
SHORT_URL_REGEX = re.compile(
    # match must begin at the start of the text, or not directly after a word character, '/' or '.'
    r"(?:^|(?<![\w/.]))"
    # optional scheme
    r"(?:(?:https?://)?)"
    # domain
    r"(?:\w-?)*?\w+(?:\.[a-z]{2,12}){1,3}"
    # one or more slashes between the domain and the path token
    r"/+"
    # hash
    r"[^\s.,?!'\"|+]{2,12}"
    # match must end at the end of the text, or before anything other than a word character, ?, !, +, & or /
    r"(?:$|(?![\w?!+&/]))",
    flags=re.IGNORECASE)
text = 'Check out this conference: https://datascience.salon/austin/'
print(text)
# every match is replaced with the empty string
print(SHORT_URL_REGEX.sub('', text))

Check out this conference: https://datascience.salon/austin/
Check out this conference: 


#### Exercise: Write your preprocessing script
Combine some of the regex expressions (or write your own!) to process the text data

In [19]:
def preprocess(text):
    """Clean one raw website-text string before vectorization.

    Steps, in order: remove short URLs, remove digits, collapse whitespace,
    remove every remaining character that is not a letter, digit or space,
    and lowercase the result.

    Relies on the module-level patterns SHORT_URL_REGEX, strip_num and
    strip_punct.

    text: str
    Returns the cleaned str.
    """
    # url
    text = SHORT_URL_REGEX.sub('', text)
    # numbers
    text = re.sub(strip_num, '', text)
    # strip_punct only whitelists the literal space character, so newlines and
    # tabs would be deleted and the words on either side glued together;
    # turn every whitespace run into a single space first
    text = re.sub(r'\s+', ' ', text)
    # punctuation
    text = re.sub(strip_punct, '', text)
    # capitalization
    return text.lower()

In [20]:
w_processed = w_content.apply(preprocess)

### From text to vectors


In [21]:
CountVectorizer()

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [22]:
# n-grams
text = ['I have a lovely bunch of coconuts']
for n in range(1, 4):
    vec = CountVectorizer(ngram_range=(1, n))
    print('1 to {}-grams: '.format(n), list(vec.fit(text).vocabulary_.keys()))

1 to 1-grams:  ['have', 'lovely', 'bunch', 'of', 'coconuts']
1 to 2-grams:  ['have', 'lovely', 'bunch', 'of', 'coconuts', 'have lovely', 'lovely bunch', 'bunch of', 'of coconuts']
1 to 3-grams:  ['have', 'lovely', 'bunch', 'of', 'coconuts', 'have lovely', 'lovely bunch', 'bunch of', 'of coconuts', 'have lovely bunch', 'lovely bunch of', 'bunch of coconuts']


In [23]:
# choosing cutoffs
# corpus: nine identical "coconuts" documents plus one "pears" document
texts = ['I have a lovely bunch of coconuts']
texts = texts*9
texts.append('I have a lovely bunch of pears')
vec = CountVectorizer()
print('Default (no minimum): ', list(vec.fit(texts).vocabulary_.keys()))
# min_df as a float is a proportion of documents: lower cutoff at 10%
vec = CountVectorizer(min_df=.1)
print('Appear in >10% of docs: ', list(vec.fit(texts).vocabulary_.keys()))
# NOTE(review): max_df=.1 is an upper cutoff at 10% of documents, not 90% -- in the
# recorded output only 'pears' (1 of 10 documents) survives, so the printed label
# 'Appear in <90% of docs' looks wrong; confirm whether max_df=.9 or a 10% label was intended
vec = CountVectorizer(max_df=.1)
print('Appear in <90% of docs: ', list(vec.fit(texts).vocabulary_.keys()))

Default (no minimum):  ['have', 'lovely', 'bunch', 'of', 'coconuts', 'pears']
Appear in >10% of docs:  ['have', 'lovely', 'bunch', 'of', 'coconuts', 'pears']
Appear in <90% of docs:  ['pears']


In [24]:
# stemming and lemmatizing
words = ['ponies', 'operation', 'are']
# the loop variable is `word`, not `w`: `w` is the raw website DataFrame loaded
# during data ingestion and must not be overwritten by this demo
for word in words:
    print('Stem of {}: {}'.format(word, stemmer.stemWord(word)))
    print('Lemma of {}: {}'.format(word, nlp(word)[0].lemma_))
    print()

Stem of ponies: poni
Lemma of ponies: pony

Stem of operation: oper
Lemma of operation: operation

Stem of are: are
Lemma of are: be



In [25]:
# entities with spaCy
text = "I'm a Cowboys fan, but I'm not a cowboy"
ents = nlp(text).ents
for e in ents:
    print(e, e.label_)

Cowboys ORG


In [26]:
# entities with gensim - co-location
# This example is a bit odd: Likely not dealing with a bunch of duplicates
# Worth noting that, all likelihood being equal, gensim picks the first in a series of bigrams
texts = ['have a lovely bunch of coconuts']
texts = texts*4
texts.append('have a lovely bunch of pears')
texts.append('have a lovely bunch of pears')
split_texts = [x.split() for x in texts]
phrases = Phrases(split_texts, min_count=1, threshold=1)
list(phrases[split_texts])
#[x for x in phrases.export_phrases([split_texts[0]])]

[['have_a', 'lovely_bunch', 'of_coconuts'],
 ['have_a', 'lovely_bunch', 'of_coconuts'],
 ['have_a', 'lovely_bunch', 'of_coconuts'],
 ['have_a', 'lovely_bunch', 'of_coconuts'],
 ['have_a', 'lovely_bunch', 'of_pears'],
 ['have_a', 'lovely_bunch', 'of_pears']]

In [27]:
texts = ['I like ice cream', 'I will buy you ice cream', 'Go to the ice rink', 'I will buy you ice cream',]
split_texts = [x.split() for x in texts]
phrases = Phrases(split_texts, min_count=2, threshold=1)
bigram_sentences = phrases[split_texts]
[x for x in bigram_sentences]

[['I', 'like', 'ice_cream'],
 ['I', 'will', 'buy', 'you', 'ice_cream'],
 ['Go', 'to', 'the', 'ice', 'rink'],
 ['I', 'will', 'buy', 'you', 'ice_cream']]

In [28]:
# scores are based on https://radimrehurek.com/gensim/models/phrases.html#gensim.models.phrases.original_scorer
# higher = more likely to be a bigram
[x for x in phrases.export_phrases([split_texts[-1]])]

[(b'ice cream', 1.8333333333333333)]

In [29]:
# stopwords
#from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
#print(ENGLISH_STOP_WORDS)
text = 'I have a lovely bunch of coconuts'
vec = CountVectorizer(stop_words='english')
print('With stopwords: ', text.split())
print('Without stopwords: ', list(vec.fit([text]).vocabulary_.keys()))

With stopwords:  ['I', 'have', 'a', 'lovely', 'bunch', 'of', 'coconuts']
Without stopwords:  ['lovely', 'bunch', 'coconuts']


In [30]:
# creating count vectors
texts = []
text = "We are Cowboys fans, but we are not cowboys"
text_tagged = "We are CowboysORG fans, but we are not cowboys"
# phrase without tags, lowercase
texts.append(text.lower())
# phrase with tags, lowercase
texts.append(text_tagged.lower())
# utility to display vectorizer
def display_vec(vec, data):
    """Show vectorized documents as a DataFrame with one column per vocabulary term.

    vec: fitted vectorizer exposing get_feature_names_out() (scikit-learn >= 1.0)
        or the older get_feature_names().
    data: matrix produced by the vectorizer; must support .toarray().
    Returns a pandas DataFrame with one row per document.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back so older installs keep working
    if hasattr(vec, 'get_feature_names_out'):
        columns = vec.get_feature_names_out()
    else:
        columns = vec.get_feature_names()
    return pd.DataFrame(data.toarray(), columns=columns)
# count vector
vec = CountVectorizer()
data = vec.fit_transform(texts)
print('count vectors \n', display_vec(vec, data))
# binary count vector
b_vec = CountVectorizer(binary=True)
data = b_vec.fit_transform(texts)
print('binary vectors \n', display_vec(b_vec, data))

count vectors 
    are  but  cowboys  cowboysorg  fans  not  we
0    2    1        2           0     1    1   2
1    2    1        1           1     1    1   2
binary vectors 
    are  but  cowboys  cowboysorg  fans  not  we
0    1    1        1           0     1    1   1
1    1    1        1           1     1    1   1


#### Exercise: Remove the stopwords from the above texts
Use what we explored above to remove the stopwords from the count vectors of the following texts

In [31]:
texts = ['we are cowboys fans, but we are not cowboys',
 'we are cowboysorg fans, but we are not cowboys']

nostop_vec = CountVectorizer(stop_words='english')
data = nostop_vec.fit_transform(texts)
print('count vectors without stopwords \n', display_vec(nostop_vec, data))

count vectors without stopwords 
    cowboys  cowboysorg  fans
0        2           0     1
1        1           1     1


#### Exercise: Set limits on the vocabulary to remove potentially irrelevant words
With the following set of texts, set a limit to remove unimportant words like "Patriots"

In [32]:
texts = ['We are Cowboys fans',
        'We are cowboys',
        'We are Patriots fans']
# lowercase explicitly so 'Cowboys' and 'cowboys' are counted as one term
texts_lower = [t.lower() for t in texts]

# min_df=2 keeps only terms that occur in at least two documents
limit_vec = CountVectorizer(min_df=2)
data = limit_vec.fit_transform(texts_lower)
print('Count vector with vocab limit \n', display_vec(limit_vec, data))

Count vector with vocab limit 
    are  cowboys  fans  we
0    1        1     1   1
1    1        1     0   1
2    1        0     1   1


In [33]:
# TF-IDF weighting
TfidfVectorizer()

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [34]:
texts = ['We are Cowboys fans',
         'We are Patriots fans']

In [41]:
# calculate term frequency
vec = CountVectorizer()
count_vectors = vec.fit_transform(texts)
count_df = display_vec(vec, count_vectors)
print(count_df)

   are  cowboys  fans  patriots  we
0    1        1     1         0   1
1    1        0     1         1   1


Formula for inverse document frequency weight:

$$log(\frac{N}{df(t)}) + 1$$

"smooth" option ensures no zero-divisions:

$$log(\frac{N+1}{df(t)+1}) + 1$$

In [509]:
nlp('doing')[0].lemma_

'do'

In [54]:
# smoothed inverse document frequency: log((N + 1) / (df(t) + 1)) + 1
# (the result keeps the name `df` because the next cell multiplies by it)
n_docs = len(count_df)
# document frequency is the number of documents containing the term, not the
# total term count, so binarize before summing
doc_freq = (count_df > 0).sum()
df = np.log((n_docs + 1) / (doc_freq + 1)) + 1
print(df)

are         1.000000
cowboys     1.405465
fans        1.000000
patriots    1.405465
we          1.000000
dtype: float64


In [57]:
# calculate tfidf
tfidf_df = count_df*df
# normalize
print(tfidf_df.apply(
    lambda x: x/np.sqrt(x.dot(x)), axis=1))

        are   cowboys      fans  patriots        we
0  0.448321  0.630099  0.448321  0.000000  0.448321
1  0.448321  0.000000  0.448321  0.630099  0.448321


In [58]:
# now with scikit-learn
tfidf = TfidfVectorizer()
data = tfidf.fit_transform(texts)
print(display_vec(tfidf, data))

        are   cowboys      fans  patriots        we
0  0.448321  0.630099  0.448321  0.000000  0.448321
1  0.448321  0.000000  0.448321  0.630099  0.448321


In [None]:
# TF-IDF weighted count vectors

#### Exercise: Turn text to vectors
Using what we've gone through above, create your own count vectorizer and TFIDF vectorizer.  Apply these vectorizers to the data, and store the result.

In [61]:
vector_params = {'min_df': .005, 'max_df': .3, 'stop_words':'english'}

In [67]:
count_vectorizer = CountVectorizer(**vector_params)
tfidf_vectorizer = TfidfVectorizer(**vector_params)

In [68]:
# NOTE(review): both vectorizers are fit on the raw w_content; the cleaned
# w_processed series produced with preprocess() is not used here -- confirm this is intended
count_vecs = count_vectorizer.fit_transform(w_content)
tfidf_vecs = tfidf_vectorizer.fit_transform(w_content)

### Matrix factorization and topic modelling

#### Latent Semantic Indexing
In scikit-learn this is implemented as TruncatedSVD, a version of SVD in which only the top k components (those with the largest singular values) are retained


In [73]:
TruncatedSVD()

TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=None, tol=0.0)

In [None]:
n_topics_s = 10
mparams = {'n_components':n_topics_s,  'init':'nndsvd'}


In [264]:
# couple example website text
# these come from the main data: indices = 723, 826, 900, 974
idxs = [723, 826, 900, 974]
example_texts = w_content.loc[idxs]
# LSI requires tfidf-weighted vectors, use from above
tfidf_example = tfidf_vectorizer.transform(example_texts)
# create display of examples
display_example = display_vec(tfidf_vectorizer, tfidf_example)
# for clarity, drop vocab that does not occur
display_example.loc[:, (display_example.sum(axis=0)>0).values]

Unnamed: 0,11am,12,13,24,30,40,573,607,614,660,...,want,way,welcome,wellbeing,wings,winter,world,wraps,wrong,york
0,0.0,0.0,0.033591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.021712,0.0,0.0,0.094364,0.03339,0.049004,0.0,0.0,0.0
1,0.148714,0.088588,0.0,0.0,0.0,0.0,0.213669,0.0,0.206162,0.0,...,0.031885,0.0,0.095301,0.076279,0.148714,0.0,0.0,0.068721,0.0,0.0
2,0.0,0.0,0.0,0.034342,0.030106,0.0,0.0,0.10143,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040288
3,0.0,0.0,0.0,0.0,0.0,0.058597,0.0,0.0,0.0,0.082193,...,0.0,0.0,0.037349,0.0,0.0,0.0,0.0,0.0,0.152498,0.0


In [547]:
def display_components(model, word_features, top_display=5):
    """Print the highest-weighted words of every component of a fitted model.

    model: object with a components_ array (one row per topic, one weight per word)
    word_features: sequence mapping a weight's position to its word
    top_display: number of words printed per topic
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic {:d}:".format(topic_idx))
        # ascending argsort, reversed: word positions from heaviest to lightest
        ranked = weights.argsort()[::-1]
        print(" ".join(word_features[i] for i in ranked[:top_display]))

In [574]:
# specify number of components
n_components = 4
svd = TruncatedSVD(n_components=n_components)
svd_example = svd.fit_transform(tfidf_example)
display_components(svd, tfidf_vectorizer.get_feature_names())

Topic 0:
pizza columbia brick wings tasty
Topic 1:
brake repair trucks automotive fleet
Topic 2:
trucks fleet towing heavy duty
Topic 3:
columbia events sunrise 573 614


In [289]:
for i, t in enumerate(example_texts):
    print(t[:50], svd_example[i] )

Old-Fashioned Brick Oven Pizza in Adrian, MI | Piz [ 0.81355706 -0.1577552   0.09633749 -0.55132323]
Pizza and Wings Delivery Columbia MO Kostaki's Piz [ 0.82427545 -0.09087286 -0.04062145  0.55737062]
Truck, trailer, and auto repair | Sonny’s Service  [0.10128071 0.75363889 0.64855891 0.03379328]
Auto Repair, Kirksville, MO Kirksville Brake & Muf [ 0.17148069  0.74013027 -0.64484981 -0.08348823]


#### Exercise: Create LSI vectors
Using the TFIDF vectors from above, create LSI vectors for the website text data.

In [301]:
# likely better to use more than 5 components
n_components = 10
lsi = TruncatedSVD(n_components=n_components)
lsi_vecs = lsi.fit_transform(tfidf_vecs)

In [516]:
tfidf_vecs

<1000x4891 sparse matrix of type '<class 'numpy.float64'>'
	with 106980 stored elements in Compressed Sparse Row format>

In [302]:
display_components(lsi, tfidf_vectorizer.get_feature_names())

Topic 0:
design care 00 insurance repair
Topic 1:
insurance law financial health estate
Topic 2:
insurance law estate painting legal
Topic 3:
insurance repair auto car care
Topic 4:
insurance design interior marketing portfolio
Topic 5:
jpg squarespace static1 static https
Topic 6:
jpg static1 static squarespace insurance
Topic 7:
repair marketing car auto financial
Topic 8:
cleaning marketing catering menu financial
Topic 9:
00 cleaning financial tax pm


#### Non-negative matrix factorization

In [308]:
NMF()

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=None, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [309]:
print(example_texts)

723    Old-Fashioned Brick Oven Pizza in Adrian, MI |...
826    Pizza and Wings Delivery Columbia MO Kostaki's...
900    Truck, trailer, and auto repair | Sonny’s Serv...
974    Auto Repair, Kirksville, MO Kirksville Brake &...
Name: content, dtype: object


In [311]:
# NMF also requires tfidf-weighted vectors
tfidf_example = tfidf_vectorizer.transform(example_texts)

In [321]:
# specify number of components
# with NMF, n_components must be <= number of documents
n_components = 4
nmf = NMF(n_components=n_components)
nmf_example = nmf.fit_transform(tfidf_example)
display_components(nmf, tfidf_vectorizer.get_feature_names())

Topic 0:
pizza brick baked old cooking
Topic 1:
brake mo repair car wrong
Topic 2:
trucks fleet towing heavy automotive
Topic 3:
columbia pizza events 573 sunrise


In [323]:
for i, t in enumerate(example_texts):
    print(t[:50], nmf_example[i] )

Old-Fashioned Brick Oven Pizza in Adrian, MI | Piz [8.62522139e-01 1.58644770e-11 2.90992053e-09 0.00000000e+00]
Pizza and Wings Delivery Columbia MO Kostaki's Piz [3.94129144e-04 0.00000000e+00 0.00000000e+00 1.01225434e+00]
Truck, trailer, and auto repair | Sonny’s Service  [0.00000000e+00 1.68115685e-15 1.05162122e+00 0.00000000e+00]
Auto Repair, Kirksville, MO Kirksville Brake & Muf [0.00000000e+00 8.94778284e-01 1.22455262e-16 4.17916518e-17]


#### Exercise: Create NMF vectors
Using the TFIDF vectors, create NMF vectors for the website text data.

In [324]:
# likely better to use more than 5 components
n_components = 10
nmf = NMF(n_components=n_components)
nmf_vecs = nmf.fit_transform(tfidf_vecs)

In [325]:
display_components(nmf, tfidf_vectorizer.get_feature_names())

Topic 0:
menu catering food pizza restaurant
Topic 1:
insurance financial agency coverage quote
Topic 2:
painting tree residential commercial landscape
Topic 3:
law estate legal real attorney
Topic 4:
design marketing interior portfolio interiors
Topic 5:
care health life training center
Topic 6:
jpg static static1 squarespace https
Topic 7:
repair car auto repairs heating
Topic 8:
00 pm hair spa salon
Topic 9:
cleaning window windows boston roofing


### Computing similarity


In [330]:
# taking inventory of the vectors we have
vector_sets = {'count':count_vecs,
               'tfidf':tfidf_vecs,
               'lsi':lsi_vecs,
               'nmf':nmf_vecs}
for k, v in vector_sets.items():
    print(k, 'shape:',  v.shape)

count shape: (1000, 4891)
tfidf shape: (1000, 4891)
lsi shape: (1000, 10)
nmf shape: (1000, 10)


In [343]:
# cosine similarity
# looking at our examples from above
print(example_texts)
print('tfidf shape:', tfidf_example.shape)
example_sim = cosine_similarity(tfidf_example)
# truncate descriptions
trunc_example_texts = [x[:20] for x in example_texts.values]
pd.DataFrame(example_sim,
             index=trunc_example_texts,
             columns=trunc_example_texts)

723    Old-Fashioned Brick Oven Pizza in Adrian, MI |...
826    Pizza and Wings Delivery Columbia MO Kostaki's...
900    Truck, trailer, and auto repair | Sonny’s Serv...
974    Auto Repair, Kirksville, MO Kirksville Brake &...
Name: content, dtype: object
tfidf shape: (4, 4891)


Unnamed: 0,Old-Fashioned Brick,Pizza and Wings Deli,"Truck, trailer, and","Auto Repair, Kirksvi"
Old-Fashioned Brick,1.0,0.373726,0.007357,0.006656
Pizza and Wings Deli,0.373726,1.0,0.007488,0.05375
"Truck, trailer, and",0.007357,0.007488,1.0,0.154114
"Auto Repair, Kirksvi",0.006656,0.05375,0.154114,1.0


In [407]:
# have industry category for subset of businesses
full_data.type.value_counts(dropna=False).head()
#k = 5
#top_k_industries = full_data['type'].value_counts().head(n=5)
#print(top_k_industries)

NaN                        77
Professional Services      53
Health and Fitness         49
Home & Home Improvement    40
Construction               40
Name: type, dtype: int64

#### Exercise: Which of the four techniques appears to work best?
For this more open-ended question, here's a suggestion for a workflow:

1) Take inventory of available vectorized data

2) Assess sources for "ground truth"

3) Determine a metric of performance for the techniques

4) Analyze and compare

In [490]:
vector_sims = {}
ind = 'Construction'

# For each vector representation: average the pairwise cosine similarity
# between industry categories, then print where the chosen industry's
# similarity to itself ranks among its similarities to all categories
# (rank is ascending, so a higher number means self-similarity is larger).
for m in vector_sets:
    vector_sim = cosine_similarity(vector_sets[m])
    # remove self-comparison, would automatically up-weight self category
    # (np.nan: the np.NaN alias was removed in NumPy 2.0)
    np.fill_diagonal(vector_sim, np.nan)
    vector_sims[m] = vector_sim
    s_df = pd.DataFrame(vector_sim,
            index=full_data['type'],
            columns=full_data['type'])
    # mean over the columns of each category, then over the rows of each
    # category; the transpose replaces groupby(axis=1), deprecated in pandas 2.1+
    t = s_df.T.groupby(level=0).mean().T.groupby(level=0).mean()
    print(m, t.loc[ind].rank().loc[ind])

count 21.0
tfidf 26.0
lsi 51.0
nmf 52.0
