## In your own words: Computing customer similarity using website text data
#### Workshop developed for DSS Austin '19
#### By: Ben Batorsky [github](https://github.com/bpben)

### Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import re

In [None]:
# optional, can run exercise code without these
# uncomment system commands for colab notebook
#!pip install PyStemmer
import Stemmer
stemmer = Stemmer.Stemmer('english')
#!pip install spacy
import spacy
#!python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')
#!pip install gensim
from gensim.models.phrases import Phrases

In [None]:
# if you use colab, modify this
DATA_PATH = './data/'

### Data ingestion
The data for the workshop come from a random sample of business website text.

In [None]:
full_data = pd.read_csv(DATA_PATH+'website_text.csv')
w_content = full_data['content']

### Preprocessing


In [None]:
# capitalization: count tokens as written, then again after lowercasing
text = "We're Cowboys fans, but we're not cowboys"
for tokens in (text.split(), text.lower().split()):
    print(Counter(tokens))

In [None]:
# punctuation: delete every character that is not a letter, digit, or space
strip_punct = '[^A-Za-z0-9 ]'
text = "We're Cowboys fans, but we're not cowboys"
# show the raw text, then the stripped text, one per line
print(text, re.sub(strip_punct, '', text), sep='\n')

In [None]:
# numbers: delete every digit (other characters, such as the hyphen, remain)
strip_num = '[0-9]'
text = 'Call 867-5309'
# show the raw text, then the digit-free text, one per line
print(text, re.sub(strip_num, '', text), sep='\n')

In [None]:
# urls
# regex from textacy: https://github.com/chartbeat-labs/textacy
# NOTE(review): the name and the 2-12 character limit on the path segment
# suggest this targets shortened URLs rather than arbitrary ones - confirm
# before relying on it for URLs with long paths
SHORT_URL_REGEX = re.compile(
    # start of string, or not preceded by a word character, "/" or "."
    r"(?:^|(?<![\w/.]))"
    # optional scheme
    r"(?:(?:https?://)?)"
    # domain
    r"(?:\w-?)*?\w+(?:\.[a-z]{2,12}){1,3}"
    # one or more slashes between the domain and the path segment
    r"/+"
    # hash: 2-12 characters, excluding whitespace and . , ? ! ' " | +
    r"[^\s.,?!'\"|+]{2,12}"
    # end of string, or not followed by a word character or one of ? ! + & /
    r"(?:$|(?![\w?!+&/]))",
    flags=re.IGNORECASE)
text = 'Check out this conference: https://datascience.salon/austin/'
print(text)
# replace every match with the empty string
print(SHORT_URL_REGEX.sub('', text))

#### Exercise: Write your preprocessing script
Combine some of the regular expressions above (or write your own!) to process the text data.

In [1]:
def preprocess(text):
    """Clean the text of a single document (exercise stub).

    As written, ``text`` is returned unchanged; add your cleaning steps
    at the placeholder comment below.
    """
    ## code here
    return text

In [None]:
w_processed = w_content.apply(preprocess)

### From text to vectors


In [None]:
CountVectorizer()

In [None]:
# n-grams
# note: 1-letter words are dropped by default
text = ['I have a lovely bunch of coconuts']
for n in (1, 2, 3):
    # fit() returns the vectorizer itself, so `vec` is the fitted vectorizer
    vec = CountVectorizer(ngram_range=(1, n)).fit(text)
    print('1 to {}-grams: '.format(n), list(vec.vocabulary_))

In [None]:
# choosing cutoffs
# nine identical "coconuts" documents plus a single "pears" document
texts = ['I have a lovely bunch of coconuts'] * 9 + ['I have a lovely bunch of pears']
# (label to print, document-frequency cutoff passed to CountVectorizer)
cutoff_demos = [
    ('Default (no minimum): ', {}),
    ('Appear in >=20% of docs: ', {'min_df': .2}),
    ('Appear in <=10% of docs: ', {'max_df': .1}),
]
for label, cutoff in cutoff_demos:
    vec = CountVectorizer(**cutoff)
    print(label, list(vec.fit(texts).vocabulary_.keys()))

In [None]:
# stemming and lemmatizing
words = ['ponies', 'operation', 'are']
for w in words:
    print('Stem of {}: {}'.format(w, stemmer.stemWord(w)))
    print('Lemma of {}: {}'.format(w, nlp(w)[0].lemma_))
    print()

In [None]:
# entities with spaCy
text = "I'm a Cowboys fan, but I'm not a cowboy"
ents = nlp(text).ents
for e in ents:
    print(e, e.label_)

In [None]:
# entities with gensim - collocation (words that frequently occur together)
# This example is a bit artificial: real data is unlikely to be a handful of duplicated sentences
# Worth noting that, all likelihood being equal, gensim picks the first in a series of bigrams
# NOTE(review): export_phrases(sentences) is presumably the gensim 3.x call signature;
# gensim 4 appears to have changed it (find_phrases(sentences) looks like the
# replacement) - confirm against the installed gensim version
texts = ['have a lovely bunch of coconuts']
texts = texts*4
texts.append('have a lovely bunch of pears')
texts.append('have a lovely bunch of pears')
# Phrases takes tokenized sentences, so split each text on whitespace
split_texts = [x.split() for x in texts]
# same corpus and threshold; only min_count differs between the two models
phrases = Phrases(split_texts, min_count=1, threshold=1)
phrases_m2 = Phrases(split_texts, min_count=2, threshold=1)
# scores are based on https://radimrehurek.com/gensim/models/phrases.html#gensim.models.phrases.original_scorer
# higher = more likely to be a bigram
# export phrases for the last sentence only (the "pears" one)
print('with min count = 1:')
print([x for x in phrases.export_phrases([split_texts[-1]])])
print('with min count = 2')
print([x for x in phrases_m2.export_phrases([split_texts[-1]])])

In [None]:
# creating count vectors
text = "We are Cowboys fans, but we are not cowboys"
text_tagged = "We are CowboysORG fans, but we are not cowboys"
# lowercase both phrases: first the one without tags, then the one with tags
texts = [phrase.lower() for phrase in (text, text_tagged)]
# utility to display vectorizer
def display_vec(vec, data):
    """Return vectorizer output as a DataFrame with one column per vocabulary term.

    Parameters
    ----------
    vec : fitted vectorizer
        Must expose ``get_feature_names_out()`` or the older
        ``get_feature_names()``.
    data : matrix with a ``toarray()`` method
        The result of ``vec.transform`` / ``vec.fit_transform``.

    Returns
    -------
    pandas.DataFrame
        One row per document, one column per term.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on installs that predate it
    if hasattr(vec, 'get_feature_names_out'):
        columns = vec.get_feature_names_out()
    else:
        columns = vec.get_feature_names()
    return pd.DataFrame(data.toarray(), columns=columns)
# count vector
vec = CountVectorizer()
data = vec.fit_transform(texts)
print('count vectors \n', display_vec(vec, data))
# binary count vector
b_vec = CountVectorizer(binary=True)
data = b_vec.fit_transform(texts)
print('binary vectors \n', display_vec(b_vec, data))

#### Exercise: Remove the stopwords from the above texts
Use what we explored above to remove the stopwords from the count vectors of the following texts

In [None]:
texts = ['we are cowboys fans, but we are not cowboys',
 'we are cowboysorg fans, but we are not cowboys']

#### Exercise: Set limits on the vocabulary to remove potentially irrelevant words
With the following set of texts, set a limit to remove unimportant words like "Patriots"

In [None]:
texts = ['We are Cowboys fans',
        'We are cowboys',
        'We are Patriots fans']

In [None]:
# TF-IDF weighting
TfidfVectorizer()

In [None]:
texts = ['We are Cowboys fans',
         'We are Patriots fans']

In [None]:
# calculate term frequency
vec = CountVectorizer()
count_vectors = vec.fit_transform(texts)
count_df = display_vec(vec, count_vectors)
print(count_df)

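Formula for the inverse document frequency (IDF) weight of a term $t$, where $N$ is the number of documents and $df(t)$ is the number of documents that contain $t$:

$$\log\left(\frac{N}{df(t)}\right) + 1$$

The "smooth" option adds one to both the numerator and the denominator, which ensures no zero-divisions:

$$\log\left(\frac{N+1}{df(t)+1}\right) + 1$$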

In [None]:
# get inverse document frequency (smoothed): log((N + 1) / (df(t) + 1)) + 1
# N is the number of documents (rows of count_df), not a hard-coded constant
n_docs = count_df.shape[0]
# df(t) is the number of documents containing the term, not its total count
doc_freq = (count_df > 0).sum()
# kept under the name `df` because later cells use it; it holds the IDF weights
df = np.log((1 + n_docs) / (1 + doc_freq)) + 1
print(df)

In [None]:
# calculate tfidf: weight each term count by that term's IDF
tfidf_df = count_df*df


def _l2_normalize(row):
    # divide a document's vector by its Euclidean length
    return row/np.sqrt(row.dot(row))


# normalize: default in scikit-learn accounts for different-length documents
print(tfidf_df.apply(_l2_normalize, axis=1))

In [None]:
# now with scikit-learn
tfidf = TfidfVectorizer()
data = tfidf.fit_transform(texts)
print(display_vec(tfidf, data))

#### Exercise: Turn text to vectors
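Using what we've gone through above, create your own count vectorizer and TFIDF vectorizer.  Apply these vectorizers to the data, and store the result. Later cells expect `tfidf_vectorizer` to be fit, and expect the results to be stored as `count_vecs` and `tfidf_vecs`.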

In [None]:
vector_params = {'min_df': .005, 'max_df': .3, 'stop_words':'english'}

In [None]:
count_vectorizer = CountVectorizer(**vector_params)
tfidf_vectorizer = TfidfVectorizer(**vector_params)

### Matrix factorization and topic modelling

#### Latent Semantic Indexing
In scikit-learn this is implemented as TruncatedSVD, a version of SVD in which only the top k components are retained.


In [None]:
TruncatedSVD()

In [None]:
# a couple of example website texts
# chosen from two quite different industries
random_state = 9
example_inds = ['Health and Fitness', 'Home & Home Improvement']
# sample two rows per industry and collect their index labels
example_idxs = []
for industry in example_inds:
    in_industry = full_data[full_data['type'] == industry]
    sampled = in_industry.sample(n=2, random_state=random_state)
    example_idxs += sampled.index.tolist()
example_texts = w_content.loc[example_idxs]
example_texts

In [None]:
# LSI requires tfidf-weighted vectors, use from above
tfidf_example = tfidf_vectorizer.transform(example_texts)
# create display of examples
display_example = display_vec(tfidf_vectorizer, tfidf_example)
# for clarity, drop vocab that does not occur
display_example.loc[:, (display_example.sum(axis=0)>0).values]

In [None]:
def display_components(model, word_features, top_display=5):
    """Print the highest-weighted words of each component of a fitted model.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Each row of ``components_`` holds one component's weight per word.
    word_features : sequence
        Words, indexed the same way as the columns of ``components_``.
    top_display : int
        Number of words to print per component.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        # word positions ordered by weight, largest first
        ranked = weights.argsort()[::-1]
        top_words = []
        for word_idx in ranked[:top_display]:
            top_words.append(word_features[word_idx])
        print(" ".join(top_words))

In [None]:
# specify number of components
n_components = 4
svd = TruncatedSVD(n_components=n_components, random_state=random_state)
svd_example = svd.fit_transform(tfidf_example)
# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement and fall back only on installs that predate it
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()
display_components(svd, feature_names)

In [None]:
# for display
pd.DataFrame(svd_example,
             index=[t[:50] for t in example_texts])

#### Exercise: Create LSI vectors
Using the TFIDF vectors from above, create LSI vectors for the website text data. Store the result as `lsi_vecs`, which a later cell expects.

In [None]:
# likely better to use more than 4 components
n_components = 10

#### Non-negative matrix factorization

In [None]:
NMF()

In [None]:
print(example_texts)

In [None]:
# NMF also requires tfidf-weighted vectors
tfidf_example = tfidf_vectorizer.transform(example_texts)

In [None]:
# specify number of components
# with NMF, n_components must be <= number of documents
n_components = 4
# pass the shared seed (as the SVD example does) so repeated runs agree
nmf = NMF(n_components=n_components, random_state=random_state)
nmf_example = nmf.fit_transform(tfidf_example)
# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement and fall back only on installs that predate it
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()
display_components(nmf, feature_names)

In [None]:
pd.DataFrame(nmf_example,
             index=[t[:50] for t in example_texts])

#### Exercise: Create NMF vectors
Using the TFIDF vectors, create NMF vectors for the website text data. Store the result as `nmf_vecs`, which a later cell expects.

In [None]:
# likely better to use more than 4 components
n_components = 10

### Computing similarity


In [None]:
# taking inventory of the vectors we have
# (the four *_vecs variables are the results of the exercises above)
vector_sets = dict(count=count_vecs,
                   tfidf=tfidf_vecs,
                   lsi=lsi_vecs,
                   nmf=nmf_vecs)
for name, vectors in vector_sets.items():
    print(name, 'shape:', vectors.shape)

In [None]:
# cosine similarity
# looking at our examples from above
print(example_texts)
print('tfidf shape:', tfidf_example.shape)
example_sim = cosine_similarity(tfidf_example)
# truncate descriptions
trunc_example_texts = [x[:20] for x in example_texts.values]
pd.DataFrame(example_sim,
             index=trunc_example_texts,
             columns=trunc_example_texts)

In [None]:
# have industry category for subset of businesses
full_data.type.value_counts(dropna=False).head()

#### Exercise: Which of the four techniques appears to work best?
For this more open-ended question, here's a suggestion for a workflow:

1) Take inventory of available vectorized data

2) Assess sources for "ground truth"

3) Determine a metric of performance for the techniques

4) Analyze and compare