In [31]:
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns
%matplotlib inline
import scipy
import matplotlib.pyplot as plt
import networkx as nx
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

In [46]:
df = pd.read_csv('/Users/aaronbroderick/Desktop/Data Science Folders/Data/FineFoods.csv')

In [47]:
w = list(df.ProductId.value_counts().index)
title = w[50]
product = df[df['ProductId'] == title]
product = product.reset_index(drop = True)
product = pd.DataFrame(product)
product.shape

(455, 10)

## Gap statistic function

In [34]:
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.datasets import make_blobs


def calc_inertia(ag, data):
    labels = ag.labels_
    
    W = 0
    for l in labels:
        Dl = 1-cosine_similarity(data[labels==l, :])
        Sd = 0.5 * np.sum(Dl**2) / Dl.shape[0]
        W = W + Sd
    return W
        

#This function always uses cosine distance, probably should augment this to take an arbitrary distance function
def optimalK(data, nrefs=3, maxClusters=15):
    """
    Calculates KMeans optimal K using Gap Statistic from Tibshirani, Walther, Hastie
    Params:
        data: ndarry of shape (n_samples, n_features), assumed to be tf-idf with unit-normalized rows
        nrefs: number of sample reference datasets to create
        maxClusters: Maximum number of clusters to test for
    Returns: (gaps, optimalK)
    """
    N = data.shape[0]
    maxClusters = min(maxClusters, max(2, N/3))
    gaps = np.zeros((len(range(1, maxClusters)),))
    resultsdf = pd.DataFrame({'clusterCount':[], 'gap':[]})
    for gap_index, k in enumerate(range(1, maxClusters)):

        # Holder for reference dispersion results
        refDisps = np.zeros(nrefs)

        # For n references, generate random sample and perform kmeans getting resulting dispersion of each loop
        for i in range(nrefs):
            
            # Create new random reference set
            randomReference = np.zeros(data.shape)
            for qq in range(0, data.shape[1]):
                randomReference[:, qq] = np.max(data[:, qq]) * np.random.sample(size=(data.shape[0],))
            norms = np.sqrt(np.sum(randomReference**2, axis=1))
            norms.shape = (len(norms), 1)
            norms = np.tile(norms, (1, data.shape[1]))
            randomReference = randomReference / norms #Normalize for consistency
            
            # Fit to it
            cos_dist = 1-cosine_similarity(randomReference)
            ag = AgglomerativeClustering(n_clusters=k)
            ag.fit(cos_dist)
            
            #Calculate the deviation
            refDisp = calc_inertia(ag, randomReference)
            refDisps[i] = refDisp

        # Fit cluster to original data and create dispersion
        cos_dist = 1-cosine_similarity(data)
        ag = AgglomerativeClustering(n_clusters=k)
        ag.fit(cos_dist)
        
        origDisp = calc_inertia(ag, cos_dist)

        # Calculate gap statistic
        gap = np.log(np.mean(refDisps)) - np.log(origDisp)

        # Assign this loop's gap statistic to gaps
        gaps[gap_index] = gap
        
        resultsdf = resultsdf.append({'clusterCount':k, 'gap':gap}, ignore_index=True)

    return (gaps.argmax() + 1, resultsdf)  # Plus 1 because index of 0 means 1 cluster is optimal, index 2 = 3 clusters are optimal

## Clean Text

In [48]:
## Delete stop words, remove all words except nouns.
## Delete some annoying hypertext

parser = spacy.load('en')
def parse_text(string):
    z = parser(string)
    important_words = [word for word in z if word.is_stop==False and  word.pos_=='NOUN']
    important_words = [str(word) for word in important_words]
    important_words = ' '.join(important_words)
    #important_words = important_words.replace('<br','')  
    #important_words = important_words.replace('br','')
    return important_words

product['parse'] = product.Text.apply(parse_text)

In [49]:
## Remove overly common nouns
common_words = ['coffee','cups', 'cup']

def clean(string):
    for i in range(len(common_words)):
        string = string.replace(common_words[i],'')
    return string

product['parse'] = product.parse.apply(clean)

## Determine the number of clusters to use using the gap statistic.  

In [50]:
vectorizer = TfidfVectorizer(stop_words='english')
product_tfidf = vectorizer.fit_transform(product['parse'])

In [51]:
##  Calculate the gap statistic to determine number of clusters
g, df = optimalK(product_tfidf, maxClusters = 10)
print (g)
print (df)

4
   clusterCount       gap
0           1.0  7.142907
1           2.0  6.758828
2           3.0  6.773918
3           4.0  7.275388
4           5.0  7.233081
5           6.0  7.094605
6           7.0  6.942918
7           8.0  7.147632
8           9.0  6.993363


## Create word lists for 4 clusters using LSA, LDA and NNMF

In [52]:
# Getting the word list.
terms = vectorizer.get_feature_names()

# Number of topics.
ntopics=4

# Linking words to topics
def word_topic(tfidf,solution, wordlist):
    
    # Loading scores for each word on each topic/component.
    words_by_topic=tfidf.T * solution
    # Linking the loadings to the words in an easy-to-read way.
    components=pd.DataFrame(words_by_topic,index=wordlist)
    
    return components

# Extracts the top N words and their loadings for each topic.
def top_words(components, n_top_words):
    n_topics = range(components.shape[1])
    index= np.repeat(n_topics, n_top_words, axis=0)
    topwords=pd.Series(index=index)
    for column in range(components.shape[1]):
        # Sort the column so that highest loadings are at the top.
        sortedwords=components.iloc[:,column].sort_values(ascending=False)
        # Choose the N highest loadings.
        chosen=sortedwords[:n_top_words]
        # Combine loading and index into a string.
        chosenlist=chosen.index +" "+round(chosen,2).map(str) 
        topwords.loc[column]=chosenlist
    return(topwords)

# Number of words to look at for each topic.
n_top_words = 10

In [53]:
# LSA

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

svd= TruncatedSVD(ntopics)
lsa = make_pipeline(svd, Normalizer(copy=False))
product_lsa = lsa.fit_transform(product_tfidf)

components_lsa = word_topic(product_tfidf, product_lsa, terms)

topwords=pd.DataFrame()
topwords['LSA']=top_words(components_lsa, n_top_words)

In [54]:
# LDA
from sklearn.decomposition import LatentDirichletAllocation as LDA

lda = LDA(n_components=ntopics, 
          doc_topic_prior=None, # Prior = 1/n_documents
          topic_word_prior=1/ntopics,
          learning_decay=0.7, # Convergence rate.
          learning_offset=10.0, # Causes earlier iterations to have less influence on the learning
          max_iter=10, # when to stop even if the model is not converging (to prevent running forever)
          evaluate_every=-1, # Do not evaluate perplexity, as it slows training time.
          mean_change_tol=0.001, # Stop updating the document topic distribution in the E-step when mean change is < tol
          max_doc_update_iter=100, # When to stop updating the document topic distribution in the E-step even if tol is not reached
          n_jobs=-1, # Use all available CPUs to speed up processing time.
          verbose=0, # amount of output to give while iterating
          random_state=0
         )

product_lda = lda.fit_transform(product_tfidf) 

components_lda = word_topic(product_tfidf, product_lda, terms)
topwords['LDA']=top_words(components_lda, n_top_words) 



In [55]:
# NNMF

from sklearn.decomposition import NMF

nmf = NMF(alpha=0.0, 
          init='nndsvdar', # how starting value are calculated
          l1_ratio=0.0, # Sets whether regularization is L2 (0), L1 (1), or a combination (values between 0 and 1)
          max_iter=200, # when to stop even if the model is not converging (to prevent running forever)
          n_components=ntopics, 
          random_state=0, 
          solver='cd', # Use Coordinate Descent to solve
          tol=0.0001, # model will stop if tfidf-WH <= tol
          verbose=0 # amount of output to give while iterating
         )
product_nmf = nmf.fit_transform(product_tfidf) 

components_nmf = word_topic(product_tfidf, product_nmf, terms)

topwords['NNMF']=top_words(components_nmf, n_top_words)

In [56]:
for topic in range(ntopics):
    print('Topic {}:'.format(topic))
    print(topwords.loc[topic])

Topic 0:
             LSA           LDA            NNMF
0   flavor 33.21   taste 11.25      flavor 6.4
0    taste 16.12  flavor 10.65       blend 1.0
0  morning 10.92  morning 5.38       time 0.92
0  coconut 10.33    aroma 5.33    morning 0.89
0     blend 9.07  vanilla 4.87    caramel 0.88
0      time 8.64  husband 4.69    coconut 0.85
0        br 8.05  caramel 4.42      aroma 0.84
0      roast 7.9    price 4.36      taste 0.79
0   product 7.89  brewing 3.78  favorites 0.79
0   vanilla 7.67   product 3.7    vanilla 0.77
Topic 1:
             LSA           LDA          NNMF
1    taste 18.46   flavor 7.96    taste 6.73
1        br 3.46  product 3.46   flavor 1.13
1   tasting 3.28    taste 3.34       br 0.98
1       day 2.88      lot 3.08  coconut 0.95
1   brewing 2.45  reviews 2.92  morning 0.76
1     decaf 2.14      try 2.68  brewing 0.72
1       way 2.09  morning 2.05  husband 0.61
1   coconut 2.08     wife 1.99    aroma 0.58
1   husband 2.02    blend 1.94  product 0.57
1  purchase 1.9

## Make a .csv to send to Plotly 

In [132]:
topwords = topwords.reset_index(drop = True)
topic1 = topwords[:10]
topic2 = topwords[10:20]
topic3 = topwords[20:30]
topic4 = topwords[30:40]

In [95]:
z = topic1.LSA[0]
z

'flavor 33.23'

In [96]:
def LSA_topics(string):
    x = string.split()
    return x[0],float(x[1])

In [110]:
words = []
score = []

for i in range(0,10):
    x, y = LSA_topics(topic1.LSA[i])
    words.append(x)
    score.append(y)

topic1_df = pd.DataFrame()
topic1_df['words'] = words
topic1_df['score'] = score

words = []
score = []

for i in range(10,20):
    x, y = LSA_topics(topic2.LSA[i])
    words.append(x)
    score.append(y)

topic2_df = pd.DataFrame()
topic2_df['words'] = words
topic2_df['score'] = score

words = []
score = []

for i in range(20,30):
    x, y = LSA_topics(topic3.LSA[i])
    words.append(x)
    score.append(y)

topic3_df = pd.DataFrame()
topic3_df['words'] = words
topic3_df['score'] = score

words = []
score = []

for i in range(30,40):
    x, y = LSA_topics(topic4.LSA[i])
    words.append(x)
    score.append(y)

topic4_df = pd.DataFrame()
topic4_df['words'] = words
topic4_df['score'] = score
topic4_df

Unnamed: 0,words,score
0,product,9.95
1,price,4.37
2,description,2.44
3,box,2.19
4,money,2.08
5,vanilla,2.05
6,decaf,1.88
7,br,1.52
8,lot,1.51
9,pods,1.44


In [111]:
topic1_df.to_csv('topic1.csv')
topic2_df.to_csv('topic2.csv')
topic3_df.to_csv('topic3.csv')
topic4_df.to_csv('topic4.csv')