In [30]:
# key libs
import numpy as np
import re
import nltk
import pandas as pd
import glob
import codecs

# nlp libs
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 

# processing
from sklearn.model_selection import train_test_split


# LDA
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
from sklearn.decomposition import LatentDirichletAllocation
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF


# bring in my pickled vectorizers
import pickle
import dill

# Libraries for displying the data. 
from IPython.core.display import HTML 
from ipywidgets import interact
from IPython.display import display
from IPython.display import Image

### 1. LOAD DATA

In [31]:
full_data = pd.read_csv('../data/model/recommender_data.csv')
full_data = full_data.drop(columns='Unnamed: 0')

In [32]:
print(full_data.shape)
full_data.head()

(35965, 13)


Unnamed: 0,image_link,product_name,page_link,label,background,delete,item_no,description,style,material,type_class,color,rating
0,https://ak1.ostkcdn.com/images/products/139907...,Carson Carrington Brandbu Mid-century Fold Dow...,https://www.overstock.com/Home-Garden/Carson-C...,sofa,0,0.0,20370480,"Inspired by 1950's decor, the Brandbu futon is...","Mid-Century Modern, Scandinavian","Foam, Linen, Wood","Futon Set, Sofa","Blue, Green, Grey, Pink, Yellow",4.1
1,https://ak1.ostkcdn.com/images/products/139907...,Carson Carrington Brandbu Mid-century Fold Dow...,https://www.overstock.com/Home-Garden/Carson-C...,sofa,0,0.0,20370480,"Inspired by 1950's decor, the Brandbu futon is...","Mid-Century Modern, Scandinavian","Foam, Linen, Wood","Futon Set, Sofa","Blue, Green, Grey, Pink, Yellow",4.1
2,https://ak1.ostkcdn.com/images/products/139907...,Carson Carrington Brandbu Mid-century Fold Dow...,https://www.overstock.com/Home-Garden/Carson-C...,sofa,0,0.0,20370480,"Inspired by 1950's decor, the Brandbu futon is...","Mid-Century Modern, Scandinavian","Foam, Linen, Wood","Futon Set, Sofa","Blue, Green, Grey, Pink, Yellow",4.1
3,https://ak1.ostkcdn.com/images/products/139907...,Carson Carrington Brandbu Mid-century Fold Dow...,https://www.overstock.com/Home-Garden/Carson-C...,sofa,0,0.0,20370480,"Inspired by 1950's decor, the Brandbu futon is...","Mid-Century Modern, Scandinavian","Foam, Linen, Wood","Futon Set, Sofa","Blue, Green, Grey, Pink, Yellow",4.1
4,https://ak1.ostkcdn.com/images/products/139907...,Carson Carrington Brandbu Mid-century Fold Dow...,https://www.overstock.com/Home-Garden/Carson-C...,sofa,0,0.0,20370480,"Inspired by 1950's decor, the Brandbu futon is...","Mid-Century Modern, Scandinavian","Foam, Linen, Wood","Futon Set, Sofa","Blue, Green, Grey, Pink, Yellow",4.1


In [33]:
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35965 entries, 0 to 35964
Data columns (total 13 columns):
image_link      35965 non-null object
product_name    35965 non-null object
page_link       35965 non-null object
label           35965 non-null object
background      35965 non-null int64
delete          35965 non-null float64
item_no         35965 non-null int64
description     35963 non-null object
style           34051 non-null object
material        35193 non-null object
type_class      35446 non-null object
color           34612 non-null object
rating          35965 non-null float64
dtypes: float64(2), int64(2), object(9)
memory usage: 3.6+ MB


### 2. PREP DATA FOR NLP

In [34]:
# make Nans into empty string 'description','style','type_class','color'
full_data.description = full_data.description.fillna('')
full_data['style'] = full_data['style'].fillna('')
full_data.type_class = full_data.type_class.fillna('')
full_data.color = full_data.color.fillna('')

In [35]:
full_data.iloc[1171,:]

image_link      https://ak1.ostkcdn.com/images/products/910917...
product_name    Serta Monaco Collection 77-inch Brown Leather ...
page_link       https://www.overstock.com/Home-Garden/Serta-Mo...
label                                                        sofa
background                                                      0
delete                                                          0
item_no                                                  16295308
description     Create a comfortable living room or den with t...
style                                                            
material                            Bonded Leather, Foam, Plastic
type_class                                                   Sofa
color                                                       Brown
rating                                                        3.6
Name: 1171, dtype: object

In [36]:
full_data['words'] = full_data[['product_name','description','style','type_class','color']].apply(lambda x: ' '.join(x),axis=1)

In [37]:
full_data['words'] = full_data['words'].str.strip()
full_data['words'] = full_data['words'].str.replace(',',' ')

In [38]:
full_data.head(2)

Unnamed: 0,image_link,product_name,page_link,label,background,delete,item_no,description,style,material,type_class,color,rating,words
0,https://ak1.ostkcdn.com/images/products/139907...,Carson Carrington Brandbu Mid-century Fold Dow...,https://www.overstock.com/Home-Garden/Carson-C...,sofa,0,0.0,20370480,"Inspired by 1950's decor, the Brandbu futon is...","Mid-Century Modern, Scandinavian","Foam, Linen, Wood","Futon Set, Sofa","Blue, Green, Grey, Pink, Yellow",4.1,Carson Carrington Brandbu Mid-century Fold Dow...
1,https://ak1.ostkcdn.com/images/products/139907...,Carson Carrington Brandbu Mid-century Fold Dow...,https://www.overstock.com/Home-Garden/Carson-C...,sofa,0,0.0,20370480,"Inspired by 1950's decor, the Brandbu futon is...","Mid-Century Modern, Scandinavian","Foam, Linen, Wood","Futon Set, Sofa","Blue, Green, Grey, Pink, Yellow",4.1,Carson Carrington Brandbu Mid-century Fold Dow...


In [39]:
nlp_headers = ['image_link','product_name','page_link','label','words']
df_nlp = full_data[nlp_headers]

In [40]:
print(df_nlp.shape)
df_nlp.head()

(35965, 5)


Unnamed: 0,image_link,product_name,page_link,label,words
0,https://ak1.ostkcdn.com/images/products/139907...,Carson Carrington Brandbu Mid-century Fold Dow...,https://www.overstock.com/Home-Garden/Carson-C...,sofa,Carson Carrington Brandbu Mid-century Fold Dow...
1,https://ak1.ostkcdn.com/images/products/139907...,Carson Carrington Brandbu Mid-century Fold Dow...,https://www.overstock.com/Home-Garden/Carson-C...,sofa,Carson Carrington Brandbu Mid-century Fold Dow...
2,https://ak1.ostkcdn.com/images/products/139907...,Carson Carrington Brandbu Mid-century Fold Dow...,https://www.overstock.com/Home-Garden/Carson-C...,sofa,Carson Carrington Brandbu Mid-century Fold Dow...
3,https://ak1.ostkcdn.com/images/products/139907...,Carson Carrington Brandbu Mid-century Fold Dow...,https://www.overstock.com/Home-Garden/Carson-C...,sofa,Carson Carrington Brandbu Mid-century Fold Dow...
4,https://ak1.ostkcdn.com/images/products/139907...,Carson Carrington Brandbu Mid-century Fold Dow...,https://www.overstock.com/Home-Garden/Carson-C...,sofa,Carson Carrington Brandbu Mid-century Fold Dow...


### 3. BAG OF WORDS 

In [41]:
### possible tweek change the stemmer and token pattern
# function to custom tokenize
from nltk import SnowballStemmer
from nltk import PorterStemmer
from nltk import LancasterStemmer
from nltk.stem import WordNetLemmatizer

# set up key parameters for vectorizer - ngrams - snowball or lemmatizer
n_gram = 3
 #change this to 'snow' or 'lemma' or 'porter'



def my_tokenizer(doc, tkpat=re.compile('\\b[a-z][a-z][a-z]+\\b')):
    from nltk import SnowballStemmer
    from nltk import PorterStemmer
    from nltk import LancasterStemmer
    from nltk.stem import WordNetLemmatizer
    stem_type = 'snow'
    snow = SnowballStemmer('english') # change stemmer here
    wordnet_lemmatizer = WordNetLemmatizer()
    porter = PorterStemmer(mode='NLTK_EXTENSIONS')
    if (stem_type == 'snow'):       
        return [snow.stem(token) for token in tkpat.findall(doc)]
    if (stem_type == 'lemma'):
        return [wordnet_lemmatizer.lemmatize(token) for token in tkpat.findall(doc)]
    if (stem_type == 'porter'):       
        return [porter.stem(token) for token in tkpat.findall(doc)]

In [42]:
# basic Count Vectorizer
tf_vectorizer = CountVectorizer(max_df=0.70, min_df=0.05,ngram_range=(1,n_gram),\
                                analyzer ='word',\
                                stop_words='english', tokenizer = my_tokenizer)

tf_vectorizer.fit(df_nlp['words'])

# tf_vector_test = tf_vectorizer.transform(corpus_test)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=None, min_df=0.05,
        ngram_range=(1, 3), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function my_tokenizer at 0x7fd975debb70>,
        vocabulary=None)

In [43]:
tf_vector = tf_vectorizer.transform(df_nlp['words'])

In [44]:
print(tf_vector.shape)

(35965, 212)


In [45]:
# save stuff the vectorizer
import dill 
dill.dump(tf_vectorizer, open('../data/model/nlp_tf_vectorizer', 'wb'))

### 4. TOPIC MODELLING

In [46]:
### helper functions

In [47]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print (f'Topic: {topic_idx}')
        print (" , ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [48]:
#function to display books for a topic
def find_topic_books(df,number):
    topics = list(df.columns)[3:]
    #print(topics)
    for t in topics:
        temp = df.sort_values(by=t,ascending=False)
        print('\n------'+t+'-------') 
        print(temp.iloc[0:number+1,0:2])
        #print(temp.iloc[0:number+1,1])
        #display(Image(url=recommend_image[i][0],width=244, height=244))
        

In [49]:
# set how many topics you want
topic_number = 7

In [50]:
# Run LDA 
lda_tf = LatentDirichletAllocation(learning_method= 'batch',n_components=topic_number, random_state=36, batch_size=128,n_jobs=-1)
lda_tf.fit(tf_vector)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=7, n_jobs=-1, n_topics=None, perp_tol=0.1,
             random_state=36, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [51]:
# transform vectors
lda_vector = lda_tf.transform(tf_vector)

In [52]:
# dill the lda
dill.dump(lda_tf, open('../data/model/nlp_lda','wb'))

In [53]:
print('------PERPLEXITY------')
print(lda_tf.perplexity(tf_vector))

------PERPLEXITY------
96.45804851356803


In [55]:
no_top_words = 10
tf_feature_names = tf_vectorizer.get_feature_names()
display_topics(lda_tf, tf_feature_names, no_top_words)

Topic: 0
centuri , mid , mid centuri , centuri modern , mid centuri modern , modern modern , modern modern contemporari , centuri modern modern , leg , grey
Topic: 1
sofa , contemporari , comfort , seat , grey , live , modern contemporari , cushion , leather , room
Topic: 2
tabl , cocktail , cocktail tabl , tabl coffe , tabl coffe tabl , coffe , accent , coffe tabl , accent tabl , cocktail tabl coffe
Topic: 3
tabl , coffe , coffe tabl , wood , brown , rustic , home , room , end , live
Topic: 4
ottoman , chair , pouf , armchair , contemporari , home , seat , blue , storag , comfort
Topic: 5
stand , consol , entertain , storag , media , contemporari , brown , shelv , modern contemporari , inch
Topic: 6
tabl , coffe , coffe tabl , contemporari , modern contemporari , glass , contemporari coffe , contemporari coffe tabl , modern contemporari coffe , black


In [56]:
# add back details
lda_df = pd.DataFrame(lda_vector, columns=['topic_'+ str(i)for i in range(1,topic_number+1)])
lda_df.insert(loc=0, column='label', value=full_data['label'])
lda_df.insert(loc=0, column='page_link', value=full_data['page_link'])
lda_df.insert(loc=0, column='product_name', value=full_data['product_name'])
lda_df.insert(loc=0, column='image_link', value=full_data['image_link'])
lda_df.head()

Unnamed: 0,image_link,product_name,page_link,label,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7
0,https://ak1.ostkcdn.com/images/products/139907...,Carson Carrington Brandbu Mid-century Fold Dow...,https://www.overstock.com/Home-Garden/Carson-C...,sofa,0.469838,0.367417,0.003979,0.003982,0.146828,0.003979,0.003977
1,https://ak1.ostkcdn.com/images/products/139907...,Carson Carrington Brandbu Mid-century Fold Dow...,https://www.overstock.com/Home-Garden/Carson-C...,sofa,0.469838,0.367417,0.003979,0.003982,0.146828,0.003979,0.003977
2,https://ak1.ostkcdn.com/images/products/139907...,Carson Carrington Brandbu Mid-century Fold Dow...,https://www.overstock.com/Home-Garden/Carson-C...,sofa,0.469838,0.367417,0.003979,0.003982,0.146828,0.003979,0.003977
3,https://ak1.ostkcdn.com/images/products/139907...,Carson Carrington Brandbu Mid-century Fold Dow...,https://www.overstock.com/Home-Garden/Carson-C...,sofa,0.469838,0.367417,0.003979,0.003982,0.146828,0.003979,0.003977
4,https://ak1.ostkcdn.com/images/products/139907...,Carson Carrington Brandbu Mid-century Fold Dow...,https://www.overstock.com/Home-Garden/Carson-C...,sofa,0.469838,0.367417,0.003979,0.003982,0.146828,0.003979,0.003977


In [57]:
# find documents for topic
no_documents = 5
find_topic_books(lda_df,no_documents)


------label-------
                                              image_link  \
10057  https://ak1.ostkcdn.com/images/products/177651...   
12028  https://ak1.ostkcdn.com/images/products/167488...   
12017  https://ak1.ostkcdn.com/images/products/162021...   
12018  https://ak1.ostkcdn.com/images/products/162021...   
12019  https://ak1.ostkcdn.com/images/products/162021...   
12020  https://ak1.ostkcdn.com/images/products/162021...   

                                            product_name  
10057  Porch & Den East Village Stuyvesant Open Shelv...  
12028  Harper Blvd Norwin Metal/Glass Corner TV Stand...  
12017             Inval Modern Espresso 60-inch TV Stand  
12018             Inval Modern Espresso 60-inch TV Stand  
12019             Inval Modern Espresso 60-inch TV Stand  
12020             Inval Modern Espresso 60-inch TV Stand  

------topic_1-------
                                             image_link  \
6373  https://ak1.ostkcdn.com/images/products/187069...   
6374  

In [58]:
# log transform and write to file
lda_df.loc[:,'topic_1':'topic_7'] = lda_df.loc[:,'topic_1':'topic_7'].apply(np.log)
lda_df.head()
lda_df.to_csv('../data/model/nlp_matrix.csv')

In [59]:
# setup for visualization
pyLDAvis.sklearn.prepare(lda_tf,tf_vector,tf_vectorizer)

In [116]:
# nlp_matrix

In [None]:
# word2vec

In [43]:
import gensim

In [46]:
#model_vec  = KeyedVectors.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin', binary=True)

NameError: name 'KeyedVectors' is not defined