In [1]:
#! pip install gensim
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from nltk.tokenize import word_tokenize
from string import punctuation 
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
import gensim
from gensim.test.utils import get_tmpfile


In [2]:
def preprocessText(text):
            
        stops = stopwords.words("english")
        #add more stopwords in this particular case
        stops.extend(['love','like','hate','amazing','favorite','dislike',"don't",'awesome','great','good','bad','horrible','excellent'])
        stemmer = SnowballStemmer('english')

        text = str(text).lower() # convert text to lower-case
        text = word_tokenize(text) # remove repeated characters (helloooooooo into hello)    
     
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words) # word stemmer
    
        tokenizer = RegexpTokenizer(r'\w+') #tokenize
        text = tokenizer.tokenize(text)
    
        stop_words = [word for word in text if word not in stops]
        text = " ".join(stop_words)
   
        return text

In [3]:
#Text preprocess 
df_review = pd.read_csv('flaskexample/data/fragrance_data.csv')
df_review['preprocess_text_tot'] = df_review['text_tot'].apply(lambda x: preprocessText(x))

In [4]:
df_review.head()

Unnamed: 0,name,product_id,ratings,description,product_image_url,review_text_combine,rating,brand,price,Fragrance Family,Type,Key Notes,About,text_tot,display,preprocess_text_tot
0,1 Million,P269120,4.74124,This high-impact fragrance creates a multiface...,https://www.sephora.com/productimages/sku/s120...,My favorite men’s fragrance. My husband was we...,5,Paco Rabanne,$70.00,,,,,My favorite men’s fragrance. My husband was ...,This high-impact fragrance creates a multiface...,favorit men fragranc husband wear night met he...
1,1 Million Lucky,P431565,4.76,Fragrance Family: Earthy & WoodyScent Type: Wa...,https://www.sephora.com/productimages/sku/s203...,"Everytime I spray this on in the morning, I'm ...",5,Paco Rabanne,$70.00,Earthy & Woody Scent,Warm Woods,"Hazelnut, Green Plum, Cedar","With unlimited potential and endless fun, lif...","Hazelnut, Green Plum, Cedar With unlimited p...","With unlimited potential and endless fun, lif...",hazelnut green plum cedar unlimit potenti endl...
2,212,P4437,4.559055,"A long-lasting, off beat, light floral fragran...",https://www.sephora.com/productimages/sku/s110...,its one of my favorites....I always get compli...,5,Carolina Herrera,$84.00,,,,,its one of my favorites....I always get comp...,"A long-lasting, off beat, light floral fragran...",one favorit alway get compliment wear highscho...
3,3 L’IMPERATRICE,P245901,4.62236,Succulent exotic fruits and bright pink floral...,https://www.sephora.com/productimages/sku/s120...,I love this scent. I found it a few years ago ...,5,DOLCE&GABBANA,$80.00,,,,,I love this scent. I found it a few years ag...,Succulent exotic fruits and bright pink floral...,scent found year ago fell perfect daytim wear ...
4,ALLURE HOMME SPORT EAU EXTRÊME Eau de Parfum,P377719,5.0,Fragrance Family: Earthy & WoodyScent Type: Wa...,https://www.sephora.com/productimages/sku/s141...,This is one of my favorites (besides Polo Red)...,5,CHANEL,$95.00,Earthy & Woody Scent,Warm Woods,"Mandarin, Cypress, Tonka Bean","Allure to the extreme. A powerful, dynamic, a...","Mandarin, Cypress, Tonka Bean Allure to the ...","Allure to the extreme. A powerful, dynamic, a...",mandarin cypress tonka bean allur extrem power...


# Use doc2vec

In [5]:
# build vocabuary using doc2vec
documents = df_review['preprocess_text_tot']
formatted_documents = [gensim.models.doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]

model = gensim.models.doc2vec.Doc2Vec(vector_size=200, min_count=2, epochs=200, seed=0, window=3, dm=1)
model.build_vocab(formatted_documents)

In [6]:
# train model
%time model.train(formatted_documents, total_examples=model.corpus_count, epochs=model.epochs)


CPU times: user 9min 24s, sys: 5.73 s, total: 9min 30s
Wall time: 4min 18s


In [7]:
model.save("flaskexample/models/doc2vec_model")
model = gensim.models.doc2vec.Doc2Vec.load("flaskexample/models/doc2vec_model")


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [8]:
# build doc2vec matrix
doctovec_feature_matrix = pd.DataFrame(model.docvecs.vectors_docs, index=df_review['name'])
print(doctovec_feature_matrix.shape)
doctovec_feature_matrix.head(3)


(255, 200)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1 Million,0.609882,-0.099592,-0.446978,-0.301711,-0.264819,-0.457173,-0.406461,0.794946,0.381589,-0.400898,...,-0.110958,0.479705,-0.141484,-0.663649,0.616857,0.133134,-0.145503,-0.832296,-0.032334,0.406526
1 Million Lucky,0.330852,0.225811,0.194002,0.243792,0.100231,-0.282537,-0.364011,0.339874,-0.193019,-0.076803,...,-0.564919,0.440399,-0.525062,-1.185361,0.682527,-0.186386,0.121757,-0.715621,-0.174773,0.099017
212,-0.564836,0.681433,-0.442044,0.256194,-0.652333,-0.099249,-0.184168,-0.324872,-0.549143,-0.446559,...,0.019913,-0.296466,0.034359,-0.311299,0.306722,-0.253562,-0.020502,-0.400058,-0.165856,0.080751


In [9]:
pickle.dump(doctovec_feature_matrix, open("flaskexample/models/doctovec_embeddings.pkl", "wb"))

# Use LSA, TF-IDF

In [10]:
#Use gensim
# create dictionary
from gensim import corpora, models
dictionary = corpora.Dictionary(df_review['preprocess_text_tot'].str.split())
dictionary.save('flaskexample/models/dictionary')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [11]:
#create corpus
corpus = [dictionary.doc2bow(text) for text in df_review['preprocess_text_tot'].str.split()]

In [12]:
#create TF-IDF vectors
tfidf = models.TfidfModel(corpus) # fit model
corpus_tfidf = tfidf[corpus]
tfidf.save('flaskexample/models/tfidf')  # same for tfidf, lda, ...

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [13]:
from gensim.models import LsiModel

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)
lsi.save('flaskexample/models/lsimodel') 

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [14]:
lsi = models.LsiModel.load('flaskexample/models/lsimodel')
lsi.show_topics(num_topics=2, num_words=10, log=False, formatted=True)


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


[(0,
  '-0.169*"cologn" + -0.110*"floral" + -0.108*"perfum" + -0.095*"man" + -0.094*"boyfriend" + -0.090*"sexi" + -0.088*"rollerbal" + -0.086*"vanilla" + -0.085*"flowerbomb" + -0.081*"chanel"'),
 (1,
  '0.507*"cologn" + 0.266*"man" + 0.235*"boyfriend" + 0.175*"polo" + 0.155*"husband" + 0.155*"men" + 0.151*"homm" + 0.144*"masculin" + 0.116*"christma" + 0.116*"dad"')]

In [15]:
vector = lsi[corpus_tfidf]
lsivector = gensim.matutils.corpus2csc(vector)
lsivector = lsivector.T.toarray()
lsi_matrix = pd.DataFrame(lsivector, index=df_review['name'])
print(lsi_matrix.shape)
lsi_matrix.head(3)


(255, 100)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1 Million,-0.467841,0.428517,0.009555,-0.003349,-0.016063,-0.056009,-0.033895,-0.010874,-0.003556,-0.065571,...,0.017304,-0.011877,0.004369,-0.020748,0.002405,-0.00798,-0.019678,-0.017812,0.007469,-0.000872
1 Million Lucky,-0.183082,0.179645,0.007961,0.002281,-0.011931,-0.007423,-0.014003,-0.005371,0.006768,-0.017325,...,-0.214369,0.145128,-0.051369,-0.051401,-0.129629,0.079632,-0.015159,0.064932,0.009753,0.080314
212,-0.150758,-0.025201,-0.003733,0.016565,-0.026173,-0.008756,-0.016384,0.017724,-0.021241,0.016391,...,-0.31569,0.016326,-0.055855,-0.010899,0.011396,-0.274093,0.093314,-0.240771,0.018558,0.027801


In [16]:
# save the lsi matrix
pickle.dump(lsi_matrix, open("flaskexample/models/lsi_embeddings.pkl", "wb"))

In [17]:
#vecterization test
test = 'This is a test. I like a lot of flowers'
pre_test = preprocessText(test)
test_corpus = [dictionary.doc2bow(pre_test.split())]
test_corpus_tfidf = tfidf[test_corpus]
test_vector = lsi[test_corpus_tfidf]
test_vector = gensim.matutils.corpus2csc(test_vector)
test_vector = test_vector.T.toarray()
test_df = pd.DataFrame(test_vector)
test_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.070583,-0.048474,-0.00368,0.010814,0.033683,0.015864,-0.028998,0.042152,0.039423,-0.007251,...,-0.027436,-0.048733,0.018072,0.00878,-0.024179,0.020852,-0.024534,-0.032178,-0.039751,0.001126
