In [1]:
#! pip install gensim
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from nltk.tokenize import word_tokenize
from string import punctuation 
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
import gensim
from gensim.test.utils import get_tmpfile


In [2]:
def preprocessText(text):
            
        stops = set(stopwords.words("english"))
        #add more stopwords in this particular case
        stops.extend(['love','like','hate','amazing','favorite','dislike',"don't",'awesome','great','good','bad','horrible','excellent'])
        stemmer = SnowballStemmer('english')

        text = str(text).lower() # convert text to lower-case
        text = word_tokenize(text) # remove repeated characters (helloooooooo into hello)    
     
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words) # word stemmer
    
        tokenizer = RegexpTokenizer(r'\w+') #tokenize
        text = tokenizer.tokenize(text)
    
        stop_words = [word for word in text if word not in stops]
        text = " ".join(stop_words)
   
        return text

In [3]:
#Text preprocess 
df_review = pd.read_csv('flaskexample/data/fragrance_data.csv')
df_review['preprocess_text_tot'] = df_review['text_tot'].apply(lambda x: preprocessText(x))

# Use doc2vec

In [4]:
# build vocabuary using doc2vec
documents = df_review['preprocess_text_tot']
formatted_documents = [gensim.models.doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]

model = gensim.models.doc2vec.Doc2Vec(vector_size=200, min_count=2, epochs=200, seed=0, window=3, dm=1)
model.build_vocab(formatted_documents)

In [5]:
# train model
%time model.train(formatted_documents, total_examples=model.corpus_count, epochs=model.epochs)


CPU times: user 9min 26s, sys: 3.99 s, total: 9min 30s
Wall time: 4min 4s


In [6]:
model.save("flaskexample/models/doc2vec_model")
model = gensim.models.doc2vec.Doc2Vec.load("flaskexample/models/doc2vec_model")


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [7]:
# build doc2vec matrix
doctovec_feature_matrix = pd.DataFrame(model.docvecs.vectors_docs, index=df_review['name'])
print(doctovec_feature_matrix.shape)
doctovec_feature_matrix.head(3)


(255, 200)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1 Million,-0.190247,-0.220047,-0.540574,0.046998,-0.118783,0.163167,-0.016003,0.295755,0.31759,0.126865,...,-0.430949,0.459229,0.332098,-0.33204,-0.478645,0.07988,-0.018344,-0.133258,0.507208,0.48655
1 Million Lucky,-0.437599,0.42391,-0.445861,0.197835,0.278395,0.914936,0.995753,1.078198,-0.63399,-1.153638,...,-0.419501,-0.037868,0.210415,0.244514,-0.302211,0.36863,-0.070497,-0.409927,0.428931,0.205837
212,1.055334,1.164508,-1.165695,0.964227,-0.085889,-0.775081,-0.129363,0.309515,0.044691,0.003909,...,0.295341,0.346382,0.179234,0.293057,-0.417461,-0.390325,0.547509,0.277828,0.139696,0.474434


In [8]:
pickle.dump(doctovec_feature_matrix, open("flaskexample/models/doctovec_embeddings.pkl", "wb"))

# Use LSA, TF-IDF

In [10]:
#Use gensim
# create dictionary
from gensim import corpora, models
dictionary = corpora.Dictionary(df_review['preprocess_text_tot'].str.split())
dictionary.save('flaskexample/models/dictionary')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [11]:
#create corpus
corpus = [dictionary.doc2bow(text) for text in df_review['preprocess_text_tot'].str.split()]

In [12]:
#create TF-IDF vectors
tfidf = models.TfidfModel(corpus) # fit model
corpus_tfidf = tfidf[corpus]
tfidf.save('flaskexample/models/tfidf')  # same for tfidf, lda, ...

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [13]:
from gensim.models import LsiModel

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)
lsi.save('flaskexample/models/lsimodel') 

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [14]:
lsi = models.LsiModel.load('flaskexample/models/lsimodel')
lsi.show_topics(num_topics=2, num_words=10, log=False, formatted=True)


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


[(0,
  '-0.166*"cologn" + -0.124*"floral" + -0.107*"perfum" + -0.093*"man" + -0.092*"boyfriend" + -0.088*"sexi" + -0.087*"rollerbal" + -0.085*"vanilla" + -0.085*"wear" + -0.083*"flowerbomb"'),
 (1,
  '0.507*"cologn" + 0.266*"man" + 0.235*"boyfriend" + 0.173*"polo" + 0.156*"men" + 0.155*"husband" + 0.148*"homm" + 0.143*"masculin" + -0.123*"floral" + 0.116*"christma"')]

In [15]:
vector = lsi[corpus_tfidf]
lsivector = gensim.matutils.corpus2csc(vector)
lsivector = lsivector.T.toarray()
lsi_matrix = pd.DataFrame(lsivector, index=df_review['name'])
print(lsi_matrix.shape)
lsi_matrix.head(3)


(255, 100)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1 Million,-0.469914,0.429129,0.010879,0.005836,-0.01652,-0.057047,0.02734,0.016567,0.000308,0.060164,...,0.000788,0.015144,0.025692,0.02613,0.001525,-0.017042,-0.032255,-0.044291,-0.015056,0.032069
1 Million Lucky,-0.183438,0.180727,0.008531,-0.001544,-0.011745,-0.008588,0.014114,0.00679,-0.007634,0.014258,...,-0.146239,0.143173,-0.066091,-0.071018,-0.010466,-0.119574,0.102747,0.055805,-0.163496,0.044886
212,-0.152107,-0.024145,-0.002433,-0.015369,-0.027649,-0.00569,0.012132,-0.026884,0.011333,-0.008147,...,-0.040183,0.092698,-0.002633,-0.054884,0.071075,0.025962,-0.044496,0.103815,0.028886,-0.103164


In [16]:
# save the lsi matrix
pickle.dump(lsi_matrix, open("flaskexample/models/lsi_embeddings.pkl", "wb"))

In [17]:
#vecterization test
test = 'This is a test. I like a lot of flowers'
pre_test = preprocessText(test)
test_corpus = [dictionary.doc2bow(pre_test.split())]
test_corpus_tfidf = tfidf[test_corpus]
test_vector = lsi[test_corpus_tfidf]
test_vector = gensim.matutils.corpus2csc(test_vector)
test_vector = test_vector.T.toarray()
test_df = pd.DataFrame(test_vector)
test_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.072951,-0.052694,-0.004479,-0.014586,0.032088,0.01571,0.017012,-0.04619,-0.057551,0.016297,...,-0.017886,0.019205,0.05236,-0.015531,-0.018121,0.006101,-0.022618,0.034547,0.01316,0.065428
