In [1]:
#! pip install gensim
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from nltk.tokenize import word_tokenize
from string import punctuation 
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
import gensim
from gensim.test.utils import get_tmpfile


In [2]:
def preprocessText(text):
            
        stops = stopwords.words("english")
        #add more stopwords in this particular case
        stops.extend(['love','like','hate','amazing','favorite','dislike',"don't",'awesome','great','good','bad','horrible','excellent'])
        stemmer = SnowballStemmer('english')

        text = str(text).lower() # convert text to lower-case
        text = word_tokenize(text) # remove repeated characters (helloooooooo into hello)    
     
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words) # word stemmer
    
        tokenizer = RegexpTokenizer(r'\w+') #tokenize
        text = tokenizer.tokenize(text)
    
        stop_words = [word for word in text if word not in stops]
        text = " ".join(stop_words)
   
        return text

In [3]:
#Text preprocess 
df_review = pd.read_csv('flaskexample/data/fragrance_data.csv')
df_review['preprocess_text_tot'] = df_review['text_tot'].apply(lambda x: preprocessText(x))

# Use doc2vec

In [4]:
# build vocabuary using doc2vec
documents = df_review['preprocess_text_tot']
formatted_documents = [gensim.models.doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]

model = gensim.models.doc2vec.Doc2Vec(vector_size=200, min_count=3, epochs=200, window=10, dm=1,workers=1,alpha = 0.01)
model.build_vocab(formatted_documents)

In [5]:
# train model
%time model.train(formatted_documents, total_examples=model.corpus_count, epochs=model.epochs)


CPU times: user 7min 47s, sys: 2.74 s, total: 7min 50s
Wall time: 7min 48s


In [6]:
#test
inputmessage = "I love sea"
vector = model.infer_vector(inputmessage.split(" "),epochs=10)
vector

array([ 9.2984090e-04,  2.1324467e-03, -4.1767088e-04,  8.2117837e-04,
       -2.8487531e-04, -1.1723568e-03, -1.8401407e-03,  1.0269838e-03,
        1.8920006e-03, -4.0799665e-04,  2.4522243e-03, -3.1235043e-04,
       -4.0117686e-04,  6.5788877e-04,  3.5301846e-04,  6.9271051e-04,
        2.3944173e-03, -1.0596224e-03,  1.1843473e-03,  2.2393286e-03,
       -1.1506575e-03,  1.4495827e-03,  2.3033267e-03, -2.4168526e-03,
       -2.3579563e-03, -6.9102767e-04, -2.4613319e-03, -1.4113291e-03,
        8.8263070e-04, -1.5255756e-03,  1.0207603e-03, -1.3480903e-04,
        9.4947807e-04, -7.3342249e-05,  1.3557486e-03,  1.5164035e-03,
       -7.2662788e-04, -1.4728388e-03, -2.0321498e-03,  1.5249919e-03,
        1.7045502e-03,  2.7438879e-04, -1.6036112e-03,  5.0542637e-04,
        1.0619115e-03,  1.4562886e-03, -1.3032815e-03, -6.4416870e-04,
       -1.5000931e-03,  4.9698958e-04,  2.3950429e-03, -1.8324794e-03,
        1.7780322e-03,  1.8322080e-03, -1.5263227e-03,  1.3336759e-03,
      

In [7]:
model.save("flaskexample/models/doc2vec_model")
model = gensim.models.doc2vec.Doc2Vec.load("flaskexample/models/doc2vec_model")


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [8]:
# build doc2vec matrix
doctovec_feature_matrix = pd.DataFrame(model.docvecs.vectors_docs, index=df_review['name'])
print(doctovec_feature_matrix.shape)
doctovec_feature_matrix.head(3)


(255, 200)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1 Million,-0.285015,-0.781768,-0.283619,0.630631,0.439969,-0.453569,-0.694097,-0.83071,-1.28773,1.053587,...,0.243208,-0.663966,-0.631537,0.560223,0.334696,-1.47305,1.224162,0.442532,0.185852,-1.226574
1 Million Lucky,-0.2456,-0.884978,-0.145119,-0.323488,0.276236,-0.401541,0.688834,0.099906,-0.050438,0.055179,...,0.547446,-0.572283,-0.269162,0.547626,0.530881,0.137398,-0.320326,0.276737,0.68857,0.170296
212,-1.033316,-0.204336,-1.532184,-0.504835,-1.135732,-0.300239,-1.480776,0.50266,-0.469252,0.821148,...,-0.177869,0.86982,0.222755,0.640796,-0.837944,0.282531,-1.186029,-0.216785,-1.835332,-1.839382


In [9]:
pickle.dump(doctovec_feature_matrix, open("flaskexample/models/doctovec_embeddings.pkl", "wb"))

# Use LSA, TF-IDF

In [10]:
#Use gensim
# create dictionary
from gensim import corpora, models
dictionary = corpora.Dictionary(df_review['preprocess_text_tot'].str.split())
dictionary.save('flaskexample/models/dictionary')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [11]:
#create corpus
corpus = [dictionary.doc2bow(text) for text in df_review['preprocess_text_tot'].str.split()]

In [12]:
#create TF-IDF vectors
tfidf = models.TfidfModel(corpus) # fit model
corpus_tfidf = tfidf[corpus]
tfidf.save('flaskexample/models/tfidf')  # same for tfidf, lda, ...

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [13]:
from gensim.models import LsiModel

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)
lsi.save('flaskexample/models/lsimodel') 

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [14]:
lsi = models.LsiModel.load('flaskexample/models/lsimodel')
lsi.show_topics(num_topics=2, num_words=10, log=False, formatted=True)


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


[(0,
  '-0.169*"cologn" + -0.110*"floral" + -0.108*"perfum" + -0.095*"man" + -0.094*"boyfriend" + -0.090*"sexi" + -0.088*"rollerbal" + -0.086*"vanilla" + -0.085*"flowerbomb" + -0.081*"chanel"'),
 (1,
  '0.507*"cologn" + 0.266*"man" + 0.235*"boyfriend" + 0.175*"polo" + 0.155*"husband" + 0.155*"men" + 0.151*"homm" + 0.144*"masculin" + 0.116*"dad" + 0.116*"christma"')]

In [15]:
vector = lsi[corpus_tfidf]
lsivector = gensim.matutils.corpus2csc(vector)
lsivector = lsivector.T.toarray()
lsi_matrix = pd.DataFrame(lsivector, index=df_review['name'])
print(lsi_matrix.shape)
lsi_matrix.head(3)


(255, 100)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1 Million,-0.467836,0.428476,-0.009583,-0.003225,-0.015282,-0.055295,-0.033855,0.010762,0.002709,0.064704,...,0.024274,0.017948,-0.005488,-0.007281,-0.019485,-0.013152,-0.009666,0.021718,-7.4e-05,0.013393
1 Million Lucky,-0.183084,0.179611,-0.007931,0.002365,-0.012043,-0.0075,-0.013948,0.004978,-0.00698,0.017636,...,-0.063313,-0.131907,0.047674,-0.09652,-0.125163,0.044155,0.003431,-0.023883,-0.059151,-0.083248
212,-0.150757,-0.025141,0.003826,0.016608,-0.026302,-0.008933,-0.01612,-0.017192,0.02085,-0.015466,...,-0.250134,-0.112182,0.049766,0.148457,0.036002,-0.225343,-0.299844,0.032035,0.057944,0.022443


In [16]:
# save the lsi matrix
pickle.dump(lsi_matrix, open("flaskexample/models/lsi_embeddings.pkl", "wb"))

In [17]:
#vecterization test
test = 'This is a test. I like a lot of flowers'
pre_test = preprocessText(test)
test_corpus = [dictionary.doc2bow(pre_test.split())]
test_corpus_tfidf = tfidf[test_corpus]
test_vector = lsi[test_corpus_tfidf]
test_vector = gensim.matutils.corpus2csc(test_vector)
test_vector = test_vector.T.toarray()
test_df = pd.DataFrame(test_vector)
test_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.070583,-0.048519,0.003678,0.010937,0.033745,0.015789,-0.029174,-0.042261,-0.039508,0.007239,...,-0.033118,0.056092,-0.002053,-0.025589,0.022807,0.017541,-0.044739,0.01606,-0.033129,-0.004375
