In [1]:
#! pip install gensim
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from nltk.tokenize import word_tokenize
from string import punctuation 
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
import gensim
from gensim.test.utils import get_tmpfile


In [2]:
def preprocessText(text):
            
        stops = stopwords.words("english")
        #add more stopwords in this particular case
        stops.extend(['love','like','hate','amazing','favorite','dislike',"don't",'awesome','great','good','bad','horrible','excellent'])
        stemmer = SnowballStemmer('english')

        text = str(text).lower() # convert text to lower-case
        text = word_tokenize(text) # remove repeated characters (helloooooooo into hello)    
     
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words) # word stemmer
    
        tokenizer = RegexpTokenizer(r'\w+') #tokenize
        text = tokenizer.tokenize(text)
    
        stop_words = [word for word in text if word not in stops]
        text = " ".join(stop_words)
   
        return text

In [3]:
#Text preprocess 
df_review = pd.read_csv('flaskexample/data/fragrance_data.csv')
df_review['preprocess_text_tot'] = df_review['text_tot'].apply(lambda x: preprocessText(x))

# Use doc2vec

In [4]:
# build vocabuary using doc2vec
documents = df_review['preprocess_text_tot']
formatted_documents = [gensim.models.doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]

model = gensim.models.doc2vec.Doc2Vec(vector_size=200, min_count=3, epochs=200, window=10, dm=1,workers=1,alpha = 0.01)
model.build_vocab(formatted_documents)

In [5]:
# train model
%time model.train(formatted_documents, total_examples=model.corpus_count, epochs=model.epochs)


CPU times: user 7min 59s, sys: 4.05 s, total: 8min 3s
Wall time: 8min 5s


In [6]:
#test
inputmessage = "I love sea"
vector = model.infer_vector(inputmessage.split(" "),epochs=10)
vector

array([-1.6599757e-03,  4.7895854e-04, -1.1084810e-03, -6.2570791e-05,
       -3.4315092e-04,  9.3989569e-04, -1.1118054e-03, -1.0570225e-03,
        3.2591397e-05,  1.4691929e-03, -5.7970575e-04,  6.6327269e-04,
       -2.1968475e-03, -4.3763238e-04,  1.1705793e-03, -1.2900918e-03,
        1.9383549e-03, -1.7033012e-03, -3.2131435e-04,  1.2948684e-03,
       -1.8146278e-03, -4.2696780e-04,  1.5852125e-03,  1.9819664e-03,
       -2.0441697e-03, -7.8427262e-04, -1.5735693e-03,  7.1230228e-04,
        1.6297232e-03, -1.7283623e-04,  1.4628111e-03,  1.7514838e-03,
       -1.6147085e-03, -1.9867991e-03,  4.8428398e-04, -1.9696024e-03,
       -2.3769338e-03, -1.5698755e-03, -3.3900543e-04,  1.9324881e-03,
       -1.8121253e-03,  8.4625330e-04,  2.0443310e-03,  4.3857360e-04,
        1.9232263e-03, -1.8398515e-03, -1.7525350e-03, -1.7357144e-03,
        6.6767732e-04, -1.5929957e-03,  1.9157456e-03,  1.1282664e-03,
       -1.2679981e-03,  2.2083309e-03, -4.4496828e-05, -4.9189851e-04,
      

In [7]:
model.save("flaskexample/models/doc2vec_model")
model = gensim.models.doc2vec.Doc2Vec.load("flaskexample/models/doc2vec_model")


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [8]:
# build doc2vec matrix
doctovec_feature_matrix = pd.DataFrame(model.docvecs.vectors_docs, index=df_review['name'])
print(doctovec_feature_matrix.shape)
doctovec_feature_matrix.head(3)


(255, 200)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1 Million,-0.562577,-0.519505,0.69566,-0.477294,-0.028094,0.697598,1.437936,1.34756,0.828583,-1.434612,...,-0.040419,-0.417882,0.084524,0.17407,0.299372,-0.522378,0.814809,-0.51587,0.260511,0.419477
1 Million Lucky,0.101255,-0.351903,-0.13784,-0.381925,-0.23274,0.14786,-0.423483,0.858232,0.279154,-0.050326,...,-0.375922,0.444531,-0.260744,0.787415,0.949675,-0.640151,0.988527,-0.061021,0.105629,-1.264991
212,-1.395752,-1.745973,0.674756,0.623539,-0.611494,1.273818,0.527928,-1.231759,-1.320426,-0.839876,...,-0.229858,0.171408,-1.194211,1.206744,0.329017,-0.75183,1.68544,0.255388,-0.351774,-0.421122


In [9]:
pickle.dump(doctovec_feature_matrix, open("flaskexample/models/doctovec_embeddings.pkl", "wb"))

# Use LSA, TF-IDF

In [10]:
#Use gensim
# create dictionary
from gensim import corpora, models
dictionary = corpora.Dictionary(df_review['preprocess_text_tot'].str.split())
dictionary.save('flaskexample/models/dictionary')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [11]:
#create corpus
corpus = [dictionary.doc2bow(text) for text in df_review['preprocess_text_tot'].str.split()]

In [12]:
#create TF-IDF vectors
tfidf = models.TfidfModel(corpus) # fit model
corpus_tfidf = tfidf[corpus]
tfidf.save('flaskexample/models/tfidf')  # same for tfidf, lda, ...

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [13]:
from gensim.models import LsiModel

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=200)
lsi.save('flaskexample/models/lsimodel') 

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [14]:
lsi = models.LsiModel.load('flaskexample/models/lsimodel')
lsi.show_topics(num_topics=2, num_words=10, log=False, formatted=True)


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


[(0,
  '0.169*"cologn" + 0.110*"floral" + 0.108*"perfum" + 0.095*"man" + 0.094*"boyfriend" + 0.090*"sexi" + 0.088*"rollerbal" + 0.086*"vanilla" + 0.085*"flowerbomb" + 0.081*"chanel"'),
 (1,
  '0.507*"cologn" + 0.266*"man" + 0.235*"boyfriend" + 0.175*"polo" + 0.155*"husband" + 0.155*"men" + 0.151*"homm" + 0.144*"masculin" + 0.116*"dad" + 0.116*"christma"')]

In [15]:
vector = lsi[corpus_tfidf]
lsivector = gensim.matutils.corpus2csc(vector)
lsivector = lsivector.T.toarray()
lsi_matrix = pd.DataFrame(lsivector, index=df_review['name'])
print(lsi_matrix.shape)
lsi_matrix.head(3)


(255, 200)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1 Million,0.467838,0.428484,-0.009549,0.003317,-0.015806,0.055306,-0.033424,0.010779,0.003014,-0.064551,...,0.007689,0.052416,0.027065,-0.093919,-0.055425,-0.000665,0.024111,0.044285,-0.065849,-0.044501
1 Million Lucky,0.183083,0.17963,-0.007974,-0.002339,-0.011847,0.00745,-0.014036,0.004839,-0.006911,-0.017567,...,-0.012898,-0.007712,-0.002897,0.006447,0.009649,-0.001082,0.000793,-0.000853,0.002476,0.014148
212,0.150757,-0.025135,0.003744,-0.016544,-0.026246,0.008955,-0.016241,-0.017428,0.021355,0.015735,...,-0.000472,0.009419,-0.041621,-0.015855,0.034099,0.028615,0.024081,-0.002326,0.000438,-0.005792


In [16]:
# save the lsi matrix
pickle.dump(lsi_matrix, open("flaskexample/models/lsi_embeddings.pkl", "wb"))

In [17]:
#vecterization test
test = 'This is a test. I like a lot of flowers'
pre_test = preprocessText(test)
test_corpus = [dictionary.doc2bow(pre_test.split())]
test_corpus_tfidf = tfidf[test_corpus]
test_vector = lsi[test_corpus_tfidf]
test_vector = gensim.matutils.corpus2csc(test_vector)
test_vector = test_vector.T.toarray()
test_df = pd.DataFrame(test_vector)
test_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.070584,-0.048531,0.003678,-0.010868,0.033721,-0.015858,-0.029104,-0.042187,-0.039633,-0.007146,...,-0.036133,-0.037529,0.001101,-0.03984,-0.004625,-0.024633,0.009543,-0.023062,0.001513,0.005495
