In [24]:
import pickle
import re
from functools import lru_cache
from string import punctuation

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.test.utils import get_tmpfile
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize


In [25]:
# NLTK helpers that hold no per-call state; built once instead of on every call.
_STEMMER = SnowballStemmer('english')
_WORD_TOKENIZER = RegexpTokenizer(r'\w+')

# Opinion words added on top of the NLTK English list for this review corpus.
_EXTRA_STOP_WORDS = ('love', 'like', 'hate', 'amazing', 'favorite', 'dislike', "don't",
                     'awesome', 'great', 'good', 'bad', 'horrible', 'excellent')


@lru_cache(maxsize=None)
def _stop_words(stemmed=False):
    """Return the stop-word set, reading the NLTK corpus only on first use.

    With ``stemmed=True`` the stemmed form of every stop word is added as well,
    so the result is always a superset of the default set.
    """
    words = set(stopwords.words("english"))
    words.update(_EXTRA_STOP_WORDS)
    if stemmed:
        # The list is fully built before update() runs, so the set is not
        # mutated while it is being iterated.
        words.update([_STEMMER.stem(word) for word in words])
    return frozenset(words)


def preprocessText(text, stem_stopwords=False):
    """Lower-case, stem and stop-word-filter one piece of text.

    Steps: ``str()`` + lower-case -> NLTK ``word_tokenize`` -> Snowball
    stemming -> re-tokenise on runs of word characters (drops punctuation)
    -> remove stop words -> join with single spaces.

    Parameters
    ----------
    text : object
        Input text; converted with ``str()`` first, so non-string values
        (e.g. NaN) are processed as their string representation.
    stem_stopwords : bool, default False
        By default the stop words are compared *unstemmed* against tokens that
        have already been stemmed, so entries whose stem differs from the word
        itself (e.g. 'amazing' -> 'amaz') are not removed. That is the original
        behaviour and is kept as the default so existing results stay
        unchanged. Pass True to also match the stemmed forms.

    Returns
    -------
    str
        The remaining stemmed tokens joined by single spaces.
    """
    stops = _stop_words(bool(stem_stopwords))

    tokens = word_tokenize(str(text).lower())
    # Stem first, then re-tokenise on \w+ so punctuation-only tokens are dropped.
    stemmed_text = " ".join(_STEMMER.stem(token) for token in tokens)
    words = _WORD_TOKENIZER.tokenize(stemmed_text)

    return " ".join(word for word in words if word not in stops)

In [26]:
# Load the review data and store a preprocessed copy of the 'text_tot' column.
df_review = pd.read_csv('flaskexample/data/fragrance_data.csv')
# preprocessText takes one value and returns a string, so it can be passed directly.
df_review['preprocess_text_tot'] = df_review['text_tot'].apply(preprocessText)

In [27]:
# The saved model could not be loaded directly, so the dictionary, bag-of-words
# corpus and TF-IDF model are rebuilt here from the preprocessed text.
from gensim import corpora, models

# Split each preprocessed document into tokens once and reuse the result for
# both the dictionary and the corpus.
tokenized_reviews = df_review['preprocess_text_tot'].str.split()
dictionary = corpora.Dictionary(tokenized_reviews)
# Bag-of-words corpus: one doc2bow vector per document.
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_reviews]
# Fit TF-IDF on the bag-of-words corpus and apply it to the same corpus.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]


In [30]:
# Dense TF-IDF array with one row per document and one column per dictionary id.
# corpus2csc returns terms x documents, hence the transpose.
# num_terms pins the column count to the dictionary size; without it the width
# is inferred from the largest term id that actually occurs in the corpus.
tfidfvector = gensim.matutils.corpus2csc(corpus_tfidf, num_terms=len(dictionary)).T.toarray()
# Label the rows with the 'name' column of df_review.
tfidf_matrix = pd.DataFrame(tfidfvector, index=df_review['name'])
print(tfidf_matrix.shape)
tfidf_matrix.head(3)

(255, 18901)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,18891,18892,18893,18894,18895,18896,18897,18898,18899,18900
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1 Million,0.007463,0.02117,0.018286,0.07553,0.007271,0.011376,0.016372,0.010459,0.00967,0.031287,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1 Million Lucky,0.0,0.0,0.0,0.014412,0.016186,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
212,0.0,0.0,0.0,0.002266,0.010179,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Display the dense TF-IDF array (NumPy abbreviates the repr of large arrays).
tfidfvector

array([[0.0074628 , 0.02117018, 0.018286  , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.0083488 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.03671832, 0.03671832,
        0.03671832]])

In [67]:
from sklearn.decomposition import TruncatedSVD

# Rank-200 truncated SVD of the dense TF-IDF matrix.
# The default 'randomized' solver is stochastic; a fixed random_state makes the
# singular values and explained variance reproducible across runs.
svd = TruncatedSVD(n_components=200, n_iter=10, random_state=42)
svd.fit(tfidfvector)



TruncatedSVD(algorithm='randomized', n_components=200, n_iter=10,
       random_state=None, tol=0.0)

In [68]:
# Total fraction of variance captured by the fitted SVD components.
print(np.sum(svd.explained_variance_ratio_))


0.9372175682306161


In [51]:
# Singular values of the fitted TruncatedSVD.
svd.singular_values_


array([4.81483556, 2.61140459, 2.31029444, 2.08209007, 1.99403052,
       1.82569384, 1.71527602, 1.67024162, 1.63998637, 1.62377595,
       1.60188158, 1.58106594, 1.57437442, 1.53520446, 1.52628286,
       1.50927503, 1.49269773, 1.47400217, 1.45830651, 1.4056428 ,
       1.40171867, 1.37031096, 1.36011905, 1.35798581, 1.33096218,
       1.32129729, 1.30738461, 1.2879087 , 1.28030267, 1.27417099,
       1.25655478, 1.24038144, 1.23167056, 1.2285243 , 1.21670199,
       1.20962948, 1.20348022, 1.19020867, 1.18458046, 1.17883124,
       1.17016599, 1.15542991, 1.15234882, 1.15037424, 1.14485603,
       1.14018553, 1.13166888, 1.12450712, 1.11075348, 1.11003782,
       1.09909841, 1.09260624, 1.08682695, 1.08148771, 1.08028199,
       1.07611661, 1.06421424, 1.06286382, 1.06063753, 1.05928822,
       1.05041158, 1.04996543, 1.04382895, 1.04176209, 1.0373383 ,
       1.02887164, 1.02753251, 1.02487223, 1.02118495, 1.01843659,
       1.01456021, 1.01047822, 1.00770554, 1.00108292, 0.99933

In [40]:
# Load the gensim LSI model saved on disk and display its singular values
# (lsi.projection.s) for comparison with svd.singular_values_.
# NOTE(review): gensim's load() unpickles the file, so only load model files
# from a trusted source.
lsi = models.LsiModel.load('flaskexample/models/lsimodel')
lsi.projection.s

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


array([4.81483556, 2.61140328, 2.31029242, 2.08208411, 1.99402544,
       1.82567742, 1.71525166, 1.67022121, 1.63994589, 1.62370665,
       1.60182378, 1.58102264, 1.57430991, 1.53512549, 1.52617852,
       1.50917556, 1.49260206, 1.47390863, 1.4582081 , 1.40536772,
       1.40154535, 1.37014659, 1.35992935, 1.35769438, 1.33065821,
       1.32092854, 1.30714085, 1.28769599, 1.28006589, 1.27369208,
       1.25616317, 1.23981042, 1.23106456, 1.2281145 , 1.21623786,
       1.20910058, 1.20295242, 1.18935242, 1.18396779, 1.17798097,
       1.16944256, 1.15472199, 1.15174006, 1.14952505, 1.14424076,
       1.13928736, 1.13029302, 1.1235779 , 1.10977311, 1.10884242,
       1.09769528, 1.09155279, 1.08561004, 1.08031535, 1.07890647,
       1.07427319, 1.06324446, 1.06149532, 1.05936182, 1.05804335,
       1.04958882, 1.04852465, 1.04314241, 1.04017455, 1.03666629,
       1.02762633, 1.02589918, 1.02351372, 1.01983117, 1.01711655,
       1.0122475 , 1.00840034, 1.00646706, 0.99963748, 0.99702