## LDA

In [18]:
import numpy as np
import pandas as pd


reviews_datasets = pd.read_csv('Reviews.csv')
reviews_datasets = reviews_datasets.head(50000)
reviews_datasets.dropna()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
49995,49996,B00430B73W,AKW867T7C6HF8,HWJ3,0,0,5,1261180800,Grown to really like them,My sister introduced me to these. They have be...
49996,49997,B00430B73W,A2PEHNEDMHOYTW,L. Allen,0,0,5,1258934400,Healthy Snack at a great price!,A friend recommended these bars and I fell in ...
49997,49998,B00430B73W,A2QH2KF2IAB143,"J. A. Meyers ""Jan""",0,0,5,1257379200,Raw Revolution Hazelnut Cashew,Fabulous! I take one to work with me every da...
49998,49999,B00430B73W,AMX286UGXISMA,Aaron Dragushan,0,0,4,1247702400,"fantastic, but chew carefully",I love these bars and will continue to buy the...


In [3]:
reviews_datasets['Text'][200]

"Even with small containers, they don't fill them up.  These little tins are less than half filled and at the price charged it seems a rip-off. Is there some exotic ingredient as costly as gold contained in those tiny squares?  Or how about the cereal ploy, they were filled at the factory but settled in transport.<br />Can manufacturers be honest in their dealings?"

In [4]:
# vectorize data

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')

doc_term_matrix = vectorizer.fit_transform(reviews_datasets['Text'].values.astype('U'))

In [5]:
doc_term_matrix

<50000x22562 sparse matrix of type '<class 'numpy.int64'>'
	with 1503354 stored elements in Compressed Sparse Row format>

In [6]:
from sklearn.decomposition import LatentDirichletAllocation

LDA = LatentDirichletAllocation(n_components=5, random_state=42)
LDA.fit(doc_term_matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [7]:
first_topic = LDA.components_[0]

In [8]:
first_topic

array([8.32040219, 5.18797456, 0.20000045, ..., 0.20001857, 0.20000802,
       2.19318742])

In [9]:
top_topic_words = first_topic.argsort()[-10:]

In [10]:
top_topic_words

array([11099,  6750,  9043,  5462,  8146, 19955, 11753,  2985, 20004,
        4475], dtype=int64)

In [11]:
for i in top_topic_words:
    print(f'{vectorizer.get_feature_names()[i]}')

just
drink
good
cup
flavor
taste
like
br
tea
coffee


In [13]:
# top ten words for each topic

for i, topic in enumerate(LDA.components_):
    print(f'Top ten words for topic: {[i]}')
    print(f'{[vectorizer.get_feature_names()[i] for i in topic.argsort()[-10:]]}')
    print('\n')

Top ten words for topic: [0]
['just', 'drink', 'good', 'cup', 'flavor', 'taste', 'like', 'br', 'tea', 'coffee']


Top ten words for topic: [1]
['water', 'use', 'just', 'flavor', 'good', 'taste', 'sugar', 'product', 'like', 'br']


Top ten words for topic: [2]
['love', 'just', 'great', 'chips', 'flavor', 'good', 'taste', 'br', 'chocolate', 'like']


Top ten words for topic: [3]
['just', 'store', 'order', 'good', 'buy', 'br', 'great', 'price', 'product', 'amazon']


Top ten words for topic: [4]
['cats', 'treat', 'loves', 'cat', 'like', 'dogs', 'treats', 'br', 'dog', 'food']




In [14]:
# assign pobability of all topics to each document

topic_values = LDA.transform(doc_term_matrix)
topic_values.shape

(50000, 5)

In [15]:
reviews_datasets['Topic'] = topic_values.argmax(axis=1)

In [17]:
reviews_datasets.head(2)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Topic
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,4
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,3


## NNMF

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_df=0.8, min_df=2, stop_words='english')

doc_term_matrix = tfidf.fit_transform(reviews_datasets['Text'].values.astype('U'))

In [20]:
from sklearn.decomposition import NMF

nmf = NMF(n_components=5, random_state=42)
nmf.fit(doc_term_matrix)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=5, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [22]:
for i, topic in enumerate(nmf.components_):
    print(f'Top 10 words for topic: {i}')
    print(f'{[tfidf.get_feature_names()[i] for i in topic.argsort()[-10:]]}')
    print('\n')

Top 10 words for topic: 0
['love', 'chips', 'chocolate', 'just', 'product', 'flavor', 'taste', 'good', 'great', 'like']


Top 10 words for topic: 1
['product', 'like', 'ingredients', 'ginger', 'amazon', 'juice', 'drink', 'sugar', 'water', 'br']


Top 10 words for topic: 2
['smooth', 'blend', 'like', 'bold', 'flavor', 'roast', 'cups', 'strong', 'cup', 'coffee']


Top 10 words for topic: 3
['bags', 'good', 'like', 'flavor', 'cup', 'iced', 'teas', 'drink', 'green', 'tea']


Top 10 words for topic: 4
['eat', 'love', 'cats', 'treat', 'cat', 'loves', 'dogs', 'treats', 'food', 'dog']




In [23]:
topic_values = nmf.transform(doc_term_matrix)
reviews_datasets['Topic'] = topic_values.argmax(axis=1)

In [24]:
reviews_datasets.head(2)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Topic
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,4
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,0
