# NLP: Recommendations and Sentiment Analysis

We will perform two common NLP tasks: 
 1. Generate recommendations for products based on product descriptions using an LDA topic model.
 2. Perform sentiment analysis based on product reviews using sklearn Pipelines.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Part 1: Generate Recommendations from LDA Transformation

I transform a set of product descriptions using TfIdf and LDA topic modeling to generate product recommendations based on similarity in LDA space. 

## Load data and transform text using TfIDF

In [3]:
df_jcp = pd.read_csv('../data/jcpenney-products_subset.csv.zip')

print(df_jcp.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name_title   5000 non-null   object
 1   description  5000 non-null   object
dtypes: object(2)
memory usage: 78.2+ KB
None


In [4]:
print(df_jcp.name_title.iloc[0])

print('-'*50) 


print(df_jcp.description.iloc[0])

Invicta® Sl Rally Mens Black Leather Strap Chronograph Watch 16012
--------------------------------------------------
A timepiece you can enjoy every day of the week, this sports car-inspired chronograph watch packs plenty of information into an easy-to-read dial.   Brand: Invicta Dial Color: Black Strap: Black leather Clasp: Buckle Movement: Quartz Water Resistance: 100m Case Width: 48mm Case Thickness: 13.5mm Bracelet Dimensions: 210mm long; 22mm wide Model No.: 16012 Special Features: Stopwatch; 3 multifunction sub dials   Jewelry photos are enlarged to show detail.


In [5]:
# Transform Descriptions using TfIdf

# Import TfidfVectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
    
#  Instantiate a TfidfVectorizer that will
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=10, max_df=.10)

X_tfidf = tfidf.fit_transform(df_jcp.description)

print(X_tfidf.shape)

(5000, 5678)


In [6]:
print(tfidf.inverse_transform(X_tfidf[0]))

[array(['jewelry photos', 'features stopwatch', 'special features',
       'model no', 'wide model', '22mm wide', 'long 22mm',
       'bracelet dimensions', 'case thickness', 'case width',
       'resistance 100m', 'water resistance', 'quartz water',
       'movement quartz', 'buckle movement', 'clasp buckle',
       'leather clasp', 'black leather', 'strap black', 'black strap',
       'color black', 'dial color', 'to read', 'easy to', 'an easy',
       'plenty of', 'of the', 'day of', 'every day', 'you can', 'sub',
       'stopwatch', 'special', 'no', 'model', 'wide', '22mm',
       'dimensions', 'bracelet', '5mm', '13', 'thickness', 'width',
       'case', '100m', 'resistance', 'water', 'quartz', 'movement',
       'buckle', 'clasp', 'leather', 'strap', 'black', 'color', 'brand',
       'dial', 'read', 'into', 'plenty', 'watch', 'chronograph',
       'inspired', 'car', 'sports', 'week', 'day', 'every', 'enjoy',
       'can'], dtype='<U24')]


In [7]:
# Format Bigrams
vocab = tfidf.get_feature_names_out()

vocab = [term.replace(' ', '_') for term in vocab]

# Printing the last 5 terms
print(vocab[-5:])

['zipper_pocket', 'zipper_pockets', 'zippered', 'zirconia', 'zone']


## Transform product descriptions into topics and print sample terms from topics


In [8]:
# Perform Topic Modeling with LDA

# I use Latent Direchlet Allocation to learn 
#   per-document topic distributions and per-topic term distributions.

from sklearn.decomposition import LatentDirichletAllocation

# Instantiate a LatentDirichletAllocation model 
lda = LatentDirichletAllocation(n_components=20, n_jobs=-1, random_state=512)

X_lda = lda.fit_transform(X_tfidf)

X_lda.shape

(5000, 20)

In [9]:
# Get Assigned Topics for Product at df_jcp row 0

theta_0 = X_lda[0].round(2)
print(f'{theta_0 = :}\n')

# LDA will assign a small weight (or proability) to each topic for a document
n_topics_assigned_0 = [topic for topic in theta_0 if topic > 0.01]
print(f'{n_topics_assigned_0 = :}\n')

assigned_topics_0 = np.array(np.argsort(theta_0)[::-1])[:len(n_topics_assigned_0)]
print(f'{assigned_topics_0 = :}')

theta_0 = [0.01 0.74 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01
 0.16 0.01 0.01 0.01 0.01 0.01]

n_topics_assigned_0 = [0.74, 0.16]

assigned_topics_0 = [ 1 14]


In [10]:
# Print Top Topic Terms

vocab = np.array(vocab)

# assert that vocab is the correct datatype
assert type(vocab) is np.ndarray, "vocab needs to be converted to a numpy array"

topic_dist = np.array(np.argsort(lda.components_)[::-1])[:,:5]
topic_vocab = vocab[topic_dist]
for topic_idx, topic_words in enumerate(topic_vocab): 
    words = ' '.join(topic_words)
    print(f'Topic #{topic_idx:2d} : {words}')

Topic # 0 : great_gift edge_to dimensions_18 sale_of edta
Topic # 1 : ci seating_arrangement it_does screwdriver_needed screwdriver
Topic # 2 : for_proper recommended_yes shirttail ring_with the_dial
Topic # 3 : compression_fit yes_use retardant_yes yes_slip garment_should
Topic # 4 : vitamin white_gold shaft_circumference are_extremely comforter_shams
Topic # 5 : petite_short twin_comforter sodium_benzoate mesh_panels adjustable_cuffs
Topic # 6 : solar sunlight_measures alterations crevice crevice_tool
Topic # 7 : purchase_this bamboo redness fresh_food refund
Topic # 8 : suit_pants and_plywood fit_snugly gold_over fit_straight
Topic # 9 : fit_snugly hoop hoop_earrings i1 i2_setting
Topic #10 : sits_below seams_to seam_pockets screen_printed savings
Topic #11 : care_some elastane_machine chafing hemmed_cotton chain_jewelry
Topic #12 : cycles skintone pillowcases_king pillows_are skin_types
Topic #13 : and_thighs cotton_comfort booties easy_gift sleeves_regular
Topic #14 : require_spec

## Generate recommendations using topics

In [12]:
# Generate Similarity Matrix

# I use Content-Based Filtering to make recommendations based on a query product.
# Each product will be represented by its LDA topic weights learned above (X_lda).
# I try to recommend similar products in LDA space using cosine distance as our measure of similarity, 
# where lower distance means more similar.

from sklearn.metrics.pairwise import cosine_distances

# Use cosine_distances to generate similarity scores on our X_lda data
distances = cosine_distances(X_lda)

distances.shape

(5000, 5000)

In [13]:
# Find Recommended Products

print(df_jcp.name_title[np.argsort(distances[0])[:10]].values)

['Invicta® Sl Rally Mens Black Leather Strap Chronograph Watch 16012'
 'Seiko® Mens Two-Tone Brown Dial Chronograph Watch SSC142'
 'Despicable Me Minions Kids Flashing and Sound Digital Watch'
 'Citizen® Eco-Drive® Womens Crystal-Accent Stainless Steel Watch EX1320-54E'
 'Womens Crystal-Accent White Lizard Faux Leather Cuff Bangle Watch'
 'Star Wars® Stormtrooper Kids Flashing and Sound Digital Watch'
 'Casio® Mens Champagne Dial Black Resin Strap Sport Watch MW600F-9AV'
 'TKO ORLOGI Womens Crystal-Accent Chain-Link Blue Silicone Strap Stretch Watch'
 'Pulsar® Mens Silver-Tone Black Ion Watch PS9273'
 'Armitron® ProSport Womens Digital Sport Chronograph Watch 45/7036PNK']


# Sentiment Analysis Using Pipelines

Here I train a model to classify positive vs negative sentiment on a set of pet supply product reviews using sklearn Pipelines.

In [14]:
# The dataset I am working with is a set of product reviews of pet supply items on Amazon.
# This data is taken from https://nijianmo.github.io/amazon/index.html
#   "Justifying recommendations using distantly-labeled reviews and fined-grained aspects"
#   Jianmo Ni, Jiacheng Li, Julian McAuley
#   Empirical Methods in Natural Language Processing (EMNLP), 2019

df_amzn = pd.read_csv('../data/amazon-petsupply-reviews_subset.csv.zip')

print(df_amzn.info())
print() 
print(df_amzn.review[0])
print(df_amzn.rating[0])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  10000 non-null  object
 1   rating  10000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 156.4+ KB
None

My cats are considerably more happy with this toy...and I don't have to leave the sofa to use it, given the long wand length. yay laziness!!
5


In [15]:
# Transform Target

# I turn rating on 5 point scale into a binary classification task to approximate positive vs negative sentiment

print(df_amzn.value_counts(subset='rating', normalize=True).round(2))

y = df_amzn.rating.replace(to_replace={5:True, 4:False, 3:False, 2:False, 1:False})

print()

print(y.value_counts(normalize=True).round(2))

rating
5    0.66
4    0.14
3    0.09
1    0.06
2    0.05
dtype: float64

True     0.66
False    0.34
Name: rating, dtype: float64


In [16]:
# Train-test split

from sklearn.model_selection import train_test_split

reviews_train,reviews_test,y_train,y_test = train_test_split(df_amzn.review, y, stratify=y, test_size=0.2, random_state=512)

print(y_train.value_counts(normalize=True).round(2))

True     0.66
False    0.34
Name: rating, dtype: float64


In [17]:
# Create a Pipeline of TfIdf transformation and Classification

from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier

pipe_gbc = Pipeline([('tfidf', TfidfVectorizer(min_df=5, max_df=0.50)),('gbc', GradientBoostingClassifier(n_estimators=20))])

print(pipe_gbc)

Pipeline(steps=[('tfidf', TfidfVectorizer(max_df=0.5, min_df=5)),
                ('gbc', GradientBoostingClassifier(n_estimators=20))])


In [18]:
# Perform Grid Search on pipe_gbc

from sklearn.model_selection import GridSearchCV

param_grid = {'tfidf__ngram_range':[(1,1), (1,2)], 'gbc__max_depth':[2,10]}

gs_pipe_gbc = GridSearchCV(estimator=pipe_gbc, param_grid=param_grid, cv=2, n_jobs=-1).fit(reviews_train, y_train)

print(gs_pipe_gbc.best_params_)

print(gs_pipe_gbc.best_score_.round(2))

{'gbc__max_depth': 10, 'tfidf__ngram_range': (1, 2)}
0.75


In [19]:
# Evaluate on the test set

print(gs_pipe_gbc.score(reviews_test,y_test).round(2))

0.76


In [20]:
# Evaluate on example reviews

print(gs_pipe_gbc.predict(['This is a great product.', 'This product is not great.']))

[ True False]
