# Setup

In [None]:
import requests
import time, os
from mongoengine import *
from sklearn.feature_extraction.text import CountVectorizer
import re
import string
import sys
import pymongo
from pymongo import MongoClient
import pandas as pd
from nltk import word_tokenize, pos_tag
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from sklearn.decomposition import NMF
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import numpy as np
from nltk import word_tokenize, pos_tag
from sklearn.preprocessing import normalize
import collections

"""
MongoDB Document Setup
"""
# Open the default connection to the "mongodb_sephora_reviews" database.
# `connect` is not imported by name anywhere above, so it presumably comes from
# the `from mongoengine import *` star import (TODO confirm). No host/port is
# passed, so the library defaults apply.
connect("mongodb_sephora_reviews")



In [None]:


def nouns(text):
    '''Return the adjectives and adverbs of ``text`` joined by single spaces.

    NOTE: despite its name (kept for backward compatibility) this function
    does not extract nouns. It keeps every token whose part-of-speech tag
    starts with ``JJ`` or ``RB``. In the Penn Treebank tag set used by
    ``nltk.pos_tag`` these prefixes cover adjectives (JJ, JJR, JJS) and
    adverbs (RB, RBR, RBS).

    The previous implementation also compared a two-character slice of the
    tag against the three-character tags ``'JJR'``, ``'JJS'`` and ``'RBR'``.
    Those comparisons could never be true; the comparative/superlative forms
    were already matched by the two-character prefix check, so the selected
    words are unchanged.

    Parameters:
        text: the string to tokenize and tag.

    Returns:
        A string containing the selected words in their original order,
        separated by single spaces (empty string when nothing matches).
    '''
    wanted_prefixes = ('JJ', 'RB')
    tagged_tokens = pos_tag(word_tokenize(text))
    return ' '.join(word for word, pos in tagged_tokens
                    if pos.startswith(wanted_prefixes))


def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted features of every topic in ``model``.

    Parameters:
        model: object exposing ``components_``, an iterable of per-topic
            weight vectors that support ``argsort()``.
        feature_names: sequence indexable by the positions returned from
            ``argsort()``; entries must be strings.
        no_top_words: how many features to list per topic.
        topic_names: optional sequence of labels, one per topic. A falsy
            label (or no sequence at all) falls back to the topic number.

    Returns:
        None. Output goes to stdout: a header line per topic followed by a
        comma-separated list of feature names, strongest first.
    """
    for topic_idx, weights in enumerate(model.components_):
        label = topic_names[topic_idx] if topic_names else None
        if label:
            print("\nTopic: ''", label, "''")
        else:
            print("\nTopic ", topic_idx)

        # argsort is ascending, so walk it backwards from the end to take the
        # `no_top_words` largest weights in descending order.
        strongest = weights.argsort()[:-no_top_words - 1:-1]
        top_features = [feature_names[pos] for pos in strongest]
        print(", ".join(top_features))
        
def clean_text(txt):
    """Normalize a review string into a space-separated list of word stems.

    Steps, in order:
      1. drop every character found in ``string.punctuation`` and lower-case
         the rest;
      2. split on runs of non-word characters;
      3. discard tokens present in the global ``stopwords`` collection;
      4. stem each remaining token with the global stemmer ``ps`` and discard
         stems present in the global ``blocker_words`` collection.

    The globals ``stopwords``, ``ps`` and ``blocker_words`` are not defined
    here; they must be assigned (see the data-cleaning cell) before the first
    call, otherwise a NameError/AttributeError is raised.

    Parameters:
        txt: the raw text to clean.

    Returns:
        The surviving stems joined by single spaces.
    """
    txt = "".join(c.lower() for c in txt if c not in string.punctuation)

    kept_stems = []
    # Raw string: '\W' in a normal string literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    for word in re.split(r'\W+', txt):
        if word in stopwords:
            continue
        stem = ps.stem(word)  # stem once, reuse for the check and the output
        if stem not in blocker_words:
            kept_stems.append(stem)
    return " ".join(kept_stems)

# Data Cleaning

In [None]:
# Load every document of the `reviews` collection from the local MongoDB
# instance into a DataFrame.
client = MongoClient('mongodb://localhost:27017/')

db = client.mongodb_sephora_reviews

df = pd.DataFrame(list(db.reviews.find({})))

# `clean_text` reads the global name `stopwords` and expects a collection of
# words, so that name must end up bound to the English stop-word list.
# Previously this was written as `stopwords = stopwords.words('english')`,
# which overwrote the imported corpus reader: running the cell a second time
# failed because a list has no `.words` attribute. Importing the reader under
# an alias keeps the rebinding repeatable.
from nltk.corpus import stopwords as nltk_stopwords

stopwords = nltk_stopwords.words('english')
ps = PorterStemmer()
# Stems that are removed from reviews in addition to the standard stop words
# (entries are compared against stemmed tokens, hence spellings such as
# 'realli' or 'everi').
#
# Fixes relative to the previous literal:
#   * a comma was missing after 'job' and after 'noth', so Python's implicit
#     string concatenation produced the single entries 'jobwant' and
#     'nothright' and none of 'job', 'want', 'noth', 'right' was filtered;
#   * duplicate entries ('cleanser', 'stuff', 'cloth') were listed twice.
blocker_words = [
    'skin', 'would', 'cleanser', 'around', 'say', 'month', 'day', 'product', 'time', 'week',
    'use', 'ive', 'get', 'first', 'start', 'night', 'year', 'sinc', 'everi', 'balm', 'receiv', 'free',
    'size', 'test', 'think', 'still', 'like', 'yet', 'price', 'everyth', 'influenst',
    'sampl', 'thing', 'stuff', 'littl', 'almost', 'thought', 'long', 'amount', 'one', 'cloth',
    'sephora', 'morn', 'afterward', 'im', 'go', 'also', 'second', 'took',
    'cotton', 'way', 'face', 'feel', 'love', 'realli', 'tri', 'smell', 'great', 'make', 'well',
    'good', 'take', 'definit', 'amaz', 'super', 'best', 'got', 'never', 'buy', 'bit', 'lot', 'job',
    'want', 'goe', 'review', 'last', 'enough', 'actual', 'sure', 'though', 'usual', 'back', 'seem',
    'far', 'anyth', 'howev', 'bought', 'routin', 'perfect', 'see', 'someth', 'come', 'away',
    'give', 'dont', 'much', 'even', 'find', 'rins', 'star', 'small', 'know', '2', 'skincar', 'noth',
    'right', 'cant', 'two', 'part', 'done', 'came', 'brush', 'wasnt', 'twice', 'pump', 'squeaki',
    'honestli', 'old', 'top', 'prior', 'write', 'wipe', 'nervou', 'eye', 'eyes', 'green', 'blue',
    'hazel', 'brown', 'hair', 'not', 'list', 'listed', 'brunett', 'blond', 'chin', 'real', 'bottl',
    'doesnt', 'open', 'auburn', 'aubur', 'gray', 'grey', 'black', 'red', 'other', 'forward', 'nice',
    'ok',
]


In [None]:
# Stemmed / filtered version of every review, used as the model input.
df['cleaned_data'] = df['review_text'].apply(clean_text)

# Turn the raw rating label into an integer. Note that str.strip treats its
# argument as a set of characters to trim from both ends, not as a literal
# prefix/suffix.
stripped_ratings = df['review_rating'].apply(
    lambda raw: raw.strip('star" ').strip('stars')
)
df['review_rating_cleaned'] = stripped_ratings.astype(int)

In [None]:
df_pos = df[df['review_rating_cleaned'] >=3].reset_index()

In [None]:
df_pos.head()

# NMF Model & CountVectorizer

In [None]:
# Unigram bag-of-words counts over the cleaned reviews in df_pos.
cv1 = CountVectorizer(ngram_range=(1, 1))
list_trained = cv1.fit_transform(df_pos['cleaned_data'])

# Five-topic NMF. A fixed random_state makes the factorization repeatable
# between runs, which matters because the visualization cell further down
# labels topics by their number.
nmf_cleanser_reviews = NMF(n_components=5, random_state=42)
nmf_cleanser_reviews.fit(list_trained)

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer the replacement and fall back on older installs.
if hasattr(cv1, 'get_feature_names_out'):
    feature_names = cv1.get_feature_names_out()
else:
    feature_names = cv1.get_feature_names()

display_topics(nmf_cleanser_reviews, feature_names, 30)

# User Input Recommendation System

In [None]:
# Example free-text query, pushed through the same cleaning and vectorizing
# steps as the reviews.
text = "New to skincare. Need acne remover that will remove oils and clear pores"
text_cleaned = clean_text(text)

user_input = cv1.transform([text_cleaned])
user_input_transform = nmf_cleanser_reviews.transform(user_input)

# Project the reviews with the model that is ALREADY fitted instead of calling
# fit_transform() again. Refitting here happened after the query had been
# projected, so the review vectors and the query vector could come from two
# different factorizations (NMF is not guaranteed to return the same topics,
# or the same topic order, on every fit). transform() keeps both in one
# topic space.
sephora_nmf_model = nmf_cleanser_reviews.transform(list_trained)
nmf_model_df = pd.DataFrame(sephora_nmf_model).add_prefix('topic_')



# Calculating Cosine Similarities

In [None]:
# Topic weights of the single user query, as a Series (one entry per topic).
df_user_input = pd.DataFrame(user_input_transform)
user_matrix = df_user_input.iloc[0]

# Row-normalized topic weights of every review.
norm_features = normalize(sephora_nmf_model)
df_features = pd.DataFrame(norm_features)

# Dot product of each normalized review vector with the query vector. The
# query vector itself is not normalized here, so the scores are proportional
# to (not equal to) cosine similarity; the ranking is what is used below.
similarities = df_features.dot(user_matrix)

# The 100 best-scoring reviews as {row label: score}, best first.
best_scores = similarities.nlargest(100)
sim_dict = best_scores.to_dict()

# Adding Additional Columns to NMF Model Output

In [None]:
# Index of the highest-weighted topic for every review.
# Taken from the raw NMF matrix rather than from nmf_model_df.values: the
# DataFrame gains extra columns ('dominant_topic' itself, then text columns in
# the next cell), so re-running this cell against the DataFrame would include
# those columns in the argmax or fail on the non-numeric ones.
dominant_topic = np.argmax(sephora_nmf_model, axis=1)
nmf_model_df['dominant_topic'] = dominant_topic

In [None]:
# Attach the review metadata to the topic weights. Values are matched on the
# row index of the two frames.
carried_columns = [
    'product',
    'user_skin_type',
    'review_text',
    'cleaned_data',
    'review_rating',
    'review_rating_cleaned',
]
nmf_model_df[carried_columns] = df_pos[carried_columns]

# Product's DataFrame

In [None]:
# Every document of the `product` collection as a DataFrame.
product_docs = list(db.product.find({}))
df_products = pd.DataFrame(product_docs)

# Reviews (with topic weights) joined to their product record; reviews without
# a matching product `_id` are kept with missing product fields (left join).
final_df = pd.merge(
    nmf_model_df,
    df_products,
    how='left',
    left_on='product',
    right_on='_id',
)

In [None]:
product_rating = nmf_model_df.groupby('product')['review_rating_cleaned'].mean().round(2).reset_index()

In [None]:
product_df = df_products.merge(product_rating, how='left', left_on='_id', right_on='product')

In [None]:
product_df.head(2)

# Recommendation System Algorithm

In [None]:

# Map each of the most similar reviews (row position -> product id), keeping
# only reviews rated above 3. sim_dict is ordered best match first and dicts
# preserve insertion order, so top_matches_dict is ordered the same way.
top_matches_dict = collections.defaultdict(list)
for k in sim_dict:
    review_row = nmf_model_df.iloc[k]  # look the row up once
    if int(review_row['review_rating_cleaned']) > 3:
        top_matches_dict[k] = review_row['product']

# Unique product ids, best match first.
# The previous `set(...)` discarded the similarity ranking, so the products
# shown afterwards were an arbitrary pick among the matches; it also iterated
# all values once per key for no effect. dict.fromkeys() de-duplicates while
# keeping first-seen order.
top_matches_list = list(dict.fromkeys(top_matches_dict.values()))

In [None]:
# Build the display payload for at most five recommended products:
# {position: [url, brand, name, price, image url, mean rating]}.
display_fields = ['product_url', 'brand_name', 'product_name', 'price',
                  'product_img_url', 'review_rating_cleaned']

product_display_dict = collections.defaultdict(list)
for i, k in enumerate(top_matches_list):
    if len(product_display_dict) >= 5:
        break
    # Filter the product frame once per product instead of once per field.
    product_row = product_df[product_df['_id'] == k]
    # .item() turns each one-element column into a scalar and raises
    # ValueError when the id matches zero or several rows. 'price' previously
    # lacked .item(), so a whole Series was stored in place of the value.
    product_display_dict[i] = [product_row[field].item()
                               for field in display_fields]

product_display_dict

In [None]:
# Print the text of every review that contributed a recommended product,
# each followed by a visual separator.
for i in top_matches_dict:
    matched_review = nmf_model_df.iloc[i]['review_text']
    print(matched_review + ",_____ ")

   

# Pickles

In [None]:
import pickle 

#pickle.dump(nmf_cleanser_reviews, open("finalized_nmf_model.pkl", 'wb'))

#pickle.dump(cv1, open("countvector.pkl", 'wb'))

#pickle.dump(df_features, open("df_nmf_features.pkl", 'wb'))

#pickle.dump(product_df, open("product_dataframe.pkl", 'wb'))

#pickle.dump(nmf_model_df, open("review_dataframe.pkl", 'wb'))


# Visualizations 

In [None]:
# Number of reviews per dominant topic.
# The frame is built with explicit column names rather than through
# value_counts().reset_index(): from pandas 2.0 on, that call names the
# columns ['dominant_topic', 'count'], so the 'index' column used below (and
# by the chart cell) would not exist. The layout produced here matches the
# older pandas output: 'index' = topic id, 'dominant_topic' = review count.
topic_counts = nmf_model_df['dominant_topic'].value_counts()
topic_df = pd.DataFrame({
    'index': topic_counts.index,
    'dominant_topic': topic_counts.values,
})

# Replace topic numbers with hand-written labels.
topic_df['index'] = topic_df['index'].replace({
    4: 'Topic: Acne',
    3: 'Topic: General Cleansing',
    1: 'Topic: Dry Skin',
    0: 'Topic: Make-up Removing',
    2: 'Topic: Oily Skin',
})

topic_df

In [None]:
import altair as alt
import seaborn as sns


# Horizontal bar chart built from topic_df: the 'dominant_topic' column is on
# the x axis and the 'index' column on the y axis, with the y categories
# sorted by descending x value.
# NOTE(review): this expects topic_df to carry the per-topic counts in
# 'dominant_topic' and the topic labels in 'index' — confirm against the cell
# that builds topic_df.
bars = alt.Chart(topic_df).mark_bar().encode(
   x='dominant_topic',
   y=alt.Y('index', sort='-x')
    )
# Text layer derived from the bar layer, printing the 'dominant_topic' value
# next to each bar.
# NOTE: this rebinds the global name `text`, which earlier in the notebook
# held the example user query string.
text = bars.mark_text(
    align='left',
    baseline='bottom',
    fontSize = 10,
    dx=3  # Nudges text to right so it doesn't appear on top of the bar
).encode(
    text='dominant_topic'
)

# Layer bars and labels into one chart and set its size.
(bars + text).properties(height=400, width = 650)
