In [1]:
import pandas as pd
import sqlite3 as sq
import numpy as np
import surprise

## Import data from processed database


In [2]:
#Set up data
path = '/db/wrangled_reviews.db'
def import_data(db_path):
    '''
    Load the review columns used in this notebook from the SQLite database.
    Required Input -
        - db_path = path to the SQLite file containing the `processed` table
    Expected Output -
        - Pandas DataFrame with one row per row of `processed`, where
          star_rating and helpful_votes are converted to integers
    '''
    query = '''
                SELECT customer_id, product_id, product_title, product_parent, star_rating, helpful_votes,
                review_word_count, review_hl_count, cleaned_sentiment_star_rating, difference
                FROM processed
                ;
                '''

    conn = sq.connect(db_path)
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        # Always release the database handle, even if the query fails.
        conn.close()

    # Ratings go through float first (presumably because they may be stored
    # as text such as '5.0' -- TODO confirm) and are then truncated to int.
    df['star_rating'] = df['star_rating'].astype(float).astype(int)
    df['helpful_votes'] = df['helpful_votes'].astype(int)  # vote counts as integers

    return df

df = import_data(path)

In [3]:
df.head(5)

Unnamed: 0,customer_id,product_id,product_title,product_parent,star_rating,helpful_votes,review_word_count,review_hl_count,cleaned_sentiment_star_rating,difference
0,40676812,1938067126,Crimes of the Educators: How Utopians Are Usin...,402004849,5,0,1,2,5.0,0.0
1,2784618,014017737X,The Pearl,779170984,5,0,8,2,3.0,2.0
2,2876528,0982207743,Primal Blueprint Quick and Easy Meals: Delicio...,225126623,3,1,10,2,2.0,1.0
3,33678379,080072433X,Trial Run (Fault Lines),42136245,3,0,277,4,3.00942,0.00942
4,32159651,0615815650,Methods of Persuasion: How to Use Psychology t...,625464646,3,1,22,4,2.636364,0.363636


In [None]:
len(df)
df.dtypes

# YellowBrick Viz SKIP


In [None]:
from yellowbrick.features import Rank2D
%matplotlib inline

# Rank features pairwise with the Pearson algorithm and draw the result.
# NOTE(review): df still contains text columns such as product_id and
# product_title -- confirm Rank2D accepts them or pass only numeric columns.
visualizer = Rank2D(algorithm="pearson")
visualizer.fit_transform(df)
visualizer.poof()

In [None]:
from yellowbrick.features import JointPlotVisualizer

# Joint plot of the star rating against the `difference` column loaded by
# import_data, to see how the two vary together.
visualizer = JointPlotVisualizer(feature='star_rating', target='difference')
visualizer.fit(df['star_rating'], df['difference'])
visualizer.poof()

In [None]:
from yellowbrick.features import JointPlotVisualizer

# Joint plot of the raw sentiment rating against the cleaned sentiment rating.
# NOTE(review): X_dat is not defined in any cell of this notebook, and
# 'sentiment_star_rating' is not among the columns selected by import_data,
# so this cell fails on a fresh kernel -- confirm where X_dat should come from.
visualizer = JointPlotVisualizer(feature='sentiment_star_rating', target='cleaned_sentiment_star_rating')
visualizer.fit(X_dat['sentiment_star_rating'], X_dat['cleaned_sentiment_star_rating'])
visualizer.poof()

In [None]:
from yellowbrick.features import JointPlotVisualizer

# Joint plot of the raw sentiment rating against the star rating, followed by
# their correlation matrix.
# NOTE(review): X_dat is not defined in any cell of this notebook, and
# 'sentiment_star_rating' is not among the columns selected by import_data,
# so this cell fails on a fresh kernel -- confirm where X_dat should come from.
visualizer = JointPlotVisualizer(feature='sentiment_star_rating', target='star_rating')
visualizer.fit(X_dat['sentiment_star_rating'], X_dat['star_rating'])
visualizer.poof()

np.corrcoef(X_dat['sentiment_star_rating'], X_dat['star_rating'])


In [None]:
from yellowbrick.features import JointPlotVisualizer

# Joint plot of the cleaned sentiment rating against the star rating.
# Reads from `df` (the frame returned by import_data, which holds both
# columns); the previously referenced `X_dat` is never defined in this notebook.
visualizer = JointPlotVisualizer(feature='cleaned_sentiment_star_rating', target='star_rating')
visualizer.fit(df['cleaned_sentiment_star_rating'], df['star_rating'])
visualizer.poof()

# 2x2 correlation matrix of the two columns; shown as the cell output.
np.corrcoef(df['cleaned_sentiment_star_rating'], df['star_rating'])


In [None]:
from yellowbrick.features import JointPlotVisualizer

# Joint plot of the `difference` column against the star rating.
# Reads from `df` (the frame returned by import_data, which holds both
# columns); the previously referenced `X_dat` is never defined in this notebook.
visualizer = JointPlotVisualizer(feature='difference', target='star_rating')
visualizer.fit(df['difference'], df['star_rating'])
visualizer.poof()

# 2x2 correlation matrix of the two columns; shown as the cell output.
np.corrcoef(df['difference'], df['star_rating'])

## Clustering  SKIP

In [None]:
from sklearn.cluster import MiniBatchKMeans

from yellowbrick.cluster import KElbowVisualizer

# Instantiate the clustering model and visualizer
# k=(4,12) is the range of cluster counts handed to the elbow visualizer.
visualizer = KElbowVisualizer(MiniBatchKMeans(), k=(4,12))

# NOTE(review): X_dat is not defined in any cell of this notebook, so this
# cell fails on a fresh kernel -- confirm which feature matrix is intended.
visualizer.fit(X_dat) # Fit the training data to the visualizer
visualizer.poof() # Draw/show/poof the data

In [None]:
from sklearn.cluster import MiniBatchKMeans

from yellowbrick.cluster import SilhouetteVisualizer

# Instantiate the clustering model and visualizer
# 7 is passed as the first positional argument of MiniBatchKMeans
# (presumably the cluster count picked from the elbow plot -- TODO confirm).
model = MiniBatchKMeans(7)
visualizer = SilhouetteVisualizer(model)

# NOTE(review): X_dat is not defined in any cell of this notebook, so this
# cell fails on a fresh kernel -- confirm which feature matrix is intended.
visualizer.fit(X_dat) # Fit the training data to the visualizer
visualizer.poof() # Draw/show/poof the data

# Modeling in scikit-learn

### SVD 

In [None]:
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
# Customer x title table of star ratings; pairs without a rating become 0.
df_pivot = df.pivot_table(index='customer_id',columns='product_title',values='star_rating',fill_value=0)
# Transpose so each row is a title and each column a customer.
X = df_pivot.T
# Reduce every title to 12 components; the fixed random_state keeps runs repeatable.
# NOTE(review): the name SVD is rebound later by `from surprise import SVD`,
# so this cell and the Surprise cells interfere if run out of order.
SVD = TruncatedSVD(n_components=12, random_state=17)
matrix = SVD.fit_transform(X)
# Correlation between the component vectors of every pair of titles.
corr = np.corrcoef(matrix)
# Titles in the same order as the rows of `matrix` and `corr`.
book_title = df_pivot.columns

In [None]:
from joblib import dump, load
dump((corr,book_title), 'svd_mod2.joblib', compress=True) 

In [None]:
def print_recs(book_title, corr, title, n_recs=5):
    '''
    Print the titles whose correlation with `title` is highest.
    Required Input -
        - book_title = titles in the same order as the rows of corr
                       (anything exposing .tolist(), e.g. the pivot-table columns)
        - corr = square matrix of title-to-title correlations
        - title = title to find recommendations for
        - n_recs (optional) = number of titles to print
    Expected Output -
        - None; prints a list of up to n_recs titles, ordered from least
          to most correlated
    Raises ValueError if title is not in book_title.
    '''
    book_list = book_title.tolist()
    book_title = np.asarray(book_title)

    book_idx = book_list.index(title)
    corr_target = np.asarray(corr[book_idx], dtype=float)
    # argsort places NaN after every number, so undefined correlations would
    # otherwise be reported as the best matches; rank them lowest instead.
    corr_target = np.where(np.isnan(corr_target), -np.inf, corr_target)

    ranked = np.argsort(corr_target, kind='stable')
    # Exclude the query title by position instead of assuming it sorts last,
    # which fails when another title ties with it.
    ranked = ranked[ranked != book_idx]

    # Take the n best from the top, then restore ascending order for printing.
    top_idx = ranked[::-1][:max(n_recs, 0)][::-1]
    top_values = [book_title[i] for i in top_idx]
    print(top_values)


print_recs(book_title, corr, "The Stand")

### NMF


In [None]:
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF
# Customer x title table of star ratings; pairs without a rating become 0.
df_pivot = df.pivot_table(index='customer_id',columns='product_title',values='star_rating',fill_value=0)
# Transpose so each row is a title and each column a customer.
X = df_pivot.T
# Fixed random_state (same seed as the TruncatedSVD cell) so the factorization
# and the recommendations derived from it are repeatable across runs.
NMFmod = NMF(n_components=12, random_state=17)
matrix = NMFmod.fit_transform(X)
# Correlation between the component vectors of every pair of titles.
corr = np.corrcoef(matrix)
# Titles in the same order as the rows of `matrix` and `corr`.
book_title = df_pivot.columns

In [None]:
def print_recs(book_title, corr, title, n_recs=5):
    '''
    Print the titles whose correlation with `title` is highest.
    Required Input -
        - book_title = titles in the same order as the rows of corr
                       (anything exposing .tolist(), e.g. the pivot-table columns)
        - corr = square matrix of title-to-title correlations
        - title = title to find recommendations for
        - n_recs (optional) = number of titles to print
    Expected Output -
        - None; prints a list of up to n_recs titles, ordered from least
          to most correlated
    Raises ValueError if title is not in book_title.
    '''
    book_list = book_title.tolist()
    book_title = np.asarray(book_title)

    book_idx = book_list.index(title)
    corr_target = np.asarray(corr[book_idx], dtype=float)
    # argsort places NaN after every number, so undefined correlations would
    # otherwise be reported as the best matches; rank them lowest instead.
    corr_target = np.where(np.isnan(corr_target), -np.inf, corr_target)

    ranked = np.argsort(corr_target, kind='stable')
    # Exclude the query title by position instead of assuming it sorts last,
    # which fails when another title ties with it.
    ranked = ranked[ranked != book_idx]

    # Take the n best from the top, then restore ascending order for printing.
    top_idx = ranked[::-1][:max(n_recs, 0)][::-1]
    top_values = [book_title[i] for i in top_idx]
    print(top_values)


print_recs(book_title, corr, "The Stand")

# Modeling in LightFM
Will allow for incorporation of product metadata!

In [4]:
def create_interaction_matrix(df,user_col, item_col, rating_col, norm= False, threshold = None):
    '''
    Function to create an interaction matrix dataframe from transactional type interactions
    Required Input -
        - df = Pandas DataFrame containing user-item interactions
        - user_col = column name containing user's identifier
        - item_col = column name containing item's identifier
        - rating col = column name containing user feedback on interaction with a given item
        - norm (optional) = True if the ratings should be turned into 0/1 flags
        - threshold (required if norm = True) = value above which the rating is favorable
    Expected output - 
        - Pandas dataframe indexed by user, one column per item, holding the summed
          rating of each user-item pair (0 where there is no interaction), or 1/0
          flags when norm = True
    Raises ValueError if norm is True and no threshold is given.
    '''
    if norm and threshold is None:
        # Fail up front with a clear message instead of a comparison error
        # deep inside the element-wise step.
        raise ValueError("threshold is required when norm=True")

    # Sum repeated ratings of the same user-item pair, spread items across
    # the columns, and mark missing pairs with 0.
    interactions = (
        df.groupby([user_col, item_col])[rating_col]
        .sum()
        .unstack()
        .fillna(0)
    )
    if norm:
        # 1 where the rating is strictly above the threshold, 0 otherwise.
        interactions = (interactions > threshold).astype(int)
    return interactions


def create_user_dict(interactions):
    '''
    Function to map every user id to its row position in the interaction dataset
    Required Input - 
        interactions - dataset create by create_interaction_matrix
    Expected Output -
        user_dict - Dictionary type output containing user_id as key and the
                    0-based row position of that user as value
    '''
    row_labels = list(interactions.index)
    # Pair each label with its position; a repeated label keeps its last position.
    return dict(zip(row_labels, range(len(row_labels))))
    
def create_item_dict(df,id_col,name_col):
    '''
    Function to create an item dictionary based on their item_id and item name
    Required Input - 
        - df = Pandas dataframe with Item information
        - id_col = Column name containing unique identifier for an item
        - name_col = Column name containing name of the item
    Expected Output -
        item_dict = Dictionary type output containing item_id as key and item_name as value
                    (when an id occurs more than once, the name from its last row is kept)
    '''
    # Pair the two columns positionally: a single pass that does not depend
    # on the frame having the default 0..n-1 index.
    return dict(zip(df[id_col], df[name_col]))


def runMF(interactions, n_components=30, loss='warp', k=15, epoch=30,n_jobs = 4):
    '''
    Function to train a LightFM matrix-factorization model on the interaction data
    Required Input -
        - interactions = dataset create by create_interaction_matrix
        - n_components = number of embeddings you want to create to define Item and user
        - loss = loss function handed to LightFM (other options include logistic, bpr)
        - k = forwarded unchanged as LightFM's k argument
        - epoch = number of epochs to run 
        - n_jobs = number of threads handed to fit
    Expected Output  -
        Model - Trained model
    '''
    # The dense frame values are wrapped in a CSR matrix before fitting.
    sparse_interactions = csr_matrix(interactions.values)
    model = LightFM(
        no_components=n_components,
        loss=loss,
        k=k,
    )
    model.fit(
        sparse_interactions,
        epochs=epoch,
        num_threads=n_jobs,
    )
    return model

def create_item_emdedding_distance_matrix(model,interactions):
    '''
    Function to create the item-item similarity matrix from the item embeddings
    Required Input -
        - model = Trained matrix factorization model
        - interactions = dataset used for training the model
    Expected Output -
        - item_emdedding_distance_matrix = Pandas dataframe of cosine similarities b/w items,
          labelled on both axes with the item columns of interactions
    '''
    item_labels = interactions.columns
    embedding_matrix = csr_matrix(model.item_embeddings)
    # One row and one column per item, so the frame is n_items x n_items.
    return pd.DataFrame(
        cosine_similarity(embedding_matrix),
        index=item_labels,
        columns=item_labels,
    )

def item_item_recommendation(item_emdedding_distance_matrix, item_id, 
                             item_dict, n_items = 10, show = True):
    '''
    Function to create item-item recommendation
    Required Input - 
        - item_emdedding_distance_matrix = Pandas dataframe of item-item similarity scores
          (higher = more similar), labelled with item ids on both axes
        - item_id  = item ID for which we need to generate recommended items
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - n_items = Number of items needed as an output
        - show = True to also print the item of interest and the recommendations
    Expected Output -
        - recommended_items = List of up to n_items item ids, most similar first,
          never containing item_id itself
    '''
    scores = item_emdedding_distance_matrix.loc[item_id, :]
    # Remove the query item by label rather than skipping whatever happens to
    # rank first: another item can tie with it and push it out of first place.
    scores = scores.drop(labels=item_id, errors='ignore')
    recommended_items = list(scores.sort_values(ascending = False).head(n_items).index)
    if show:
        print("Item of interest :{0}".format(item_dict[item_id]))
        print("Item similar to the above item:")
        for rank, rec_id in enumerate(recommended_items, start=1):
            # format() also copes with item names that are not strings.
            print("{0}- {1}".format(rank, item_dict[rec_id]))
    return recommended_items

In [5]:
# Creating interaction matrix using rating data
interactions = create_interaction_matrix(df = df,
                                         user_col = 'customer_id',
                                         item_col = 'product_id',
                                         rating_col = 'star_rating')
interactions.head()

product_id,0002250519,000713326X,0007149832,0007162219,0007236360,0007256817,0007320817,0007398557,0007446977,0007447868,...,B00EC8YBD8,B00ERNP2JU,B00ES27PRM,B00FY2QHU6,B00IID8Z12,B00JKSTQE4,B00MNM6EKI,B00MPREWL4,B00N4FLH82,B00QQ1RFEG
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
96533,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
421085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
543134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
565775,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
705489,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Create User Dict: customer_id -> row position in the interaction matrix
user_dict = create_user_dict(interactions=interactions)
# Create Item dict: product_id -> product_title
# (named movies_dict, although the titles shown above look like books)
movies_dict = create_item_dict(df = df,
                               id_col = 'product_id',
                               name_col = 'product_title')

In [7]:
# runMF looks up csr_matrix and LightFM as globals when it is called, so these
# imports have to run before the call below.
from scipy.sparse import csr_matrix
from lightfm import LightFM
# Train on the customer x product interaction matrix; k keeps runMF's default.
mf_model = runMF(interactions = interactions,
                 n_components = 30,
                 loss = 'warp',
                 epoch = 30,
                 n_jobs = 4)

In [8]:
## Creating item-item similarity matrix (cosine similarity of the item embeddings)
# cosine_similarity is looked up as a global inside the helper, hence the import here.
# NOTE(review): the recorded output of this cell is a MemoryError. The helper
# builds a frame with one row and one column per product, so its size grows
# with the square of the number of products.
from sklearn.metrics.pairwise import cosine_similarity
item_item_dist = create_item_emdedding_distance_matrix(model = mf_model,
                                                       interactions = interactions)

MemoryError: 

In [None]:
## Calling 10 recommended items for one item id
# NOTE(review): item_id is the integer 5378, while the interaction-matrix
# header shown earlier lists product ids such as 0002250519 and B00EC8YBD8 --
# confirm this label exists in item_item_dist and movies_dict.
rec_list = item_item_recommendation(item_emdedding_distance_matrix = item_item_dist,
                                    item_id = 5378,
                                    item_dict = movies_dict,
                                    n_items = 10)

# Modeling in Surprise: works best for user-item data with no metadata


In [None]:
from surprise import Reader, Dataset

# To load a dataset from a pandas DataFrame we use Surprise's
# `Dataset.load_from_df` method.

ratings_dict = {'itemID': list(df.product_title),
                'userID': list(df.customer_id),
                'rating': list(df.star_rating)}
# Keep the ratings in their own frame instead of rebinding `df`: overwriting
# `df` would make this cell fail when run a second time (product_title would
# be gone) and would break every earlier cell that is re-run afterwards.
ratings_df = pd.DataFrame(ratings_dict)

# A reader is still needed but only the rating_scale param is required.
# The Reader class is used to parse a file containing ratings.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(ratings_df[['userID', 'itemID', 'rating']], reader)


In [None]:
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
from collections import defaultdict

from surprise import SVD
from surprise import Dataset


def get_top_k(predictions, k):
    '''Group predictions by user and keep each user's k best estimates.

    Takes in a list of 5-field predictions (user id, item id, true rating,
    estimated rating, extra details) as returned by the test method.

    Returns a defaultdict where keys are user ids and values are lists of
    tuples [(item id, rating estimation) ...], highest estimate first.
    '''

    per_user = defaultdict(list)
    for prediction in predictions:
        # Only the user, the item and the estimate are needed.
        uid, iid, _true_r, est, _details = prediction
        per_user[uid].append((iid, est))

    # Replace every user's list with its k highest-estimate entries.
    for uid in list(per_user):
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:k]

    return per_user

In [None]:
# Train on every rating in `data`; nothing is held out.
trainset = data.build_full_trainset()

# SVD here is the class imported from surprise two cells above.
# NOTE(review): the scikit-learn section binds the same name to a TruncatedSVD
# instance, so the two sections interfere if cells are run out of order.
algo = SVD()
algo.fit(trainset)

In [None]:
# We are here testing on the WHOLE dataset. Which means that all the ratings we
# are predicting are already known, but it does not really matter.
testset = data.construct_testset(raw_testset=data.raw_ratings)
predictions = algo.test(testset)
#accuracy.rmse(predictions, verbose=True)  # ~ 0.68 (which is low)

#print(predictions)




In [None]:
# Keep the 5 highest-estimate items for every user.
# NOTE(review): `predictions` was computed on the ratings the model was trained
# on, so every listed item is one the user has already rated.
top_k = get_top_k(predictions, 5)

# Print the recommended items
for uid, user_ratings in top_k.items():
    print(uid, [iid for (iid, _) in user_ratings])



# Compute the total number of recommended items.
# (a set, so an item recommended to several users is counted once)
all_recommended_items = set(iid for (_, user_ratings) in top_k.items() for
                            (iid, _) in user_ratings)

print('Number of recommended items:', len(all_recommended_items), 'over',
      len(top_k), 'users')