In [22]:
# Import necessary packages
import numpy as np
import pandas as pd

from operator import add
from pyspark.ml.feature import RegexTokenizer, CountVectorizer
from pyspark.ml.feature import StopWordsRemover, VectorAssembler
from pyspark.ml.feature import Word2Vec, Word2VecModel, HashingTF
from pyspark.ml.feature import IDF

from pyspark.ml import Pipeline, PipelineModel
import pyspark.sql.functions as F

from pyspark.sql.functions import col

from pyspark.sql.functions import *
import pyspark.sql.functions as F
from pyspark.sql.functions import mean as _mean, stddev as _stddev, col
from pyspark.sql.functions import lit

from pyspark.sql.types import *

from scipy.spatial import distance
import scipy.stats

import warnings
warnings.filterwarnings("ignore")

# Data Import

In [2]:
df = spark.read \
    .option("quote", "\"")  \
    .option("escape", "\"") \
    .option("ignoreLeadingWhiteSpace",True) \
    .csv("/user/klaurens/project/project/amazon_reviews_us_Digital_Video_Download_v1_00.tsv",inferSchema=True,header=True, sep='\t') 

In [3]:
df = df.select('product_id', 'star_rating', 'product_category', 'product_title', 'customer_id', 'review_headline', 'review_body', 'helpful_votes', 'total_votes')

# Data Cleaning 
Following agreed procedures, we clean the data.

In [4]:
df.dropna()

DataFrame[product_id: string, star_rating: int, product_category: string, product_title: string, customer_id: int, review_headline: string, review_body: string, helpful_votes: int, total_votes: int]

In [5]:
df2 = df.select([count(when(col(c).contains('None') | \
                            col(c).contains('NULL') | \
                            (col(c) == '' ) | \
                            col(c).isNull() | \
                            isnan(c), c 
                           )).alias(c)
                    for c in df.columns])
df2.count()
df2.show(5, vertical = True)

-RECORD 0----------------
 product_id       | 19   
 star_rating      | 0    
 product_category | 0    
 product_title    | 195  
 customer_id      | 0    
 review_headline  | 600  
 review_body      | 4460 
 helpful_votes    | 0    
 total_votes      | 0    



In [5]:
df = df.filter(col('total_votes') > 10)

In [6]:
df = df.fillna("", "review_body")
df = df.fillna("", "review_headline")

In [8]:
df = df.withColumn('review_text', F.concat('review_headline', F.lit(" "), 'review_body'))
# df.show(1, vertical = True, truncate = False)

In [9]:
df.filter((col("review_text").isNull()) | ( col("review_text") == "")).show(1, vertical = True, truncate = False)

(0 rows)



In [11]:
df.select([count(when((col(c) == '' ) | \
                            col(c).isNull() | \
                            isnan(c), c 
                           )).alias(c)
                    for c in df.columns]).show(5, vertical = True)

-RECORD 0---------------
 product_id       | 0   
 star_rating      | 0   
 product_category | 0   
 product_title    | 0   
 customer_id      | 0   
 review_headline  | 1   
 review_body      | 23  
 helpful_votes    | 0   
 total_votes      | 0   
 review_text      | 0   



In [10]:
df = df.withColumn('helpful_ratio', F.col('helpful_votes') / F.col('total_votes'))
# df.show(1, vertical = True, truncate = False)

In [11]:
df.filter(col('helpful_ratio') < 0).count()

0

In [12]:
df = df.withColumn('helpful', when(col("helpful_ratio") < 0.5, 0).otherwise(1))

In [13]:
df.show(2, vertical = True)

-RECORD 0--------------------------------
 product_id       | B01489L5LQ           
 star_rating      | 4                    
 product_category | Digital_Video_Dow... 
 product_title    | After Words          
 customer_id      | 52895410             
 review_headline  | Charming movie       
 review_body      | This movie isn't ... 
 helpful_votes    | 17                   
 total_votes      | 18                   
 review_text      | Charming movie Th... 
 helpful_ratio    | 0.9444444444444444   
 helpful          | 1                    
-RECORD 1--------------------------------
 product_id       | B00SZT6I3G           
 star_rating      | 1                    
 product_category | Digital_Video_Dow... 
 product_title    | The Cicret Bracel... 
 customer_id      | 41521698             
 review_headline  | If it can't be br... 
 review_body      | If it can't be br... 
 helpful_votes    | 11                   
 total_votes      | 18                   
 review_text      | If it can't be

## Cleaning review_text
Using regex, clean the review_text before running models.

In [14]:
df_clean = df

In [15]:
from pyspark.sql.functions import col, lower, regexp_replace, split

def clean_text(c):
    c = lower(c)
    c = regexp_replace(c, "\"", "")
    c = regexp_replace(c, "^rt ", "")
    c = regexp_replace(c, "(https?\://)\S+", "")
    c = regexp_replace(c, "[^a-zA-Z0-9\\s]", "")
    c = regexp_replace(c, "<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});", "")
    return c

In [16]:
df_clean = df_clean.withColumn('review_text',clean_text(col("review_text")).alias("text"))

In [19]:
df_clean.select('review_text').show(5,vertical = True, truncate = False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 review_text | charming movie this movie isnt perfect but it gets a lot of things right yes the librarian character played by marcia gay harden is stereotypical and played a bit heavyhanded but the universal nature of the story the beautiful setting and the likability of the characters overcome this flaw the quote at the end brought tears to my eyes  if you want to take a break from hollywoods standard fare of dark violent or stupid movies then give this a try it is is entertaining and though

In [29]:
df_clean.dtypes

[('product_id', 'string'),
 ('star_rating', 'int'),
 ('product_category', 'string'),
 ('product_title', 'string'),
 ('customer_id', 'int'),
 ('review_headline', 'string'),
 ('review_body', 'string'),
 ('helpful_votes', 'int'),
 ('total_votes', 'int'),
 ('review_text', 'string'),
 ('helpful_ratio', 'double'),
 ('helpful', 'int')]

## Filtering dataset with star rating > 3 and helpful ratio >0.5 
Recommend only products that have great star ratings.
Helpful ratio is calculated by helpful votes/total votes per review and can signal here that the review text is more trustworthy and validated than others. So feature products that only have helpful ratio > 0.5. 

In [16]:
# print(df_clean.select(mean(col('star_rating')).alias('mean')).collect())
df_clean.filter(col('star_rating')>3).count()

15391

In [18]:
df_filtered = df_clean.filter(col('star_rating')>3)

In [19]:
df_filtered= df_filtered.filter(col('helpful_ratio')>0.5)

In [23]:
df_filtered.count()

14316

In [20]:
df_filtered = df_filtered.select('product_id', 'star_rating', 'product_category', 'product_title','customer_id', 'review_text')

In [21]:
df_aggtext = df_filtered.groupby("product_id").agg(F.concat_ws(" ", F.collect_list(df_filtered.review_text)).alias("review_text"))

In [23]:
df_aggtext = df_aggtext.withColumn('product_category', lit('Digital_Video'))

In [26]:
df_aggtext.show(3, vertical = True)

-RECORD 0--------------------------------
 product_id       | B0132TVFL8           
 review_text      | taking the veil o... 
 product_category | Digital_Video        
-RECORD 1--------------------------------
 product_id       | B008O7LJMO           
 review_text      | thank you for thi... 
 product_category | Digital_Video        
-RECORD 2--------------------------------
 product_id       | B00HJ1KOJA           
 review_text      | we are missing th... 
 product_category | Digital_Video        
only showing top 3 rows



# Build nlp pipeline to vectorize review and category

In [24]:
# nlp pipeline
regexTokenizer = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'review_text', outputCol = 'token')
stopWordsRemover = StopWordsRemover(inputCol = 'token', outputCol = 'nostopwrd')
word2Vec = Word2Vec(vectorSize = 100, minCount = 5, inputCol = 'nostopwrd', outputCol = 'word_vec', seed=123)

In [25]:
pipeline = Pipeline(stages=[regexTokenizer, stopWordsRemover, word2Vec])

In [26]:
pipeline_mdl = pipeline.fit(df_aggtext)
df_trans = pipeline_mdl.transform(df_aggtext)

In [34]:
df_trans.select( 'review_text', 'nostopwrd', 'word_vec').show(5)

+--------------------+--------------------+--------------------+
|         review_text|           nostopwrd|            word_vec|
+--------------------+--------------------+--------------------+
|taking the veil o...|[taking, veil, sm...|[0.02734180316187...|
|most fringhtening...|[fringhteningly, ...|[0.04223927862767...|
|challenge is a am...|[challenge, amazo...|[-0.0012853314401...|
|love this show i ...|[love, show, love...|[0.05785441026091...|
|absolutely awesom...|[absolutely, awes...|[-0.0171413748171...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [27]:
# Define the similarity function using three methods
# Decided to go for combination of CosineSim and Pearson 

def CosineSim(vec1, vec2): 
    return np.dot(vec1, vec2) / np.sqrt(np.dot(vec1, vec1)) / np.sqrt(np.dot(vec2, vec2)) 

def Euclidean(vec1, vec2):
    return np.sqrt(np.sum(np.square(vec1-vec2)))
# dst = distance.euclidean(a, b)

def Pearson(vec1, vec2):
    return scipy.stats.pearsonr(vec1, vec2)[0]

def CombDistance(vec1, vec2):
    return (CosineSim(vec1, vec2) + Pearson(vec1, vec2))/2 

In [28]:
reviews_comb = df_trans.select('product_id', 'word_vec', 'product_category').rdd.map(lambda x: (x[0], x[1],x[2])).collect()

In [18]:
reviews_comb[1][0]

'B000IZ8E3M'

In [19]:
reviews_comb[1][1]

DenseVector([-0.0117, 0.0534, 0.0294, 0.0255, 0.0389, 0.0115, 0.0356, 0.0032, 0.0116, 0.0123, 0.0189, -0.0161, 0.0029, 0.0123, -0.0202, 0.0061, -0.0311, -0.0278, -0.0296, 0.0789, 0.0131, -0.0213, 0.0574, -0.0849, 0.0103, -0.0297, 0.0656, 0.0048, -0.0271, -0.0194, -0.0194, -0.0512, 0.0436, 0.0026, -0.0406, 0.0296, -0.0341, 0.028, -0.0087, -0.0271, 0.0058, 0.0023, -0.0421, -0.0013, 0.0049, -0.0515, 0.0161, -0.0589, 0.0004, 0.0433, 0.0323, -0.0312, 0.0354, -0.0053, 0.0058, -0.0256, 0.0301, -0.0445, -0.0042, -0.0133, -0.0179, 0.0387, 0.0968, -0.0292, -0.0137, -0.0543, 0.0112, -0.0203, 0.0735, 0.0229, -0.0428, -0.0598, -0.003, 0.0021, -0.0648, -0.0353, -0.0621, 0.0519, 0.0671, 0.049, -0.0396, 0.0241, 0.0053, -0.0171, -0.0043, -0.0312, -0.01, -0.0141, -0.0452, -0.0394, -0.0202, 0.0323, 0.0078, -0.0564, 0.0777, -0.0123, -0.0227, -0.0968, 0.0154, 0.061])

In [20]:
reviews_comb

[('B000I9VZCU',
  DenseVector([0.0491, 0.0955, 0.002, -0.0214, 0.0212, 0.0355, 0.0439, -0.0338, -0.0389, -0.0743, 0.0947, -0.0501, -0.1283, 0.0592, -0.0719, 0.0354, -0.1215, -0.1018, 0.0437, -0.0943, 0.1468, -0.037, -0.0135, -0.0923, 0.2523, 0.2371, 0.0574, 0.0486, -0.0201, -0.188, 0.0089, -0.0345, -0.1846, 0.31, 0.057, 0.0516, 0.0299, -0.0434, 0.096, -0.1565, -0.0817, -0.01, 0.096, -0.0114, -0.0003, 0.0195, -0.1443, 0.0659, -0.0076, -0.008, -0.171, -0.0165, -0.1254, 0.0447, 0.1486, 0.1624, -0.0459, -0.1025, 0.0394, 0.0467, -0.0725, -0.0411, -0.2131, -0.1642, 0.0756, -0.1011, 0.0337, -0.0122, -0.0214, 0.0135, -0.0814, 0.0676, 0.0253, -0.0627, 0.0431, -0.0051, 0.0038, 0.1099, -0.0189, -0.0166, -0.0278, 0.0632, 0.0662, -0.0826, -0.0491, 0.0155, -0.1265, -0.0681, -0.1892, -0.0141, 0.1146, 0.0376, 0.0418, -0.1424, 0.0472, 0.0338, 0.1443, 0.1203, -0.1426, 0.1122]),
  'Digital_Video'),
 ('B000IZ8E3M',
  DenseVector([-0.0117, 0.0534, 0.0294, 0.0255, 0.0389, 0.0115, 0.0356, 0.0032, 0.0116, 0.0

# Keyword Search Based Recommendation

In [41]:
def getKeyWordsRecoms(key_words, limit):
        
    keyword_df = sc.parallelize([(0, key_words)]).toDF(['product_id', 'review_text'])
    
    # transform the the key words to vectors
    keyword_df = pipeline_mdl.transform(keyword_df)
    
    # choose word2vec vectors
    keyword_vec = keyword_df.select('word_vec').collect()[0][0]
    
    sim_prod = sc.parallelize((i[0], float(CombDistance(keyword_vec, i[1]))) for i in reviews_comb)
    
    print('\nFor "' + key_words + '"', 'here are product recommendations.')
    return sim_prod 
     

### Initially tried to compare different methods to compute similarities between keyword vector and review vector
Commonly used distance metrics are cosine, euclidean and pearson but euclidean has distance not in the same scale as others. Used average of cosine and pearson distance, the similarity for the vectors is determined. 

### Cosine 

In [37]:
key_words = 'fun'

keywords_recom_df = getKeyWordsRecoms(key_words, 10)
keywords_recom_pd = keywords_recom_df.toDF(['product_id', 'score']).toPandas()
keywords_recom_pd.sort_values(by='score', ascending=False).head(5)


For "fun" here are product recommendations.


Unnamed: 0,product_id,score
7404,B0013F1D02,0.891639
4295,B0041GKKNG,0.80354
5723,B00Q55646I,0.795012
7546,B0095R7LLK,0.781599
2809,B0029AAXAG,0.773123


### Euclidean

In [40]:
key_words = 'fun'

keywords_recom_df = getKeyWordsRecoms(key_words, 10)
keywords_recom_pd = keywords_recom_df.toDF(['product_id', 'score']).toPandas()
keywords_recom_pd.sort_values(by='score', ascending=False).head(5)


For "fun" here are product recommendations.


Unnamed: 0,product_id,score
3813,B008RKIBGA,2.574751
735,B008Y78AO4,2.574751
2358,B00ENYKBD0,2.574751
1734,B0043LOGCU,2.574751
5827,B014F4PCAE,2.397516


### Pearson

In [35]:
key_words = 'fun'

keywords_recom_df = getKeyWordsRecoms(key_words, 10)
keywords_recom_pd = keywords_recom_df.toDF(['product_id', 'score']).toPandas()
keywords_recom_pd.sort_values(by='score', ascending=False).head(5)


For "fun" here are product recommendations.


Unnamed: 0,product_id,score
7404,B0013F1D02,0.889682
4295,B0041GKKNG,0.810875
7546,B0095R7LLK,0.789697
5723,B00Q55646I,0.78785
2809,B0029AAXAG,0.774059


### Combined approach uses the mean score of both cosine and pearson methods

### Combined

In [42]:
key_words = 'fun'

keywords_recom_df = getKeyWordsRecoms(key_words, 10)
keywords_recom_pd = keywords_recom_df.toDF(['product_id', 'score']).toPandas()
keywords_recom_pd.sort_values(by='score', ascending=False).head(5)


For "fun" here are product recommendations.


Unnamed: 0,product_id,score
7404,B0013F1D02,0.890661
4295,B0041GKKNG,0.807207
5723,B00Q55646I,0.791431
7546,B0095R7LLK,0.785648
2809,B0029AAXAG,0.773591


In [43]:
key_words = 'emotional nostalgic'

keywords_recom_df = getKeyWordsRecoms(key_words, 10)
keywords_recom_pd = keywords_recom_df.toDF(['product_id', 'score']).toPandas()
keywords_recom_pd.sort_values(by='score', ascending=False).head(5)


For "emotional nostalgic" here are product recommendations.


Unnamed: 0,product_id,score
5972,B00O2KLZBC,0.727005
5719,B00LM5KY6C,0.714798
5828,B008Y6OS64,0.70609
7874,B00LM4XJ38,0.702677
6157,B009VB7NWM,0.693747


# Based on product_id, finds similar products based on review_text analysis

In [47]:
def getSimilarProduct(product_id, limit=10):
    
    #Create a schema to store data
    schema = StructType([   
                            StructField("product_id", StringType(), True)
                            ,StructField("score", IntegerType(), True)
                            ,StructField("product_category", StringType(), True)
                            ,StructField("product_id2", StringType(), True)
                        ])
    
    similar_product_df = spark.createDataFrame([], schema)
    
    for product in product_id:
        
        #Search for target product in already created vector list 
        input_vec = [(r[1]) for r in reviews_comb if r[0] == product][0]

        #Find products that have the most similarities to the target product from the created vector list 
        similar_product_rdd = sc.parallelize((i[0], float(CombDistance(input_vec, i[1])), i[2]) for i in reviews_comb)
        
        #Create product result dataframe 
        similar_product_df = spark.createDataFrame(similar_product_rdd) \
            .withColumnRenamed('_1', 'product_id') \
            .withColumnRenamed('_2', 'score') \
            
        # Get product list with similarity scores, which does not resemble input products
        # Sorting based on scores here given nature of pyspark distributed data 
        similar_product_df = similar_product_df.filter(col("product_id") != product).sort('score',ascending=False).limit(limit)
        similar_product_df = similar_product_df.withColumn('product_id2', lit(product))

    return similar_product_df

## Test on two products 

In [45]:
product_ids = ['B0142KKMQU','B00F3WRIVY']

In [48]:
getSimilarProduct(product_ids).toPandas().sort_values(by='score', ascending=False)

Unnamed: 0,product_id,score,_3,product_id2
0,B0041GKKNG,0.860634,Digital_Video,B00F3WRIVY
1,B00AVRZG24,0.849858,Digital_Video,B00F3WRIVY
2,B00BE013YC,0.843139,Digital_Video,B00F3WRIVY
3,B008GTH6LI,0.84086,Digital_Video,B00F3WRIVY
4,B00BE019PA,0.840756,Digital_Video,B00F3WRIVY
5,B00Z89MIZ8,0.838077,Digital_Video,B00F3WRIVY
6,B0095R7LLK,0.835564,Digital_Video,B00F3WRIVY
7,B00FX9AQE8,0.83425,Digital_Video,B00F3WRIVY
8,B00ECSISGO,0.831101,Digital_Video,B00F3WRIVY
9,B0044D5YEG,0.826964,Digital_Video,B00F3WRIVY


## User-profile dependent product recommendation based on products' reviews
Content-based filtering methods are based on featuralization of items and user's information. It is best suited to problem with known data on items (in this case, reviews) and how the user historically interact with it. 

Based on products user reviewed on, find recommendable products. 

In [62]:
def ProductRecoms(u_id, limit=10):
    
    #Search for user
    df_user = df_filtered.filter(df_filtered['customer_id'] == u_id)
    #Find products that user reviewed on 
    usr_rate_product = df_user.select('product_id', 'product_title').distinct()
    #Store product list into the list 
    product_list = [i.product_id for i in usr_rate_product.collect()]
    
    print('User previously reviewed the following products:')
    
    user_prods = getProductTitle(product_list).select('product_id','product_title').toPandas()
    
    display(user_prods)
    
    #Find products similar to the products user previously reviewed on 
    sim_product = getSimilarProduct(product_list, limit)
    
    #Filter out those have been reviewd before by the user
    p = sim_product.alias("p")
    u = usr_rate_product.alias("u")
    
    j = p.join(u, col("p.product_id") == col("u.product_id"), 'left_outer').where(col("u.product_id").isNull()).select([col('p.product_id'),col('p.score')])
    
    result = j.orderBy("score", ascending = False).limit(limit)
    
    #Get product ids from final result 
    p_array = [row.product_id for row in result.collect()]
    
    #Get product titles for the recommended products 
    prods = getProductTitle(p_array).select('product_id','product_title').toPandas()
    
    print("Recommended products based on user's previous behavior:")
    display(result.toPandas().merge(prods, on = 'product_id'))
    

In [52]:
#Test getProductTile function result 
getProductTitle(['B00TPJEJQS','B0053PU034']).show()

+----------+----------------+
|product_id|   product_title|
+----------+----------------+
|B00TPJEJQS|   The Physician|
|B0053PU034|Against the Tide|
+----------+----------------+



In [50]:
import functools
from pyspark.sql import DataFrame

#Function to get product titles 
def getProductTitle(p_id):
    prod_list=df_filtered.filter(col('product_id').isin(p_id)).select('product_id', 'product_title').distinct()
    return prod_list

In [58]:
df_filtered.select('customer_id').show(3)

+-----------+
|customer_id|
+-----------+
|   52895410|
|   39817566|
|   41236720|
+-----------+
only showing top 3 rows



In [56]:
#Find products for a user '52895410'
ProductRecoms(52895410)

User previously reviewed the following products:


Unnamed: 0,product_id,product_title
0,B01489L5LQ,After Words


Recommended products based on user's previous behavior:


Unnamed: 0,product_id,score,product_title
0,B00JGMIU4G,0.904484,In The Blood
1,B00TTRBYGY,0.899473,A Girl Walks Home Alone At Night (English Subt...
2,B00932AA5G,0.890527,Headhunters
3,B00T9DO6RC,0.887819,The Voices
4,B00T9DLXYQ,0.887094,Jupiter Ascending
5,B009O8JSSO,0.884902,28 Hotel Rooms
6,B00LTMIHVW,0.880586,Under the Skin
7,B003E48UMY,0.873963,The Republic of Love
8,B00M7VB4OG,0.87345,Saving Grace
9,B00SY9MEX0,0.871544,Suburban Gothic


DataFrame[product_id: string, score: double]