In [1]:
dbfs_dir = 's3://projectnetflix/'
training = dbfs_dir + 'TrainingRatings.txt'
testing = dbfs_dir + 'TestingRatings.txt'

<h4> Defining the dataframe schema </h4>
<p> The test and train schema are the same thus, defining a single schema </p>

In [2]:
from pyspark.sql.types import *

df_schema = StructType(
  [StructField('movieID', IntegerType()),
   StructField('userID', IntegerType()),
   StructField('rating', DoubleType())]
)

<h4> Reading the data </h4>
<p>Creating dataframes with predefined schema for test and train data given to us initially <br>
And caching them for future use</p>

In [3]:
training_data = sqlContext.read.format('csv').options(header=False, inferSchema=False).schema(df_schema).load(training)
testing_data = sqlContext.read.format('csv').options(header=False, inferSchema=False).schema(df_schema).load(testing)

training_data.cache()
testing_data.cache()

print('Total train data = ', training_data.count())
print('Total test data = ', testing_data.count())

                                                                                

Total train data =  3255352
Total test data =  100478


<h4> Creating a sample </h4>
<p>Randomly selecting 20% of the data to create sample sets which can be used <br>
    for quicker analysis of the test and train datasets</p>

In [4]:
from pyspark.sql.functions import rand

sample_train = training_data.orderBy(rand(seed=5)).limit(651071)
sample_test = testing_data.orderBy(rand(seed=5)).limit(20096)

In [5]:
sample_train.show(4)

[Stage 7:>                                                          (0 + 1) / 1]

+-------+-------+------+
|movieID| userID|rating|
+-------+-------+------+
|  14137|1750812|   2.0|
|  17334| 769887|   3.0|
|    907|1300759|   5.0|
|   3290|2524741|   3.0|
+-------+-------+------+
only showing top 4 rows



                                                                                

In [6]:
sample_test.show(4)

+-------+-------+------+
|movieID| userID|rating|
+-------+-------+------+
|   6408| 116761|   3.0|
|   6281|1944206|   4.0|
|   7511|2487973|   4.0|
|    442|1460499|   4.0|
+-------+-------+------+
only showing top 4 rows



<h4> Problem 2 a)</h4>
<p>In order to be able to evaluate your approach the goal of your project is to predict the 
ratings for all user-item pairs in the test set (TestingRatings.txt). How many distinct 
items and how many distinct users are there in the test set (TestingRatings.txt)?</p>

In [7]:
#Problem 2a)
from pyspark.sql.functions import countDistinct
sample_train.cache()
sample_test.cache()

distinct_items = testing_data.select(countDistinct('movieID'))
print('Distinct items/movies in test set = ', distinct_items.collect()[0][0])

distinct_users = testing_data.select(countDistinct('userID'))
print('Distinct users in test set = ', distinct_users.collect()[0][0])

distinct_items = training_data.select(countDistinct('movieID'))
print('\nDistinct items/movies in training set = ', distinct_items.collect()[0][0])

distinct_users = training_data.select(countDistinct('userID'))
print('Distinct users in training set = ', distinct_users.collect()[0][0])

21/12/14 18:21:44 WARN Utils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.debug.maxToStringFields' in SparkEnv.conf.


Distinct items/movies in test set =  1701
Distinct users in test set =  27555


                                                                                


Distinct items/movies in training set =  1821
Distinct users in training set =  28978


<h4> Problem 2 b)</h4>
<p> Calculate estimated average overlap of items for users and estimated average overlap of users for items. </p>

In [8]:
#Problem 2b)

#Randomly selecting 10 users to consider for calculating overlap - as list of userIDs
randomly_selected_users = list(sample_test.select('userID').distinct().orderBy(rand()).limit(10)\
                               .select('userID').toPandas()['userID'])

#Maintaining an average overlap List for multiple users
avgOverlap = []

for user in randomly_selected_users:
    
    #Fetch all the items that have been rated by given user
    userRatings = sample_train.filter(sample_train.userID == user)
    #print('\nTotal items rated by user', user, ' = ', userRatings.select(countDistinct('movieID')).collect()[0][0])

    #All items rated by given user - as a list
    ratedMovies = list(userRatings.select('movieID').toPandas()['movieID'])
    
    #Counting number of users which rated the same movies/items as selected user
    otherRatingCounts = sample_train.filter(sample_train.movieID.isin(ratedMovies)).groupBy('movieID')\
    .agg(countDistinct('userID').alias('count'))
    
    #Calculating average over all the items of selected user
    itemOverlapForUser = list(otherRatingCounts.select('count').toPandas()['count'])
    avgForUser = sum(itemOverlapForUser)/len(itemOverlapForUser)
    avgOverlap.append(avgForUser)
    #print('Average overlap of items for selected user =', avgForUser)

#Calculating average overlap for multiple users from test set
finalAvgOverlap = sum(avgOverlap)/len(avgOverlap)
print('estimated average overlap of items for users = ', finalAvgOverlap)

#========================

#Randomly selecting 10 items to consider for calculating overlap - as list of movieIDs
randomly_selected_items = list(sample_test.select('movieID').distinct().orderBy(rand()).limit(10)\
                               .select('movieID').toPandas()['movieID'])

#Maintaining an average overlap List for multiple items
avgOverlap = []

for item in randomly_selected_items:
    
    #Fetch all the users that have rated given movie
    userRatings = sample_train.filter(sample_train.movieID == item)
    #print('\nTotal users given rating for item', item, ' = ', userRatings.select(countDistinct('userID')).collect()[0][0])

    #All users that rated given item - as a list
    ratedMovies = list(userRatings.select('userID').toPandas()['userID'])
    
    #Counting number of items which were rated by the same user as selected movie
    otherRatingCounts = sample_train.filter(sample_train.userID.isin(ratedMovies)).groupBy('userID')\
    .agg(countDistinct('movieID').alias('count'))
    
    #Calculating average over all the users of selected item
    itemOverlapForUser = list(otherRatingCounts.select('count').toPandas()['count'])
    avgForUser = sum(itemOverlapForUser)/len(itemOverlapForUser)
    avgOverlap.append(avgForUser)
    #print('Average overlap of user for selected item =', avgForUser)

#Calculating average overlap for multiple items from test set
finalAvgOverlap = sum(avgOverlap)/len(avgOverlap)
print('estimated average overlap of users for items = ', finalAvgOverlap)

                                                                                

estimated average overlap of items for users =  1872.3651925465838
estimated average overlap of users for items =  44.29447349630012


<h3> Problem 2c) </h3>
<p> From 2a :
<ul>
    <li>Distinct items/movies in test set =  1701</li>
    <li>Distinct users in test set =  27555</li>
    <br>
    <li>Distinct items/movies in training set =  1821</li>
    <li>Distinct users in training set =  28978</li>
</ul>
Since number of items << number of users,<br>
it would be expected that the overlap of items >> overlap of users<br>
This is because the SAME items could have been rated by MANY users<br>
however, the SAME users might or might not have rated the SAME items<br>

From 2b :
<ul>
    <li>estimated average overlap of items for users ~ 2287.272</li>
    <li>estimated average overlap of users for items ~  34.645</li>
</ul>

As expected, the overlap of items is much higher than the overlap of users<br>

From the calculations, <strong>higher overlap of items</strong>
indicates the data has many <strong>similar users</strong>.<br>
Thus, the predictions would likely be way more accurate if we were to use the <strong>user-user</strong> model
</p>

<h4> Pearson correlation </h4>
<p>item-item model</p>
<sub>Unable to pivot data to be able to perfom user-user model - because of VERY HIGH number of users</sub>

In [9]:
#Grouping data by userID and maintaining movie rating for each user
item_item_train = (training_data.groupBy('userID').pivot('movieID').sum('rating'))
item_item_train = item_item_train.na.fill(value=0)
item_item_train.cache()

item_item_train.count()

                                                                                

28978

In [10]:
#Maintaining a list of all movieIDs in the training data
allCol = []
for col in item_item_train.dtypes:
    allCol.append(col[0])
allCol.remove('userID')


from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
#Creating a ratings vector - list all all ratings by that user for different movies(0 for null)
assembler = VectorAssembler(
    inputCols=allCol,
    outputCol="ratingsVector")

output = assembler.transform(item_item_train)
output.cache()

DataFrame[userID: int, 8: double, 28: double, 43: double, 48: double, 61: double, 64: double, 66: double, 92: double, 96: double, 111: double, 122: double, 123: double, 127: double, 140: double, 145: double, 154: double, 156: double, 174: double, 185: double, 192: double, 207: double, 214: double, 218: double, 222: double, 229: double, 237: double, 259: double, 267: double, 276: double, 287: double, 305: double, 318: double, 323: double, 336: double, 359: double, 361: double, 380: double, 395: double, 398: double, 409: double, 417: double, 440: double, 442: double, 443: double, 450: double, 452: double, 481: double, 507: double, 510: double, 518: double, 541: double, 543: double, 557: double, 566: double, 577: double, 578: double, 583: double, 594: double, 595: double, 606: double, 626: double, 634: double, 636: double, 638: double, 642: double, 647: double, 655: double, 666: double, 669: double, 681: double, 712: double, 718: double, 722: double, 723: double, 725: double, 735: double,

In [11]:
#output.toPandas().tail()

In [12]:
from pyspark.ml.stat import Correlation
import numpy as np

#Forming the correlation matrix on the pearson method
correlation_matrix = Correlation.corr(output, 'ratingsVector', 'pearson')
mat = correlation_matrix.collect()[0][0].toArray()
allColArray = np.array(allCol)

                                                                                

In [13]:
import pandas as pd

#viewing the correlation matrix as a pandas dataframe for easier visualization
pearsonCorr = pd.DataFrame(mat, columns=allCol)
pearsonCorr['movieID'] = allCol
pearsonCorr = pearsonCorr.set_index('movieID')

print('The similarity of one movie with another is as below:')
pearsonCorr

The similarity of one movie with another is as below:


Unnamed: 0_level_0,8,28,43,48,61,64,66,92,96,111,...,17654,17660,17689,17693,17706,17725,17728,17734,17741,17742
movieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,1.000000,-0.042146,0.005975,-0.014675,-0.000750,0.012873,0.012119,-0.000779,0.022855,-0.023613,...,-0.033358,0.036393,0.007312,-0.030664,0.001916,0.025023,0.043411,0.026431,0.005980,0.018934
28,-0.042146,1.000000,0.014497,0.110906,-0.009361,0.001573,0.004635,0.003757,0.009357,0.123198,...,0.101463,0.004686,0.003791,0.157262,0.019206,-0.025760,-0.015913,0.002747,0.015772,0.003481
43,0.005975,0.014497,1.000000,0.037607,0.025534,0.050806,0.046491,0.005333,0.028172,-0.008141,...,0.015749,0.005587,0.030161,0.013084,0.131892,0.009742,0.025317,0.016258,0.024677,0.020845
48,-0.014675,0.110906,0.037607,1.000000,0.022967,0.028974,0.050921,0.052250,0.031889,-0.000721,...,0.034976,0.012718,0.030831,0.078753,0.050127,0.030731,0.001624,0.015319,0.042453,0.012585
61,-0.000750,-0.009361,0.025534,0.022967,1.000000,0.051848,0.086510,0.079832,0.037186,0.000258,...,0.005865,0.044564,0.116404,0.023404,0.025427,0.024588,0.028530,0.042600,0.014583,0.027635
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17725,0.025023,-0.025760,0.009742,0.030731,0.024588,0.008251,0.013599,0.023541,0.030360,0.016149,...,0.006480,0.011732,0.026723,0.009913,0.018162,1.000000,0.013895,0.020327,-0.003775,0.008061
17728,0.043411,-0.015913,0.025317,0.001624,0.028530,0.036705,0.028801,0.008176,0.053833,-0.011111,...,-0.001303,0.012518,0.059656,-0.003611,0.005868,0.013895,1.000000,0.029439,0.002085,0.026911
17734,0.026431,0.002747,0.016258,0.015319,0.042600,0.075105,0.078206,0.066338,0.051135,0.002920,...,-0.005808,0.085926,0.072782,0.015864,0.059971,0.020327,0.029439,1.000000,0.018625,0.032018
17741,0.005980,0.015772,0.024677,0.042453,0.014583,0.032127,0.054287,0.052028,0.006804,0.039247,...,0.002104,0.023743,0.015954,0.014572,0.039082,-0.003775,0.002085,0.018625,1.000000,0.014102


<h4> Problem 2d) </h4>

In [14]:
training_data.groupBy('rating').count().orderBy('rating').show()



+------+-------+
|rating|  count|
+------+-------+
|   1.0| 169886|
|   2.0| 374452|
|   3.0|1048538|
|   4.0|1044293|
|   5.0| 618183|
+------+-------+



                                                                                

With the count of each rating in the training set, it is significant that all of the ratings can be significant<br> (although ratings 3 and 4 have the most weightage)<br>

If I were to normalize the ratings, it would mean changing the rating scale from 1-5 to any other target rating. <br>
However, I would not like to do so since the 1-5 ratings are way more natural and easy to understand (than suppose a 1-3 rating).
<br>
<br>
In addition to that, since the test set could still have lower rated movies, it would be unfair to remove them from the predictions.<br> However, the lower rated movies can be filtered out during recommendation. It is always much better to suggest highly rated movies rather than poorly rated ones. 

<h3> Problem 3 </h3>
<p>3a) Prediction using item-item model</p>

In [15]:
from pyspark.sql import Row
import numpy as np

#Method to getch the top 15 movies correlated to current movie
def top15(row):

    item = row.movieID
    user = row.userID
    
    #Get the position of current movie in all movies
    pos = allCol.index(str(item))
    
    #Fetch the indexes of top 15 movies matches as current movie
    ind = np.argpartition(mat[pos], -15)[-15:]
    
    #Map top 15 indexes to top 15 movies
    top_n = allColArray[ind]
    top_n = [int(x) for x in top_n]
    
    return Row(movieID=item, top=top_n, userID=user, rating=row.rating)   

In [16]:
#Uncomment to try on larger data set
#small = sample_test.limit(700)
small = sample_test.limit(10)
small.show(5)

+-------+-------+------+
|movieID| userID|rating|
+-------+-------+------+
|   6408| 116761|   3.0|
|   6281|1944206|   4.0|
|   7511|2487973|   4.0|
|    442|1460499|   4.0|
|   8851|1739230|   3.0|
+-------+-------+------+
only showing top 5 rows



In [17]:
preds = small.rdd.map(lambda x : top15(x))

In [18]:
#Defining and creating a dataframe for final ratings
df_schema = StructType(
  [StructField('movieID', IntegerType()),
   StructField('userID', IntegerType()),
   StructField('rating', DoubleType()),
  StructField('pred', DoubleType())]
)

final = spark.createDataFrame(sc.emptyRDD(), df_schema)

In [19]:
final.printSchema()

root
 |-- movieID: integer (nullable = true)
 |-- userID: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- pred: double (nullable = true)



In [20]:
from pyspark.sql.functions import avg

#Predicting the ratings
def predictRating(row):
    
    item = row.movieID
    user = row.userID
    
    top_n = row.top
    
    #Fetch the ratings by current user for top 15 similar movies
    ratings = training_data.filter((training_data.movieID.isin(top_n)) & (training_data.userID == user))
    ratings = ratings.select('rating').rdd.flatMap(lambda x: x).collect()
    
    if len(ratings) >= 5:
        pred = round(sum(ratings)/len(ratings), 2)
   
    #If current user has not rated atlease 5 of top 15 movies, consider average of all user ratings
    else:
        ratings = training_data.filter((training_data.movieID.isin(top_n)))\
        .groupBy('movieID').agg(avg('rating').alias('rating'))
        ratings = ratings.select('rating').rdd.flatMap(lambda x: x).collect()
        pred = round(sum(ratings)/len(ratings), 2)
        
    return spark.createDataFrame([(item, user, row.rating, pred)], df_schema)

In [21]:
%%time

#Manually working with each row individually since
#external dataframes can not be accessed with rdd functions (map, foreach, etc)
for row in preds.collect():
    #Adding predicted rating for each test row
    final = final.union(predictRating(row))

                                                                                

CPU times: user 17.6 s, sys: 4.6 s, total: 22.2 s
Wall time: 5min 24s


In [22]:
#This is what the dataframe with predictions looks like
#Basically, it is the same as test dataframe, with a new column called pred
final.cache()
final.show(4)

+-------+-------+------+----+
|movieID| userID|rating|pred|
+-------+-------+------+----+
|   6408| 116761|   3.0| 3.5|
|   6281|1944206|   4.0|3.57|
|   7511|2487973|   4.0| 4.0|
|    442|1460499|   4.0| 4.0|
+-------+-------+------+----+
only showing top 4 rows



In [23]:
#Evaluating the mae and rmse for the predictions
from pyspark.ml.evaluation import RegressionEvaluator

print('For item-item model:')
mae_eval = RegressionEvaluator(predictionCol="pred", labelCol="rating", metricName="mae")
rmse_eval = RegressionEvaluator(predictionCol="pred", labelCol="rating", metricName="rmse")

mae = mae_eval.evaluate(final)
rmse = rmse_eval.evaluate(final)

print('The MAE is %.4f' % mae)
print('The RMSE is %.4f' % rmse)


For item-item model:




The MAE is 0.7394
The RMSE is 0.9554


                                                                                

<p>When the same prediction was done for lesser data (100 records), the MAE was around 0.91<br>
    with around 300 records, the MAE was ~ 0.86 <br>
    with more test data (700 records), the MAE is ~ 0.74. <br>
    This is enough to expect that for more records, the MAE could go further below. 
</p>

<h3> Problem 3 </h3>
3b) Recommendation using item-item model

In [24]:
my_user_id = 4

my_rated_movies = [
    (my_user_id, 10676, 4),
    (my_user_id, 14810, 4),
    (my_user_id, 16162, 2),
    (my_user_id, 11340, 5),
    (my_user_id, 4556, 2),
    (my_user_id, 6250, 4),
    (my_user_id, 13334, 1),
    (my_user_id, 11312, 1),
    (my_user_id, 15731, 5),
    (my_user_id, 10109, 4)]

my_rated_movies = sqlContext.createDataFrame(my_rated_movies, ['userID','movieID','rating'])

In [25]:
movies = dbfs_dir + 'movie_titles.txt'
movie_schema = StructType(
  [StructField('movieID', IntegerType()),
   StructField('releaseYear', IntegerType()),
   StructField('title', StringType())]
)

movie_data = sqlContext.read.format('csv').options(header=False, inferSchema=False).schema(movie_schema).load(movies)
movie_data.cache()

my_rated_movies = my_rated_movies.join(movie_data,on=['movieID'],how='inner')
print('My rated movies are:')
my_rated_movies.show(truncate=False)

My rated movies are:
+-------+------+------+-----------+-----------------------------------+
|movieID|userID|rating|releaseYear|title                              |
+-------+------+------+-----------+-----------------------------------+
|10676  |4     |4     |1933       |The Kennel Murder Case / Nancy Drew|
|14810  |4     |4     |2000       |Dolphins: IMAX                     |
|16162  |4     |2     |2002       |Kim Possible: The Secret Files     |
|11340  |4     |5     |1988       |Johnny Be Good                     |
|4556   |4     |2     |2001       |Stealing Time                      |
|6250   |4     |4     |1997       |Female Perversions                 |
|13334  |4     |1     |2000       |Catfish in Black Bean Sauce        |
|11312  |4     |1     |1998       |Mystery Kids                       |
|15731  |4     |5     |2002       |Roxy Music: Live at the Apollo     |
|10109  |4     |4     |1994       |Major League II                    |
+-------+------+------+-----------+--------

In [26]:
%%time
def getRecMovies():
    #Fetch all MY movies with higher ratings
    my_top = my_rated_movies.filter(my_rated_movies.rating > 3).select('movieID')
    my_top = my_top.rdd.flatMap(lambda x: x).collect()
    
    topRecs = []
    for item in my_top:
        pos = allCol.index(str(item))
        
        ind = np.argpartition(mat[pos], -3)[-3:]
        top_n = allColArray[ind]
        top_n = [int(x) for x in top_n]
        for x in top_n:
            topRecs.append(x)
        topRecs.remove(item)

    df_schema = StructType([StructField('movieID', IntegerType()),
                            StructField('avgRating', DoubleType())])
    
    recs = spark.createDataFrame(sc.emptyRDD(), df_schema)
    
    for item in topRecs:
        avR = round(training_data.filter(training_data.movieID == item).\
                    groupBy('movieID').agg(avg('rating').alias('rating')).collect()[0][1], 2)
        recs = recs.union(spark.createDataFrame([(item, avR)], df_schema))
    
    return recs

recs = getRecMovies()

CPU times: user 153 ms, sys: 22.2 ms, total: 175 ms
Wall time: 2.91 s


In [27]:
print('Recommended movies for my user:')
recs.orderBy('avgRating', ascending=False).join(movie_data,on=['movieID'],how='inner').\
select('title', 'movieID', 'releaseYear', 'avgRating').show(truncate=False)

Recommended movies for my user:




+------------------------------------------+-------+-----------+---------+
|title                                     |movieID|releaseYear|avgRating|
+------------------------------------------+-------+-----------+---------+
|Sherlock Holmes: Terror by Night          |15676  |1946       |3.69     |
|Charlie Chan: The Chinese Cat             |10195  |1944       |3.42     |
|Wildcats                                  |6523   |1986       |3.38     |
|Youngblood                                |2352   |1986       |3.21     |
|Encino Man                                |3670   |1992       |3.03     |
|Search for the Great Sharks: IMAX         |8121   |1999       |2.99     |
|Genesis: Live at Wembley Stadium          |5921   |2004       |2.97     |
|Niagara: Miracles                         |834    |1999       |2.96     |
|Teen Wolf / Teen Wolf Too (Double Feature)|2284   |1985       |2.91     |
|The Story of O                            |1252   |1975       |2.9      |
|Young Adam              

                                                                                

In [28]:
#Manually clear all cachd RDDs
testing_data.unpersist()
training_data.unpersist()

sample_test.unpersist()
sample_train.unpersist()

item_item_train.unpersist()
output.unpersist()
final.unpersist()
movie_data.unpersist()

DataFrame[movieID: int, releaseYear: int, title: string]