In [15]:
import pyspark
sc.stop()
from pyspark import SparkConf,SparkContext
from pyspark.sql.functions import regexp_extract
from pyspark.sql.types import *
from pyspark.sql import SQLContext

from pyspark.sql.functions import monotonically_increasing_id,row_number 

from pyspark.sql.functions import isnan, count, when, col, desc, udf, col,rand
from pyspark.sql.functions import sort_array, asc, avg
from pyspark.sql.functions import min as Fmin
from pyspark.sql.functions import max as Fmax
from pyspark.sql.functions import stddev as Fstddev
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler


from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

from pyspark.sql import Window

from pyspark.sql.functions import sum as Fsum
from pyspark.sql.functions import rank 
import pyspark.sql.functions as F
from pyspark.sql import DataFrameStatFunctions as statFunc
from pyspark.sql.functions import first
from pyspark.sql.functions import lit

from pyspark.sql.functions import col, countDistinct


import pandas as pd
import sklearn.metrics as metrics
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation
from sklearn.metrics.pairwise import pairwise_distances
import ipywidgets as widgets
from IPython.display import display, clear_output
from contextlib import contextmanager
import warnings
warnings.filterwarnings('ignore')
import os, sys
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

### Loading the dataset from s3 bucket 

In [16]:
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)
df_schema = StructType([StructField('movieId', IntegerType()),StructField('userId', IntegerType()),
   StructField('rating', DoubleType())])

movies_schema = StructType([StructField('movieId', IntegerType()),StructField('YearOfRelease', IntegerType()),
   StructField('Title', StringType())])

In [17]:
test_df = sqlContext.read.format('csv').options(header=True, inferSchema=False).schema(df_schema).load('s3a://netfinal/TestingRatings.txt')
test_df.show(2, truncate=False)

+-------+-------+------+
|movieId|userId |rating|
+-------+-------+------+
|8      |2149668|3.0   |
|8      |1089184|3.0   |
+-------+-------+------+
only showing top 2 rows



In [18]:
train_df = sqlContext.read.format('csv').options(header=True, inferSchema=False).schema(df_schema).load("s3a://netfinal/TrainingRatings.txt")
train_df.count()

3255351

In [19]:
movies_df = sqlContext.read.format('csv').options(header=True, inferSchema=False).schema(movies_schema).load("s3a://netfinal/movie_titles.txt")
movies_df.show(5,truncate = False)

+-------+-------------+----------------------------+
|movieId|YearOfRelease|Title                       |
+-------+-------------+----------------------------+
|2      |2004         |Isle of Man TT 2004 Review  |
|3      |1997         |Character                   |
|4      |1994         |Paula Abdul's Get Up & Dance|
|5      |2004         |The Rise and Fall of ECW    |
|6      |1997         |Sick                        |
+-------+-------------+----------------------------+
only showing top 5 rows



### Joining the train and test dataframes with the movies dataframe

In [20]:
train_df = train_df.join(movies_df,on=['movieId'],how='inner')
test_df = test_df.join(movies_df,on=['movieId'],how='inner')

### ALS implementation 

In [24]:
als = ALS(maxIter=20, regParam=0.08, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(train_df)

# Evaluate the model by computing the MSE and RMSE on the test data
predictions = model.transform(test_df)
evaluator = RegressionEvaluator(metricName="mse", labelCol="rating",
                                predictionCol="prediction")
                                            
evaluator1 = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")


rmse = evaluator1.evaluate(predictions)
mse = evaluator.evaluate(predictions)

print("Mean squared error = " + str(mse))
print("Root-mean-square error = " + str(rmse))

Mean squared error = 0.7011394936850339
Root-mean-square error = 0.8373407273535869


### Checking the accuracy of our predictions

In [8]:
predictions.sort('userId','rating').show(10)

+-------+------+------+-------------+--------------------+----------+
|movieId|userId|rating|YearOfRelease|               Title|prediction|
+-------+------+------+-------------+--------------------+----------+
|   9528|     7|   5.0|         1957|        12 Angry Men|  4.527373|
|  15496|     7|   5.0|         1969|          Easy Rider|   3.88665|
|   8163|    79|   3.0|         2004|        Two Brothers| 3.4458613|
|  12497|    79|   4.0|         2000|         Bring It On|  3.410501|
|   2913|    79|   4.0|         2004|   Finding Neverland| 3.9096336|
|  14648|    79|   5.0|         2003|Finding Nemo (Ful...|  4.424679|
|   3541|   199|   3.0|         1981|History of the Wo...| 3.9816039|
|   8851|   199|   4.0|         1998|            Rounders|  4.093237|
|   2518|   199|   4.0|         1995|Things to Do in D...| 3.8148317|
|   3165|   199|   5.0|         1988|Dirty Rotten Scou...|  3.739738|
+-------+------+------+-------------+--------------------+----------+
only showing top 10 

### Adding self rated movie ratings to our dataset 

In [21]:
user_df = sqlContext.read.format('csv').options(header=True, inferSchema=False).schema(df_schema).load('s3://netfinal/self-user.txt')
user_df.show(2, truncate=False)

+-------+------+------+
|movieId|userId|rating|
+-------+------+------+
|28     |0     |3.0   |
|43     |0     |4.0   |
+-------+------+------+
only showing top 2 rows



In [22]:
user_df = user_df.join(movies_df,on=['movieId'],how='inner')

In [23]:
train_df = train_df.union(user_df)
test_df = test_df.union(user_df)

In [25]:
UserID = 0
self = train_df.filter(train_df.userId == UserID)
self.show(10)

+-------+------+------+-------------+--------------------+
|movieId|userId|rating|YearOfRelease|               Title|
+-------+------+------+-------------+--------------------+
|     28|     0|   3.0|         2002|     Lilo and Stitch|
|     43|     0|   4.0|         2000|      Silent Service|
|     48|     0|   4.0|         2001|      Justice League|
|     61|     0|   2.0|         1999|Ricky Martin: One...|
|     64|     0|   4.0|         2001|     Outside the Law|
|     66|     0|   4.0|         1989|   Barbarian Queen 2|
|     92|     0|   3.0|         2002|  ECW: Cyberslam '99|
|     96|     0|   4.0|         2000|Inside the Space ...|
|    111|     0|   4.0|         2003| Duplex (Widescreen)|
+-------+------+------+-------------+--------------------+



### Checking recommendations based on our ratings

In [26]:
recom_user_self = model.recommendForUserSubset(self, 10)
recom_user_self.show(truncate=False)

+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                                               |
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0     |[[3033, 4.814257], [5225, 4.6135697], [5484, 4.5843763], [7858, 4.5789685], [16559, 4.5649867], [14941, 4.5440726], [5785, 4.4674215], [7625, 4.4513564], [6991, 4.4305925], [192, 4.4123034]]|
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+


In [30]:
# Movie recommendations given by model
list1 = [3033,5225,5484,7858,16559,14941,5785,7625,6991,192]
df = movies_df[movies_df.movieId.isin(list1)]
df.reset_index(inplace=True,drop=True)
df.head(10)

Unnamed: 0,movieId,YearOfRelease,Title
0,192.0,2003.0,The SoulTaker
1,3033.0,2005.0,Ghost in the Shell: Stand Alone Complex: 2nd Gig
2,5225.0,1997.0,The Nazis: A Warning from History
3,5484.0,2000.0,Har dil jo Pyar karega...
4,5785.0,2000.0,The Cars: Live
5,6991.0,2001.0,A History of God
6,7625.0,1977.0,Young Lady Chatterley
7,7858.0,2004.0,Burst Angel
8,14941.0,2000.0,Vandread
9,16559.0,1991.0,Red Green: Stuffed and Mounted 1
