In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 



import pyspark
from pyspark.sql.types import *
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator


# Build our Spark Session and Context
# getOrCreate() reuses an already-running session when there is one, so this
# cell is safe to re-run.
spark = pyspark.sql.SparkSession.builder.getOrCreate()
sc = spark.sparkContext
# NOTE(review): a bare expression is only echoed when it is the last statement
# of a cell; more imports follow in this cell, so this line likely shows nothing.
spark, sc


from pyspark.sql.functions import lit
from pyspark.sql.types import StructType, StructField, IntegerType
from pyspark.sql.functions import countDistinct, col

In [2]:
# Movie metadata table. Fields are split on either a tab or "::"; because the
# separator is a regular expression, the pure-Python parser engine is requested.
# The file has no header row, so column names are supplied explicitly.
movie_data = pd.read_csv(
    'data/movies.dat',
    sep="\t|::",
    header=None,
    names=['movie_id', 'title', 'genres'],
    engine="python",
)

In [5]:
def load_data_to_spark():
    """Read the training ratings CSV and expose it in both Spark and pandas form.

    Uses the notebook-level `spark` session.

    Returns:
        tuple: (Spark DataFrame, pandas DataFrame) holding the same rows read
        from 'data/training.csv'.
    """
    ratings_pd = pd.read_csv('data/training.csv')
    ratings_spark = spark.createDataFrame(ratings_pd)
    return ratings_spark, ratings_pd
 

In [6]:
# Load the ratings once: s_df is the Spark DataFrame, p_df is the pandas
# DataFrame it was built from.
s_df, p_df = load_data_to_spark()

In [16]:
# Preview the Spark ratings frame (the output below is truncated to 20 rows).
s_df.show()

+----+-----+------+---------+
|user|movie|rating|timestamp|
+----+-----+------+---------+
|6040|  858|     4|956703932|
|6040|  593|     5|956703954|
|6040| 2384|     4|956703954|
|6040| 1961|     4|956703977|
|6040| 2019|     5|956703977|
|6040| 1419|     3|956704056|
|6040|  573|     4|956704056|
|6040| 3111|     5|956704056|
|6040|  213|     5|956704056|
|6040| 3505|     4|956704056|
|6040| 1734|     2|956704081|
|6040|  912|     5|956704191|
|6040|  919|     5|956704191|
|6040| 2503|     5|956704191|
|6040|  527|     5|956704219|
|6040|  318|     4|956704257|
|6040| 1252|     5|956704257|
|6040|  649|     5|956704257|
|6040| 3289|     5|956704305|
|6040|  759|     5|956704448|
+----+-----+------+---------+
only showing top 20 rows



In [17]:
def get_density():
    """Print and return the density of the full ratings data.

    Density is the number of ratings divided by
    (number of distinct users * number of distinct movies).

    The ratings are reloaded through load_data_to_spark() and every count is
    taken from that freshly loaded Spark DataFrame, so the function does not
    depend on a notebook-global `s_df` having been created by an earlier cell.

    Returns:
        float: the density of the full ratings data.
    """
    # First element of the pair is the Spark DataFrame; the pandas copy is unused here.
    ratings, _ = load_data_to_spark()
    n_ratings = ratings.count()
    n_users = ratings.select('user').distinct().count()
    n_movies = ratings.select('movie').distinct().count()
    density = n_ratings / (n_users * n_movies)
    print('The original density is: {} '.format(density))
    return density

In [7]:
def traintestsplit():
    """Reload the ratings and split them into train and test Spark DataFrames.

    The split uses weights 0.8 / 0.2 and a fixed seed.

    Returns:
        tuple: (train, test) Spark DataFrames.
    """
    spark_ratings = load_data_to_spark()[0]
    train_part, test_part = spark_ratings.randomSplit([0.8, 0.2], seed=427471138)
    return train_part, test_part

In [19]:
def get_train_density():
    """Print and return the density of the training split.

    Density is the number of ratings in the split divided by
    (distinct users in the split * distinct movies in the split).

    Returns:
        float: the density of the training split.
    """
    train_split = traintestsplit()[0]
    rating_count = train_split.count()
    user_count = train_split.select('user').distinct().count()
    movie_count = train_split.select('movie').distinct().count()
    density = rating_count / (user_count * movie_count)
    print('The train desnsity is: {} '.format(density))
    return density

In [20]:
def get_test_density():
    """Print and return the density of the test split.

    Density is the number of ratings in the split divided by
    (distinct users in the split * distinct movies in the split).

    Returns:
        float: the density of the test split.
    """
    test_split = traintestsplit()[1]
    rating_count = test_split.count()
    user_count = test_split.select('user').distinct().count()
    movie_count = test_split.select('movie').distinct().count()
    density = rating_count / (user_count * movie_count)
    print('The test desnsity is: {} '.format(density))
    return density

In [21]:
# Density of the full ratings data (the function prints it and also returns it,
# which is why the value appears twice below).
get_density()

The original density is: 0.04046302241176001 


0.04046302241176001

In [22]:
# Density of the training split (printed by the function and echoed as the cell value).
get_train_density()

The train desnsity is: 0.03257245318242673 


0.03257245318242673

In [23]:
# Density of the test split (printed by the function and echoed as the cell value).
get_test_density()

The test desnsity is: 0.008776990345157408 


0.008776990345157408

In [9]:
# Configure the ALS estimator (nothing is trained here; fitting happens later).
als_model = ALS(
    # which columns of the ratings frame play which role
    userCol='user',
    itemCol='movie',
    ratingCol='rating',
    # model hyper-parameters
    rank=10,
    regParam=0.1,
    nonnegative=True,
)

In [26]:
# Split the ratings, snapshot both splits as CSV files, then fit ALS on the
# training split only.
train, test = traintestsplit()

# pandas copies of the two Spark splits (train_df / test_df are reused later)
train_df, test_df = train.toPandas(), test.toPandas()

train_df.to_csv('./data/a_training.csv')
test_df.to_csv('./data/a_testing.csv')

recommender = als_model.fit(train)

In [36]:
# Goal of the next few cells: find if there are some movies that do not appear in train.
# Disabled draft for writing a requests file from the test split.
# NOTE(review): pandas' DataFrame.drop takes `columns=` / `labels=`, not `cols=` —
# fix before re-enabling.
# test_df.drop(cols = ['rating', 'timestamp'], axis = 1, inplace=True)
# test_df.to_csv('data/a_requests.csv')
# Display the pandas copy of the training split.
train_df


Unnamed: 0,user,movie,rating,timestamp
0,3375,265,1,967595224
1,3375,293,3,967595382
2,3375,434,4,967595560
3,3375,1090,5,967595382
4,3375,1562,3,967595188
5,3375,1587,3,967595560
6,3375,1625,4,967595451
7,3375,2000,5,967595474
8,3375,2001,4,967595513
9,3375,2302,5,967595474


In [37]:
# Movie ids that occur at least once in the training split, taken from a
# pandas copy of the Spark `train` frame.
train_pd = train.toPandas()
distinct = train_pd['movie'].unique()

In [38]:
# Collect the movie id of every rating in the full data whose movie never
# occurs in the training split (one entry per rating, so ids can repeat).
#
# Only the `movie` column is brought to the driver, and membership is checked
# against a set (constant-time lookup) rather than scanning the numpy array
# `distinct` once per rating.
train_movie_ids = set(distinct)
not_in_train = [
    row['movie']
    for row in s_df.select('movie').collect()
    if row['movie'] not in train_movie_ids
]
not_in_train
    
    

[3522,
 530,
 2685,
 1842,
 887,
 1709,
 3228,
 3337,
 1685,
 2685,
 1470,
 655,
 139,
 1316,
 989,
 2685,
 2217,
 2484,
 579,
 601,
 3220,
 1118,
 3377,
 641,
 3126,
 1471,
 2556]

In [39]:
# idea for cold start - if a movie doesn't show up in test, don't recommend it in the top 5
# item-to-item similarity
# neighborhoods - the size of the neighborhoods that you use
# use combinations of different models
# types of similarity - e.g. cosine similarity
# cold start - a user shows up who hasn't rated any movies, or a movie that's never been reviewed
#

In [11]:
# Light FM
# NOTE(review): the heading above mentions LightFM, but the code below scores the
# test split with `recommender`, the model fitted from the ALS estimator —
# presumably a leftover to-do note; confirm and rename.
# transform() appends a `prediction` column to the test rows (see output below).
predictions = recommender.transform(test)
predictions.show()

+----+-----+------+---------+----------+
|user|movie|rating|timestamp|prediction|
+----+-----+------+---------+----------+
|1242|  148|     3|974909976| 2.9531837|
|3539|  148|     3|966932408| 2.7834134|
|1150|  148|     2|974875106| 2.4366071|
|3829|  148|     2|965940170| 2.5057168|
|2507|  148|     4|974082717| 3.0551972|
|3841|  463|     3|966003085| 2.5498762|
|2629|  463|     4|973625620| 3.0239043|
|3328|  463|     4|967918151|  3.043021|
|3683|  463|     1|966523740| 1.4682665|
|3717|  463|     2|967228367| 2.7757328|
|3562|  463|     2|966790403| 2.6724248|
|5249|  463|     3|961602410|  2.762345|
|1699|  471|     5|974711905| 3.8341877|
|4167|  471|     5|965425364| 3.6452205|
|5337|  471|     4|960747681| 3.7248907|
|5880|  471|     5|957546755|  3.367366|
| 731|  471|     5|975529161|  3.800399|
|3211|  471|     4|968548060| 3.5678315|
|3411|  471|     2|967583402| 3.0830169|
|1019|  471|     4|975010672| 3.1851416|
+----+-----+------+---------+----------+
only showing top

In [12]:
# pandas copy of the Spark predictions, used for the pandas groupby below.
predictions_pd = predictions.toPandas()

In [24]:
# Within each user's block of rows, order the rows by ascending predicted rating.
grouped = predictions_pd.groupby('user')
grouped2 = grouped.apply(
    lambda user_rows: user_rows.sort_values(by=['prediction'])
)

In [25]:
# Display the per-user predictions sorted by ascending `prediction`.
grouped2

Unnamed: 0_level_0,Unnamed: 1_level_0,user,movie,rating,timestamp,prediction
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
636,96131,636,3695,3,975752411,1.829173
636,108838,636,3693,3,975752146,3.033439
636,65451,636,1971,5,975752317,3.200827
636,25278,636,2668,3,975752288,3.245209
636,149983,636,2456,2,975752349,3.254828
636,159369,636,2746,5,975752146,3.489406
636,17454,636,332,3,975752800,3.497720
636,1087,636,2122,4,975752349,3.515913
636,137055,636,379,3,975753026,3.574463
636,102142,636,1320,4,975752996,3.616727


In [None]:
# run.py, runs recommender.py, submit.py
