In [2]:
import pyspark
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.sql import functions, types
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
# Start (or reuse) a Spark session running on the local master.
spark = (
    pyspark.sql.SparkSession.builder
    .master("local")
    .getOrCreate()
)

# Load the two JSON inputs: the known ratings and the rows we must predict for.
ratings = spark.read.json('data/ratings.json')
requests = spark.read.json('data/requests.json')

In [4]:
# Replace the epoch-valued 'timestamp' column with a formatted
# "yyyy-MM-dd HH:mm:ss" string.
timestamp_col = ratings.timestamp.cast(types.TimestampType())
ratings = ratings.withColumn(
    'timestamp',
    functions.date_format(timestamp_col, "yyyy-MM-dd HH:mm:ss"),
)

# Disabled chronological train/test split, kept for reference:
# cutoff_ratio = 1
# cutoff = int(ratings.count() * cutoff_ratio)
# ratings_train = ratings.sort(ratings.timestamp.asc()).limit(cutoff)
# ratings_test = ratings.subtract(ratings_train)

## ALS Model

In [5]:
# Fit an ALS recommender on the full `ratings` frame (no hold-out split is
# made here) and score every row in `requests`.
als_params = dict(
    rank=10,
    maxIter=10,
    userCol='user_id',
    itemCol='movie_id',
    ratingCol='rating',
)
als = ALS(**als_params)
als_model = als.fit(ratings)
preds_requests = als_model.transform(requests)
#preds_requests.persist()

In [None]:
# Preview the first ten scored requests.
preds_requests.show(n=10)

## Address the Cold-start Problem
Since the ALS predictions still contain `NaN`s (for users or movies the model never saw during fitting), we need a fallback way to predict ratings from user data, which can be found in `data/users.dat`.

In [16]:
# Load only the metadata columns used later, with explicit dtypes.
movie_columns = ['id', 'popularity', 'runtime', 'title', 'vote_average']
movie_dtypes = {'id':str, 'popularity':str, 'runtime':float, 'title':str, 'vote_average':float}
movies_df = pd.read_csv(
    'data/movies_metadata.csv',
    usecols=movie_columns,
    dtype=movie_dtypes,
    # Skip malformed rows instead of raising.
    # NOTE(review): `error_bad_lines` is a legacy pandas option (newer
    # releases use `on_bad_lines`) -- confirm against the pinned version.
    error_bad_lines=False,
)

# String-typed copy of `id` under the name used as the merge key later on.
movies_df['movie_id'] = movies_df['id'].astype(str)
movies_df.head(2)

Unnamed: 0,id,popularity,runtime,title,vote_average,movie_id
0,862,21.946943,81.0,Toy Story,7.7,862
1,8844,17.015539,104.0,Jumanji,6.9,8844


In [17]:
# The file is read with a single ":" separator; the odd-numbered columns
# (_c1, _c3, _c5, _c7) are dropped right after loading.
users_raw = spark.read.load(
    "data/users.dat", format="csv", sep=":", inferSchema="true"
).drop('_c1', '_c3', '_c5', '_c7')

# Give the remaining positional columns meaningful names, in order.
users = users_raw
for old_name, new_name in zip(
        users_raw.schema.names,
        ['user_id', 'gender', 'age', 'occupation', 'zip']):
    users = users.withColumnRenamed(old_name, new_name)

In [18]:
# Preview the first ten users after renaming.
users.show(n=10)

+-------+------+---+----------+-----+
|user_id|gender|age|occupation|  zip|
+-------+------+---+----------+-----+
|      1|     F|  1|        10|48067|
|      2|     M| 56|        16|70072|
|      3|     M| 25|        15|55117|
|      4|     M| 45|         7|02460|
|      5|     M| 25|        20|55455|
|      6|     F| 50|         9|55117|
|      7|     M| 35|         1|06810|
|      8|     M| 25|        12|11413|
|      9|     M| 25|        17|61614|
|     10|     F| 35|         1|95370|
+-------+------+---+----------+-----+
only showing top 10 rows



In [19]:
# Alternate strategy
# with open('data/users.dat') as f:
#     with open('data/users.csv', 'w') as f2:
#         f2.write('user_id, gender, age, occupation, zip\n')
#         for line in f:
#             f2.write(line.replace('::', ','))
            
# users = spark.read.csv('data/users.csv', header=True)
# users.show(10)

In [20]:
# Attach each rating to the demographics of the user who gave it, keep only
# the columns needed for modelling, and collect the result into pandas.
df_raw = (users.join(ratings, 'user_id', 'inner')
               .select('gender', 'age', 'occupation', 'zip', 'movie_id', 'rating')
               .toPandas()
         )
# NOTE: `movies_df` must stay alive here -- the next cell merges it into
# `df_raw`. Deleting it at this point made the notebook fail with a
# NameError on a fresh "Restart & Run All".

# Cast the merge key to str so it matches `movies_df['movie_id']`.
# Select the column as a Series rather than assigning a one-column DataFrame.
df_raw['movie_id'] = df_raw['movie_id'].astype(str)

In [21]:
# Combine the user/rating rows with the movie metadata on the shared
# string key, then discard the redundant raw `id` column.
# NOTE(review): how='outer' also keeps rows present on only one side, which
# would carry NaN in the other side's columns (possibly including `rating`)
# -- confirm this is intended.
merged = df_raw.merge(movies_df, on='movie_id', how='outer')
df = merged.drop(columns='id')
del merged
df.head(2)

Unnamed: 0,gender,age,occupation,zip,movie_id,rating,popularity,runtime,title,vote_average
0,M,25.0,6.0,11106,858,4.0,10.234919,105.0,Sleepless in Seattle,6.5
1,F,45.0,1.0,76006,858,5.0,10.234919,105.0,Sleepless in Seattle,6.5


In [22]:
# Release the pre-merge pandas frame; only the merged `df` is used from here on.
del df_raw

In [23]:
# Number of leading zip characters to keep; the rest is masked with 'x'.
n = 3


def _mask_zip(zip_code):
    """Return the first `n` characters of `zip_code`, left-padded with '0'
    to width `n`, followed by 'x' characters up to a total width of 5."""
    prefix = zip_code[:n]
    return prefix.rjust(n, '0') + 'x' * (5 - n)


df['zip'] = df['zip'].astype(str).map(_mask_zip)
# Encode gender as 0/1.
df['gender'] = df['gender'].map({'M':0, 'F':1})
# Remap the age codes to other numeric values.
# NOTE(review): presumably bracket codes -> a representative age; confirm.
df['age'] = df['age'].map({1:15, 18:21, 25:30, 35:40, 45:47, 50:53, 56:65})
# Missing occupations get the sentinel code 99.
df['occupation'] = df['occupation'].fillna(99)

# One-hot encode the masked zip and the occupation code together; despite
# its name, `np_ohe_zips` therefore holds columns for both features.
ohe = OneHotEncoder(sparse=True, categories='auto')
np_ohe_zips = ohe.fit_transform(df[['zip', 'occupation']])
# ohe_occu = OneHotEncoder(sparse=False, categories='auto')
# np_ohe_occu = ohe_occu.fit_transform(df[['occupation']].fillna(99))

In [26]:
# Array of every column that is not one-hot encoded or used as a key.
# NOTE(review): this still includes `title` and the string-typed
# `popularity`, so the array is not purely numeric -- confirm that is wanted.
np_df = df.drop(columns=['zip', 'movie_id', 'occupation']).values

In [30]:
# Densify the one-hot block and append it column-wise to the other features.
np_final = np.concatenate([np_df, np_ohe_zips.toarray()], axis=1)

# The two inputs are no longer needed once stacked.
del np_df, np_ohe_zips

In [31]:
# Names of the one-hot columns, in the encoder's output order, as a plain list.
encoded_feature_names = ohe.get_feature_names()
ohe_zips_labels = encoded_feature_names.tolist()

In [32]:
# Column labels for the stacked array: the retained `df` columns first,
# followed by the one-hot labels.
kept_columns = df.columns.drop(['zip', 'movie_id', 'occupation'])
col_names = kept_columns.tolist() + ohe_zips_labels

In [36]:
import sys

# Report every name in the namespace whose object is larger than 10000 bytes.
# Iterate over a snapshot so names bound inside the loop cannot change the
# mapping while it is being walked.
for var, obj in list(locals().items()):
    size = sys.getsizeof(obj)
    if size > 10000:
        print(var, size)

movies_df 12715160
df 215628473
np_final 4258329976


In [None]:
# Wrap the stacked feature array in a labelled DataFrame.
df_final = pd.DataFrame(data=np_final, columns=col_names)

In [None]:
# Drop the notebook-level reference to the stacked array, then preview the frame.
del np_final

df_final.head(n=2)

In [None]:
# Split features and target into train and test sets.
# NOTE(review): train_size=.2 fits on 20% of the rows and evaluates on the
# remaining 80% -- confirm this was not meant to be test_size=.2.
X_train, X_test, y_train, y_test = train_test_split(
    df_final.drop('rating', axis=1),
    df_final['rating'],
    train_size=.2,
    # Fixed seed so the split (and therefore the reported scores) is the
    # same on every Restart & Run All.
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [None]:
# 50-tree random forest using all available cores; the seed is fixed so
# that repeated runs build the same trees and report the same scores.
forest = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)

In [None]:
# Train the forest on the training split.
forest.fit(X=X_train, y=y_train)

In [None]:
# Print the score on the training split, then display the score on the
# held-out split as the cell output.
forest_train_score = forest.score(X_train, y_train)
print(forest_train_score, 'is the training score')
forest.score(X_test, y_test)

In [None]:
# 35-stage gradient-boosting regressor; the seed is fixed so that repeated
# runs give the same model and the same scores.
grad_boost = GradientBoostingRegressor(n_estimators=35, random_state=42)
grad_boost.fit(X_train, y_train)

In [None]:
# Print the score on the training split, then display the score on the
# held-out split as the cell output.
grad_boost_train_score = grad_boost.score(X_train, y_train)
print(grad_boost_train_score, 'is the training score')
grad_boost.score(X_test, y_test)