In [3]:
import pyspark
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.sql import functions, types
from pyspark.ml.evaluation import RegressionEvaluator

In [28]:
spark = pyspark.sql.SparkSession.builder.master("local").getOrCreate()

ratings = spark.read.json('data/ratings.json')
requests = spark.read.json('data/requests.json')

In [29]:
# convert format of datetime column 'timestamp' from epoch to standard 
ratings = (ratings.withColumn(
    'timestamp', functions.date_format(
        ratings.timestamp.cast(types.TimestampType()), "yyyy-MM-dd HH:mm:ss")))

cutoff_ratio = 1
cutoff = int(ratings.count() * cutoff_ratio)
ratings_train = ratings.sort(ratings.timestamp.asc()).limit(cutoff)
ratings_test = ratings.subtract(ratings_train)

## ALS Model

In [30]:
# build recommendation model using ALS on the training data
als = ALS(
    rank=10,
    maxIter=10,
    userCol='user_id',
    itemCol='movie_id',
    ratingCol='rating',
)
als_model = als.fit(ratings_train)
preds_requests = als_model.transform(requests)
preds_requests.persist()

DataFrame[movie_id: bigint, rating: double, timestamp: double, user_id: bigint, prediction: float]

In [31]:
preds_requests.show(10)

+--------+------+-------------+-------+----------+
|movie_id|rating|    timestamp|user_id|prediction|
+--------+------+-------------+-------+----------+
|     148|   NaN| 9.77959026E8|     53|       NaN|
|     148|   NaN| 9.76559602E8|   4169| 3.1513598|
|     148|   NaN| 9.89024856E8|   5333| 2.5132897|
|     148|   NaN| 9.77005381E8|   4387| 2.1353645|
|     148|   NaN| 9.66907208E8|   3539| 2.8666208|
|     148|   NaN| 9.76266538E8|    840| 2.5286915|
|     148|   NaN| 9.76841639E8|    216|       NaN|
|     148|   NaN| 9.76191154E8|    482|       NaN|
|     148|   NaN|1.029283935E9|    752| 3.2438402|
|     148|   NaN|1.026978024E9|    424|       NaN|
+--------+------+-------------+-------+----------+
only showing top 10 rows



## Address the Cold-start Problem
Since we still have `NaN`s in our prediction column, we need to figure out a way to predict based on user data, which can be found in `data/users.dat`

In [103]:
movies_df = pd.read_csv('data/movies_metadata.csv', usecols=['id', 'title', 'runtime', 'vote_average', 'popularity'])
movies_df['movie_id'] = movies_df['id'].astype(str)
movies_df.head(2)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,popularity,runtime,title,vote_average,movie_id
0,862,21.9469,81.0,Toy Story,7.7,862
1,8844,17.0155,104.0,Jumanji,6.9,8844


In [104]:
users = spark.read.load("data/users.dat",
     format="csv", sep=":", inferSchema="true").drop('_c1', '_c3', '_c5', '_c7')

users = (users.withColumnRenamed(users.schema.names[0], 'user_id')
              .withColumnRenamed(users.schema.names[1], 'gender')
              .withColumnRenamed(users.schema.names[2], 'age')
              .withColumnRenamed(users.schema.names[3], 'occupation')
              .withColumnRenamed(users.schema.names[4], 'zip'))

In [105]:
users.show(10)

+-------+------+---+----------+-----+
|user_id|gender|age|occupation|  zip|
+-------+------+---+----------+-----+
|      1|     F|  1|        10|48067|
|      2|     M| 56|        16|70072|
|      3|     M| 25|        15|55117|
|      4|     M| 45|         7|02460|
|      5|     M| 25|        20|55455|
|      6|     F| 50|         9|55117|
|      7|     M| 35|         1|06810|
|      8|     M| 25|        12|11413|
|      9|     M| 25|        17|61614|
|     10|     F| 35|         1|95370|
+-------+------+---+----------+-----+
only showing top 10 rows



In [100]:
# Alternate strategy
# with open('data/users.dat') as f:
#     with open('data/users.csv', 'w') as f2:
#         f2.write('user_id, gender, age, occupation, zip\n')
#         for line in f:
#             f2.write(line.replace('::', ','))
            
# users = spark.read.csv('data/users.csv', header=True)
# users.show(10)

In [126]:
df_raw = (users.join(ratings, 'user_id', 'inner')
               .select('gender', 'age', 'occupation', 'zip', 'movie_id', 'rating')
               .toPandas()
         )

df_raw['movie_id'] = df_raw[['movie_id']].astype(str)

In [127]:
df = df_raw.merge(movies_df, how='outer', on='movie_id').drop('id', axis=1)
df.head(2)

Unnamed: 0,gender,age,occupation,zip,movie_id,rating,popularity,runtime,title,vote_average
0,M,25.0,6.0,11106,858,4.0,10.2349,105.0,Sleepless in Seattle,6.5
1,F,45.0,1.0,76006,858,5.0,10.2349,105.0,Sleepless in Seattle,6.5


In [128]:
n = 3
df['zip'] = df['zip'].astype(str).map(lambda x: str('0'*n)[len(x[:n]):] + x[:n] + 'x' * (5-n))
df['gender'] = df['gender'].map({'M':0, 'F':1})
df['age'] = df['age'].map({1:15, 18:21, 25:30, 35:40, 45:47, 50:53, 56:65})

ohe = OneHotEncoder(sparse=False, categories='auto')

np_ohe_zips = ohe.fit_transform(df[['zip']])
ohe_occu = OneHotEncoder(sparse=False, categories='auto')
np_ohe_occu = ohe_occu.fit_transform(df[['occupation']].fillna(99))

In [None]:
np_df = np.array(df.drop(['zip', 'movie_id', 'occupation'], axis=1))
np_final = np.column_stack((np_df, np_ohe_zips, np_ohe_occu))
ohe_zips_labels = ['zip' + s[s.find('_'):] for s in ohe.get_feature_names()]
col_names = list(df.columns.drop(['zip', 'movie_id', 'occupation'])) + ohe_zips_labels + list(ohe_occu.get_feature_names())
df_final = pd.DataFrame(np_final, columns=col_names)

In [None]:
df_final.head(2)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    df_final.drop('rating', axis=1), 
    df_final['rating'],
    train_size=.2
)

In [13]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [15]:
forest = RandomForestRegressor(n_estimators=50, n_jobs=-1, )

In [16]:
forest.fit(X_train, y_train)

KeyboardInterrupt: 

In [53]:
print(forest.score(X_train, y_train), 'is the training score')
forest.score(X_test, y_test)

0.7259077145317665 is the training score


0.3489609617951864

In [56]:
grad_boost = GradientBoostingRegressor(n_estimators=35)
grad_boost.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=35,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [57]:
print(grad_boost.score(X_train, y_train), 'is the training score')
grad_boost.score(X_test, y_test)

0.3631274828457927 is the training score


0.3594215647691584