In [8]:
import pyspark
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.sql import functions, types
from pyspark.ml.evaluation import RegressionEvaluator

In [9]:
spark = pyspark.sql.SparkSession.builder.master("local").getOrCreate()

ratings = spark.read.json('data/ratings.json')
requests = spark.read.json('data/requests.json')

In [10]:
# convert format of datetime column 'timestamp' from epoch to standard 
ratings = (ratings.withColumn(
    'timestamp', functions.date_format(
        ratings.timestamp.cast(types.TimestampType()), "yyyy-MM-dd HH:mm:ss")))

# cutoff_ratio = 1
# cutoff = int(ratings.count() * cutoff_ratio)
# ratings_train = ratings.sort(ratings.timestamp.asc()).limit(cutoff)
# ratings_test = ratings.subtract(ratings_train)

## ALS Model

In [11]:
# build recommendation model using ALS on the training data
als = ALS(
    rank=10,
    maxIter=10,
    userCol='user_id',
    itemCol='movie_id',
    ratingCol='rating',
)
als_model = als.fit(ratings)
preds_requests = als_model.transform(requests)
#preds_requests.persist()

In [21]:
preds_requests.show(10)

+--------+------+-------------+-------+----------+
|movie_id|rating|    timestamp|user_id|prediction|
+--------+------+-------------+-------+----------+
|     148|   NaN| 9.77959026E8|     53|       NaN|
|     148|   NaN| 9.76559602E8|   4169| 3.2358441|
|     148|   NaN| 9.89024856E8|   5333|  2.539416|
|     148|   NaN| 9.77005381E8|   4387| 2.3343704|
|     148|   NaN| 9.66907208E8|   3539| 2.8096974|
|     148|   NaN| 9.76266538E8|    840| 2.4295588|
|     148|   NaN| 9.76841639E8|    216|       NaN|
|     148|   NaN| 9.76191154E8|    482|       NaN|
|     148|   NaN|1.029283935E9|    752| 3.0107617|
|     148|   NaN|1.026978024E9|    424|       NaN|
+--------+------+-------------+-------+----------+
only showing top 10 rows



In [19]:
df = preds_requests.toPandas()
df['prediction'] = df['prediction'].fillna(df['prediction'].mean())
df.to_json('requests.json')

In [23]:
df.head(20)

Unnamed: 0,movie_id,rating,timestamp,user_id,prediction
0,148,,977959000.0,53,3.399569
1,148,,976559600.0,4169,3.235844
2,148,,989024900.0,5333,2.539416
3,148,,977005400.0,4387,2.33437
4,148,,966907200.0,3539,2.809697
5,148,,976266500.0,840,2.429559
6,148,,976841600.0,216,3.399569
7,148,,976191200.0,482,3.399569
8,148,,1029284000.0,752,3.010762
9,148,,1026978000.0,424,3.399569


## Address the Cold-start Problem
Since we still have `NaN`s in our prediction column, we need to figure out a way to predict based on user data, which can be found in `data/users.dat`

In [7]:
movies_df = pd.read_csv('data/movies_metadata.csv', 
                        usecols=['id', 'popularity', 'runtime', 'vote_average'], 
                        dtype={'id':float, 'popularity':float, 'runtime':float, 'vote_average':float},
                        error_bad_lines=False,
                        )
movies_df['movie_id'] = movies_df['id'].astype(str)
movies_df.head(2)

ValueError: could not convert string to float: '1997-08-20'

In [12]:
users = spark.read.load("data/users.dat",
     format="csv", sep=":", inferSchema="true").drop('_c1', '_c3', '_c5', '_c7')

users = (users.withColumnRenamed(users.schema.names[0], 'user_id')
              .withColumnRenamed(users.schema.names[1], 'gender')
              .withColumnRenamed(users.schema.names[2], 'age')
              .withColumnRenamed(users.schema.names[3], 'occupation')
              .withColumnRenamed(users.schema.names[4], 'zip'))

In [13]:
users.show(10)

+-------+------+---+----------+-----+
|user_id|gender|age|occupation|  zip|
+-------+------+---+----------+-----+
|      1|     F|  1|        10|48067|
|      2|     M| 56|        16|70072|
|      3|     M| 25|        15|55117|
|      4|     M| 45|         7|02460|
|      5|     M| 25|        20|55455|
|      6|     F| 50|         9|55117|
|      7|     M| 35|         1|06810|
|      8|     M| 25|        12|11413|
|      9|     M| 25|        17|61614|
|     10|     F| 35|         1|95370|
+-------+------+---+----------+-----+
only showing top 10 rows



In [14]:
# Alternate strategy
# with open('data/users.dat') as f:
#     with open('data/users.csv', 'w') as f2:
#         f2.write('user_id, gender, age, occupation, zip\n')
#         for line in f:
#             f2.write(line.replace('::', ','))
            
# users = spark.read.csv('data/users.csv', header=True)
# users.show(10)

In [15]:
df_raw = (users.join(ratings, 'user_id', 'inner')
               .select('gender', 'age', 'occupation', 'zip', 'movie_id', 'rating')
               .toPandas()
         )
df_raw['movie_id'] = df_raw[['movie_id']].astype(str)

In [31]:
df = df_raw.merge(movies_df, how='outer', on='movie_id').drop('id', axis=1)
del movies_df
del df_raw

NameError: name 'df_raw' is not defined

In [64]:
df = df[['rating', 'gender', 'age', 'occupation', 'zip', 'movie_id',
       'popularity', 'runtime', 'vote_average']]

In [65]:
n = 3
df['zip'] = df['zip'].astype(str).map(lambda x: str('0'*n)[len(x[:n]):] + x[:n] + 'x' * (5-n))
df['gender'] = df['gender'].map({'M':0, 'F':1})
df['age'] = df['age'].map({1:15, 18:21, 25:30, 35:40, 45:47, 50:53, 56:65})
df['occupation'] = df['occupation'].fillna(99)

ohe = OneHotEncoder(sparse=True, categories='auto')
np_ohe_zips = ohe.fit_transform(df[['zip', 'occupation']])
# ohe_occu = OneHotEncoder(sparse=False, categories='auto')
# np_ohe_occu = ohe_occu.fit_transform(df[['occupation']].fillna(99))

In [66]:
np_df = df.drop(['zip', 'movie_id', 'occupation'], axis=1).values

In [67]:
np_final = np.hstack((np_df, np_ohe_zips.toarray()))
del np_df
del np_ohe_zips

In [68]:
ohe_zips_labels = (ohe.get_feature_names()).tolist()

In [56]:
col_names = (df.columns.drop(['zip', 'movie_id', 'occupation'])).tolist() + ohe_zips_labels

In [57]:
col_names[:20]

['rating',
 'gender',
 'age',
 'popularity',
 'runtime',
 'vote_average',
 'x0_002xx',
 'x0_006xx',
 'x0_009xx',
 'x0_010xx',
 'x0_011xx',
 'x0_012xx',
 'x0_013xx',
 'x0_014xx',
 'x0_015xx',
 'x0_016xx',
 'x0_017xx',
 'x0_018xx',
 'x0_019xx',
 'x0_020xx']

In [41]:
import sys
for var, obj in list(locals().items()):
    if sys.getsizeof(obj) > 10000:
        print(var, sys.getsizeof(obj) // 10**6)

df 215
np_final 4258


In [58]:
X = np_final[:,1:].tonumeric()
y = np_final[:,0].tonumeric()
#del np_final

In [59]:
# df_final = pd.DataFrame(np_final, columns=col_names)

In [60]:
# X_train, X_test, y_train, y_test = train_test_split(
#     df_final.drop('rating', axis=1), 
#     df_final['rating'],
#     train_size=.2
# )

In [61]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [62]:
forest = RandomForestRegressor(n_estimators=50, n_jobs=-1)

In [63]:
forest.fit(X, y)

ValueError: could not convert string to float: 'Beware Of Frost Bites'

In [None]:
print(forest.score(X_train, y_train), 'is the training score')
forest.score(X_test, y_test)

In [None]:
grad_boost = GradientBoostingRegressor(n_estimators=35)
grad_boost.fit(X_train, y_train)

In [None]:
print(grad_boost.score(X_train, y_train), 'is the training score')
grad_boost.score(X_test, y_test)