In [None]:
# NOTE(review): findspark.init() runs before `import pyspark`, presumably so
# that the local Spark installation is discoverable at import time — confirm
# against the target environment.
import findspark
findspark.init()

import pyspark

# Spark session and configuration classes
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

In [None]:
# Show the PySpark version this kernel is running, for reproducibility.
pyspark.__version__

'2.3.1'

In [None]:
# Reuse the active Spark session if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()


In [None]:
# Input file and its format
file_location = "rec_data_spark.csv"
file_type = "csv"

# CSV reader settings: every column is read as a string (no schema inference),
# the first row supplies the column names, and fields are comma-separated.
infer_schema = "false"
first_row_is_header = "true"
delimiter = ","

# These options apply to CSV input; other file types ignore them.
reader = spark.read.format(file_type).options(
    inferSchema=infer_schema,
    header=first_row_is_header,
    sep=delimiter,
)
df = reader.load(file_location)

display(df)

DataFrame[_c0: string, food_id: string, food_name: string, userid: string, stars: string]

In [None]:
columnsToKeep = ['food_id', 'userid', 'stars']

# Keep only the rating triplet, mark it for caching, and run a count so the
# cache is populated (the count is also the cell's displayed output).
df = df.select(*columnsToKeep).cache()
df.count()

183312

In [None]:
# The CSV was loaded with every column as a string; cast the rating triplet
# to integers. Values that cannot be cast become null (columns are nullable).
int_columns = ('food_id', 'userid', 'stars')
for name in int_columns:
    df = df.withColumn(name, df[name].cast('int'))

df.printSchema()

root
 |-- food_id: integer (nullable = true)
 |-- userid: integer (nullable = true)
 |-- stars: integer (nullable = true)



In [None]:
import numpy as np
import pandas as pd

def preview(df, n=3):
    """Return the first ``n`` rows of ``df`` as a pandas DataFrame.

    ``df`` only needs a ``take(n)`` method returning row tuples and a
    ``columns`` attribute listing the column names (e.g. a Spark DataFrame).
    """
    first_rows = df.take(n)
    column_names = df.columns
    return pd.DataFrame(first_rows, columns=column_names)

In [None]:
# Peek at the first three rating rows after the integer cast.
preview(df, n=3)

Unnamed: 0,food_id,userid,stars
0,531,3753,4
1,531,40728,3
2,531,3753,4


In [None]:
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

# Hold out 30% of the ratings for the final evaluation; the seed keeps the
# split reproducible.
train, test = df.randomSplit([0.7, 0.3], seed=42)

# The ALS instance (explicit feedback: implicitPrefs is left at its default).
# coldStartStrategy='drop' removes prediction rows for users/items that were
# not present when the model was fitted. With the default ('nan') those rows
# get NaN predictions, which makes the RMSE of a cross-validation fold NaN
# and leaves CrossValidator unable to rank the candidate models.
als = ALS(userCol='userid',
          itemCol='food_id',
          ratingCol='stars',
          seed=42,
          nonnegative=True,
          coldStartStrategy='drop')

# The parameter grid to search
# NOTE: the parameter lists can be reduced to two or even
# one item if the grid search takes too long.
# `alpha` is deliberately not part of the grid: it only affects the
# implicit-feedback variant of ALS, so searching over it here would multiply
# the number of fits without changing any of the fitted models.
als_paramgrid = (ParamGridBuilder()
                 .addGrid(als.rank, [2, 4])
                 .addGrid(als.maxIter, [10])
                 .addGrid(als.regParam, [0.1])
                 .build())

# The evaluation function for determining the best model (lower RMSE wins)
rmse_eval = RegressionEvaluator(labelCol='stars',
                                predictionCol='prediction',
                                metricName='rmse')

# The cross validation instance
als_cv = CrossValidator(estimator=als,
                        estimatorParamMaps=als_paramgrid,
                        evaluator=rmse_eval,
                        numFolds=5,
                        seed=42)

# Fit the models and find the best one!
# Rows with nulls (e.g. values that failed the integer cast) are dropped
# first. `als_cv` is rebound to the fitted model, which later cells read
# `bestModel` from.
als_cv = als_cv.fit(train.dropna())

In [None]:
# Take the winning model from the fitted cross-validator.
als_best = als_cv.bestModel

# Report the rank (number of latent factors) of the selected model.
print(als_best.rank)

In [None]:
# Persist the best model. write().overwrite() replaces a model directory left
# by a previous run; a plain save() raises if the path already exists, which
# would break re-running the notebook.
als_best.write().overwrite().save("als_best.model")

TypeError: save() takes 2 positional arguments but 3 were given

In [None]:
# Reload the persisted model from disk.
# NOTE(review): `model` is not referenced again in the visible cells — the
# later cells keep using `als_best`; confirm whether this is only a round-trip check.
model = ALSModel.load('als_best.model')

In [None]:
# Score both splits with the selected model.
als_pred_train = als_best.transform(train)
als_pred_best = als_best.transform(test)

# RMSE per split; rows containing nulls/NaNs are dropped before evaluation
# so they cannot turn the metric into NaN.
rmse_train = rmse_eval.evaluate(als_pred_train.dropna())
rmse_test = rmse_eval.evaluate(als_pred_best.dropna())

als_rmse = pd.DataFrame({'rmse_train': [rmse_train], 'rmse_test': [rmse_test]},
                        columns=['rmse_train', 'rmse_test'])

als_rmse

Unnamed: 0,rmse_train,rmse_test
0,0.424871,0.654038


In [None]:
# Start the augmented training set from the training split; the new user's
# ratings are appended to `train_try` in a later cell.
train_try = train

In [None]:
# Print the first rows of the training split as a visual sanity check.
train.show()

+-------+------+-----+
|food_id|userid|stars|
+-------+------+-----+
|      0|   431|    5|
|      0|  4348|    5|
|      0|  5069|    5|
|      0|  5718|    5|
|      0|  5916|    4|
|      0|  6692|    5|
|      0|  6977|    5|
|      0|  8014|    5|
|      0| 10143|    3|
|      0| 10832|    5|
|      0| 12222|    4|
|      0| 14144|    5|
|      0| 14747|    5|
|      0| 15023|    5|
|      0| 16132|    5|
|      0| 16571|    5|
|      0| 16861|    5|
|      0| 17116|    5|
|      0| 17285|    5|
|      0| 17703|    5|
+-------+------+-----+
only showing top 20 rows



In [None]:
# Number of ratings in the training split.
train.count()

128360

In [None]:
# Largest user id in the training split; the synthetic new user is given the
# next id after this one.
train.groupBy().max('userid').show()


+-----------+
|max(userid)|
+-----------+
|      57693|
+-----------+



In [None]:
# Ratings supplied by the synthetic new user, as (food_id, stars) pairs.
new_user_id = 57694
new_user_ratings = [
    (1832, 5),
    (1671, 5),
    (324, 1),
    (642, 5),
    (1785, 3),
]

# Rows in the training-set column order: (food_id, userid, stars).
vals = [(food_id, new_user_id, stars) for food_id, stars in new_user_ratings]

In [None]:
# Turn the new user's ratings into a Spark DataFrame that reuses the
# training set's column names, so it can be unioned with it.
newRows = spark.createDataFrame(vals, schema=train_try.columns)

In [None]:
# Append the new user's ratings to the training split. The union is built
# from `train` (which `train_try` starts as) rather than from `train_try`
# itself, so re-running this cell does not append the same rows again.
train_try = train.union(newRows)

In [None]:
# Collect the augmented training set into a pandas DataFrame; it is used
# further down to look up which foods the new user has already rated.
train_try_pd = train_try.toPandas()

In [None]:
# create the model with new user
# Same rank as the cross-validated best model; the remaining settings are
# spelled out explicitly.
# NOTE(review): implicitPrefs is not enabled here, and per the Spark docs
# `alpha` only applies to implicit-feedback ALS — confirm it is intended.
als_params = dict(
    userCol='userid',
    itemCol='food_id',
    ratingCol='stars',
    seed=43,
    rank=als_best.rank,
    maxIter=10,
    regParam=0.1,
    alpha=2.0,
    nonnegative=True,
)
als = ALS(**als_params)

# Fit on the augmented training set, skipping rows that contain nulls.
als_new_user = als.fit(train_try.dropna())

In [None]:
# Top-5 food recommendations for every user known to the new-user model.
nrecommendations = als_new_user.recommendForAllUsers(5)

In [None]:
from pyspark.sql.functions import split, explode, col, lower, sort_array

In [None]:
# One row per (user, recommended item): explode the per-user array of
# recommendations, then flatten the resulting struct into plain columns.
exploded = nrecommendations.select(
    "userid",
    explode("recommendations").alias("recommendation"),
)
recommendationsDF = exploded.select("userid", "recommendation.*")

display(recommendationsDF)

DataFrame[userid: int, food_id: int, rating: float]

In [None]:
# Recommendations for the synthetic new user (id 57694), collected to pandas.
is_new_user = recommendationsDF['userid'] == 57694
collab_rec_57694 = recommendationsDF.filter(is_new_user).toPandas()

In [None]:
# food_id values the new user (id 57694) has already rated.
new_user_mask = train_try_pd['userid'] == 57694
rated_57694 = train_try_pd.loc[new_user_mask, 'food_id'].tolist()

In [None]:
rated_57694  # food_id values the new user rated before

[1832, 1671, 324, 642, 1785]

In [None]:
# Keep only recommendations the user has not rated yet, preserving the
# order in which they appear in the recommendation table.
foods_rec = [food_id
             for food_id in collab_rec_57694['food_id']
             if food_id not in rated_57694]

In [None]:
foods_rec  # food_id values of the recommended foods

[1580, 797, 635, 486, 898]

In [None]:
# Load the full recipe table, using the first CSV column as the index.
all_data = pd.read_csv("rec_data_all.csv", index_col=0)

In [None]:
# Keep only the recipe fields needed downstream.
recipe_columns = ['food_id', 'food_name', 'ingredients', 'recipe', 'total_time', 'nutrition']
all_data = all_data.loc[:, recipe_columns]

In [None]:
# Write the trimmed recipe table to disk (index included).
# NOTE(review): the file name suggests it feeds a Flask app — confirm with the consumer.
all_data.to_csv('flask_data.csv')

In [None]:
# Recipe rows for the recommended food ids, keeping the first row per
# distinct food name.
is_recommended = all_data['food_id'].isin(foods_rec)
recommended_foods = all_data[is_recommended].drop_duplicates(subset='food_name')

In [None]:
# Display settings for the recommendation table: show every row and truncate
# long text cells at 100 characters.
# Fully qualified option names are used because abbreviated keys (such as
# 'max_colwidth') are resolved by pattern matching and can become ambiguous
# or break in other pandas versions.
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_colwidth', 100)

In [None]:
# Display the recipes recommended to the new user.
recommended_foods

Unnamed: 0,food_id,food_name,ingredients,recipe,total_time,nutrition
11702,635,Eskimo Cubes for Summer,"2 ½ cups cubed seeded watermelon,2 ½ cups cubed cantaloupe,2 ½ cups cubed honeydew,1 cup frozen ...","Blend the watermelon, cantaloupe, honeydew, raspberries, strawberries, sugar, and lemon juice in...",4 hrs 20 mins,44 calories; protein 0.6g; carbohydrates 11g; fat 0.2g; sodium 9mg
59661,1580,Spicy Breakfast Sausage Patties,"1 pound ground pork,1 teaspoon rubbed sage,1 teaspoon salt,1 teaspoon crushed red pepper,½ teasp...","Combine ground pork, sage, salt, crushed red pepper, marjoram, black pepper, onion powder, and t...",25 mins,107 calories; protein 8.1g; carbohydrates 0.4g; fat 7.9g; cholesterol 29.4mg; sodium 255.7mg
59737,486,Crispy Air-Fried Chicken,"1 tablespoon ground paprika,1 teaspoon salt,1 teaspoon onion powder,1 teaspoon garlic powder,½ t...","Mix paprika, salt, onion powder, garlic powder, marjoram, sage, pepper, and nutmeg together in a...",50 mins,436 calories; protein 45.7g; carbohydrates 4g; fat 25.7g; cholesterol 127.5mg; sodium 735.4mg
60466,797,Grilled Chicken Under a Brick,"1 (3 pound) whole chicken,1 teaspoon olive oil,1 pinch salt and freshly ground black pepper to t...","Place the chicken, breast side down, on a cutting board. Remove the backbone and 1 inch of the b...",1 hr 5 mins,229 calories; protein 20.9g; fat 15.5g; cholesterol 61.5mg; sodium 82.7mg
61846,898,Instant Pot® Gyros,"4 pounds pork butt, cut into 2-inch cubes,3 tablespoons Greek seasoning (such as Cavender's®),1 ...",Turn on a multi-functional pressure cooker (such as Instant Pot®) and select the Saute function....,1 hr 5 mins,250 calories; protein 14.8g; carbohydrates 1.2g; fat 20.7g; cholesterol 65.2mg; sodium 1654.1mg
