# Modules

In [83]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from time import time

# Data

## Data users

In [2]:
data_users= pd.read_csv("users.csv")

In [3]:
data_users.rename(columns={"UserRanking": "UserRating", "Ranking": "Rating"}, inplace=True)

In [4]:
data_users.head()

Unnamed: 0,UserName,GameId,GameName,Status,UserRating,Rating
0,gandalfgrismx,148494,"1,2,3! Now you see me...",1,7.0,6.06084
1,gandalfgrismx,316377,7 Wonders (Second Edition),1,,7.95069
2,gandalfgrismx,155987,Abyss,1,10.0,7.32909
3,gandalfgrismx,21569,Adigma,1,7.0,7.52585
4,gandalfgrismx,31260,Agricola,1,10.0,7.90633


# Preprocesing

## Users

In [5]:
prepo_users= data_users.copy()

### Create id user

In [6]:
dc_userId={key:value for (value,key) in enumerate(list(prepo_users["UserName"].unique()))}

### Missings

In [7]:
ls_userRating=[]
ls_userId=[]
for index,row in prepo_users.iterrows():
    rating= row["UserRating"]
    name= row["UserName"]
    if(pd.isna(rating)):
        ls_userRating.append(row["Rating"])
    else:
        ls_userRating.append(rating)
    ls_userId.append(dc_userId[name])
prepo_users["UserRating"]= ls_userRating
prepo_users["UserId"] = ls_userId

In [8]:
prepo_users.sample(5)

Unnamed: 0,UserName,GameId,GameName,Status,UserRating,Rating,UserId
322,Kakiri,165950,Beasty Bar,1,6.329,6.81474,3
1834,aleck22,286096,Tapestry,1,8.0,7.44495,10
4214,Liboo,95386,Tem-Purr-A,1,6.04165,6.04165,27
2479,dieterminator,303057,Pan Am,1,9.0,7.57719,17
3669,KaeruGames,229220,Santa Maria,1,7.50058,7.50058,24


In [9]:
prepo_users["Status"].value_counts()/len(prepo_users)

1    0.959539
0    0.040461
Name: Status, dtype: float64

In [10]:
df_users= prepo_users[["UserId", "GameId", "UserRating", "Status"]].copy()

# Recomendation system

## Pyspark setup

### Session

In [58]:
sc = pyspark.SparkContext(appName ="clase_datos_masivos")

In [59]:
# local[*] usará todos los núcleos disponibles 
spark = SparkSession.builder\
                    .master('local[*]') \
                    .appName('first_spark') \
                    .getOrCreate()

### Get data

In [60]:
py_users= spark.createDataFrame(df_users)

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


In [61]:
py_users.show()

+------+------+----------+------+
|UserId|GameId|UserRating|Status|
+------+------+----------+------+
|     0|148494|       7.0|     1|
|     0|316377|   7.95069|     1|
|     0|155987|      10.0|     1|
|     0| 21569|       7.0|     1|
|     0| 31260|      10.0|     1|
|     0|161970|   7.66002|     1|
|     0|124742|      10.0|     1|
|     0| 17329|       7.0|     1|
|     0| 25643|       9.0|     1|
|     0| 12005|       6.0|     1|
|     0|230802|       8.0|     1|
|     0|287954|       9.0|     1|
|     0|302388|       8.0|     1|
|     0| 27225|       7.0|     1|
|     0|  2453|       8.0|     1|
|     0|231696|       6.0|     1|
|     0|332386|       8.0|     1|
|     0|184921|       8.0|     1|
|     0|   822|       8.0|     1|
|     0|164127|       9.0|     1|
+------+------+----------+------+
only showing top 20 rows



                                                                                

In [62]:
py_users= py_users.select(py_users.UserId.cast("integer"), py_users.GameId.cast("integer"), py_users.UserRating.cast("double"), py_users.Status.cast("integer"))
py_users.printSchema()

root
 |-- UserId: integer (nullable = true)
 |-- GameId: integer (nullable = true)
 |-- UserRating: double (nullable = true)
 |-- Status: integer (nullable = true)



In [63]:
py_users.createOrReplaceTempView("Users")

## Train test

In [64]:
# Una predicción que NO está
query = " select * from Users where Status =1"
py_own= spark.sql(query)

In [65]:
(train, test) = py_own.randomSplit([0.8, 0.2], seed = 202012)

## ALS

In [19]:
als= ALS(userCol="UserId", itemCol="GameId", ratingCol= "UserRating", coldStartStrategy='drop')
model= als.fit(py_own)

22/11/23 20:08:54 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/11/23 20:08:54 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/11/23 20:08:54 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [20]:
predictions= model.transform(test)

In [21]:
predictions.show()

+------+------+----------+------+----------+
|UserId|GameId|UserRating|Status|prediction|
+------+------+----------+------+----------+
|     1|128475|       6.0|     1|  6.087182|
|     1|169786|      10.0|     1|  9.495859|
|     1|176458|       7.0|     1|  6.817619|
|     1|193308|       8.0|     1| 7.9615846|
|     1|223770|       8.0|     1| 7.9200544|
|     1|234190|       6.0|     1|  6.182122|
|     1|246693|       7.0|     1|  6.986144|
|     1|246912|       7.0|     1| 7.2369866|
|     1|256479|       6.0|     1| 5.9881234|
|     1|279720|       7.5|     1| 7.4851546|
|     1|291457|       8.5|     1|  8.441986|
|     1|295947|       8.5|     1|  8.532007|
|     1|307997|       8.0|     1|  7.984164|
|     1|338460|       8.0|     1|  7.984164|
|     3|    13|     8.049|     1|   7.91313|
|     3|   188|     6.529|     1| 6.7363462|
|     3|  1339|     7.509|     1|  7.506297|
|     3|  7688|      4.19|     1| 5.3913918|
|     3|  8126|   6.28219|     1| 6.2799277|
|     3|  

In [22]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="UserRating", predictionCol="prediction")

In [23]:
RMSE_1 = evaluator.evaluate(predictions)
print (RMSE_1)

0.21815614854317808


### Search grid

In [24]:
als_grid = ALS(userCol="UserId", itemCol="GameId", ratingCol="UserRating", nonnegative = False, implicitPrefs = False, coldStartStrategy="drop")

In [25]:
param_grid = ParamGridBuilder() \
            .addGrid(als_grid.rank, [10, 20, 50, 100]) \
            .addGrid(als_grid.maxIter, [10, 15, 20, 25]) \
            .addGrid(als_grid.regParam, [.01, .05, .1, 1, 10]) \
            .build()

In [26]:
evaluator_grid = RegressionEvaluator(metricName="rmse", labelCol="UserRating", predictionCol="prediction")

In [27]:
cv = CrossValidator(estimator=als_grid, estimatorParamMaps=param_grid, evaluator=evaluator_grid, numFolds=5)

In [28]:
start_time = time()

model_grid = cv.fit(py_own)

end_time = time()
elapsed_time = end_time - start_time
print("Elapsed time: %.10f seconds." % elapsed_time)

                                                                                

Elapsed time: 1317.5082001686 seconds.


### Best model

In [29]:
best_model = model_grid.bestModel

In [30]:
print("  Rank:", best_model._java_obj.parent().getRank())
print("  MaxIter:", best_model._java_obj.parent().getMaxIter())
print("  RegParam:", best_model._java_obj.parent().getRegParam())

  Rank: 10
  MaxIter: 25
  RegParam: 0.1


In [66]:
als= ALS(userCol="UserId", itemCol="GameId", ratingCol= "UserRating",coldStartStrategy='drop', rank=10, maxIter=25, regParam=0.1)
best_model= als.fit(py_own)

### Validation data

In [67]:
query = " select * from Users where Status =0"
py_want= spark.sql(query)

In [68]:
validation= best_model.transform(py_want)

In [69]:
validation.show()

+------+------+----------+------+----------+
|UserId|GameId|UserRating|Status|prediction|
+------+------+----------+------+----------+
|     3|328479|   7.34765|     0|  6.628567|
|    17|242574|   7.15744|     0| 7.8516545|
|    19| 25292|   7.39475|     0|  8.076303|
|    17|157403|       8.0|     0| 6.3622165|
|    19|  9217|   7.27961|     0|   6.06106|
|    19|103885|   7.65565|     0|  7.432224|
|    17|280032|       8.0|     0|  6.764351|
|    17|291457|       7.0|     0|   7.49539|
|    19|146278|   7.16606|     0| 6.8624606|
|    19|172818|   7.38501|     0|  7.184277|
|    19|123260|   7.49669|     0| 7.7721505|
|    26|248900|   7.00432|     0| 6.6724176|
|    17|269144|       9.0|     0| 7.6954675|
|    19|205637|   8.15033|     0|  8.174817|
|    19|209010|   7.99829|     0|  8.337383|
|    19| 98229|   7.27087|     0|  6.144598|
|    17|127784|       8.0|     0|  5.496367|
|    19|   760|   7.41117|     0| 7.2617745|
|    19|146021|   7.76739|     0| 7.3819532|
|    17|28

In [37]:
RMSE = evaluator.evaluate(validation)

In [38]:
print(RMSE)

0.9672019239705215


## Save Model

In [46]:
path = "als_model"

In [51]:
best_model.save("best.model")

In [79]:
same=ALSModel.load("best.model")

In [84]:
spark.stop()