In [0]:
import findspark
findspark.init()

In [0]:
import pyspark.sql.functions as sql_func
from pyspark.sql.types import *
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
from pyspark.ml.evaluation import RegressionEvaluator

sc = SparkContext('local') #https://stackoverflow.com/questions/30763951/spark-context-sc-not-defined
spark = SparkSession(sc)

In [0]:
import csv
import ast
fr = open('users.csv', 'r', encoding='utf-8')
fw = open('users2.csv', 'w', encoding='utf-8')
rdr = csv.reader(fr)
final_list = []
final_list.append(['user_id', 'funding_id', 'funding_name', 'category', 'backedAmount'])
for line in rdr:
    funding_list = ast.literal_eval(line[1])
    userid = line[0]
    for fund in funding_list:
      funding_id = fund[0]
      funding_name = fund[1]
      
      category = fund[2]
      backedAmount = fund[3]
      if int(userid) == 0:
          print(funding_id)
      if backedAmount == "###":
          backedAmount = 0
      if backedAmount == None:
          print(backedAmount)
      backedAmount = int(backedAmount)
      if backedAmount > 0:
          backedAmount = 1
      else:
          backedAmount = 0
      
      final_list.append([userid, funding_id, funding_name,category, backedAmount])
  
wtr = csv.writer(fw)
for row in final_list:
    wtr.writerow(row)
print(len(final_list))
fr.close()
fw.close()

92606


In [0]:
final_stat = spark.read.csv(
    'users2.csv', header=True, inferSchema=True
)
final_stat = final_stat.withColumn("backedAmount", final_stat["backedAmount"].cast("double"))
final_stat = final_stat.fillna({'backedAmount':0.0})
final_stat

DataFrame[user_id: int, funding_id: int, funding_name: string, category: string, backedAmount: double]

In [0]:
ratings = (final_stat
    .select(
        'user_id',
        'funding_id',
        'backedAmount',
    )
)
ratings

DataFrame[user_id: int, funding_id: int, backedAmount: double]

In [0]:
ratings.filter("backedAmount is NULL").show()

+-------+----------+------------+
|user_id|funding_id|backedAmount|
+-------+----------+------------+
+-------+----------+------------+



In [0]:
(training, test) = ratings.randomSplit([0.8, 0.2], seed=13)
training.show()
test.show()

+-------+----------+------------+
|user_id|funding_id|backedAmount|
+-------+----------+------------+
|   8001|     64361|         1.0|
|  17201|     64622|         1.0|
|  21101|     61741|         1.0|
| 185301|     63146|         1.0|
| 190201|     65019|         1.0|
| 190401|     64532|         0.0|
| 190401|     64532|         1.0|
| 210201|     61319|         0.0|
| 210201|     63641|         0.0|
| 210201|     63641|         1.0|
| 216201|     65751|         1.0|
| 230401|     63736|         1.0|
| 230401|     63736|         1.0|
| 322701|     64138|         0.0|
| 322701|     64138|         1.0|
| 346601|     64361|         0.0|
| 357901|     67334|         1.0|
| 364801|     66538|         1.0|
| 385601|     62180|         1.0|
| 385701|     65751|         1.0|
+-------+----------+------------+
only showing top 20 rows

+-------+----------+------------+
|user_id|funding_id|backedAmount|
+-------+----------+------------+
|  16401|     64622|         1.0|
|  34101|     65608|  

In [0]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(rank=50, maxIter=20, regParam=0.01, 
          userCol="user_id", itemCol="funding_id", ratingCol="backedAmount",
          coldStartStrategy="drop",
          implicitPrefs=False)
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="backedAmount",
                                predictionCol="prediction")

rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))


Root-mean-square error = 0.5854425881031863


In [0]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
userRecs.count()
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)
movieRecs.count()

536

In [0]:
userRecs.show()

+--------+--------------------+
| user_id|     recommendations|
+--------+--------------------+
|  441201|[[65614, 0.990398...|
| 8423901|[[63665, 0.992011...|
|12329801|[[64681, 1.168954...|
|14561301|[[64848, 0.984471...|
|15401901|[[61485, 0.990892...|
|19338701|[[62180, 0.989405...|
|20201901|[[62454, 1.16091]...|
|20568101|[[57211, 0.0], [5...|
|21747001|[[62728, 0.989998...|
|22592701|[[64711, 1.033966...|
|22896701|[[62454, 1.075294...|
|25169601|[[64672, 0.988291...|
|26009201|[[65263, 0.946537...|
|26484901|[[63616, 0.992462...|
|27501301|[[57211, 0.0], [5...|
|31317401|[[65331, 0.993791...|
|34254901|[[57211, 0.0], [5...|
|35146001|[[60667, 0.988737...|
|35937001|[[61525, 0.987707...|
|37230601|[[65928, 1.124818...|
+--------+--------------------+
only showing top 20 rows



In [0]:
movieRecs.count()

536