# WSDM - Collaborative Filtering with ALS Matrix Factorization

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load the preprocessed data
- data: training dataset

In [2]:
# Load the gzip-compressed training CSV into a Spark DataFrame
from pyspark import SparkContext
from pyspark.sql import SQLContext

# Reuse the SparkContext when the kernel already provides one, otherwise start
# one, so this cell also runs on a fresh kernel where `sc` was never injected.
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

# Load data and have a look: the header row supplies the column names and the
# column types are inferred from the data.
df = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true')
    .load('./process/train.csv.gz')
)
df.show(5)

+-----+-------+-----------------+-------------------+---------------+------+
| msno|song_id|source_system_tab| source_screen_name|    source_type|target|
+-----+-------+-----------------+-------------------+---------------+------+
| 9176| 474849|          explore|            Explore|online-playlist|     1|
|19273|1425656|       my library|Local playlist more| local-playlist|     1|
|19273| 768950|       my library|Local playlist more| local-playlist|     1|
|19273| 150624|       my library|Local playlist more| local-playlist|     1|
| 9176| 210388|          explore|            Explore|online-playlist|     1|
+-----+-------+-----------------+-------------------+---------------+------+
only showing top 5 rows



### We keep only the three columns needed for the recommendations
- msno: user_id
- song_id
- target: score

In [3]:
# Narrow the frame down to the user id, song id and label columns
kept_columns = ['msno', 'song_id', 'target']
df = df.select(kept_columns)

# Preview the first five rows of the reduced frame
df.show(5)

+-----+-------+------+
| msno|song_id|target|
+-----+-------+------+
| 9176| 474849|     1|
|19273|1425656|     1|
|19273| 768950|     1|
|19273| 150624|     1|
| 9176| 210388|     1|
+-----+-------+------+
only showing top 5 rows



### Let's first check whether the dataset is balanced
- From the result, we find that positive and negative examples occur in almost equal proportions
- So we don't need to rebalance the data

In [4]:
# Row counts: the whole dataset, then each value of the binary label
label = df['target']
total = df.count()
positive = df.filter(label == 1).count()
negative = df.filter(label == 0).count()

# Express each class as a fraction of all rows and report both
positive_share = float(positive) / total
negative_share = float(negative) / total
print("Positive: {} \nNegative: {}".format(positive_share, negative_share))

Positive: 0.5035170841614234 
Negative: 0.49648291583857657


### Randomly split the data into train and eval
- Train: 0.8
- Eval : 0.2

In [5]:
# 80/20 train/eval split. An explicit seed is passed because randomSplit
# otherwise draws a new random seed on every call, which would change the
# split (and every metric computed from it) each time the notebook is re-run.
trainDF, testDF = df.randomSplit([0.8, 0.2], seed=42)

# Build the Recommendation Model

In [6]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

## Build and train the model

In [7]:
# Configure the ALS recommender on (msno, song_id, target) triples.
# coldStartStrategy is set to 'drop' so the evaluation metrics do not come out as NaN.
als = ALS(
    userCol="msno",
    itemCol="song_id",
    ratingCol="target",
    maxIter=5,
    regParam=0.01,
    coldStartStrategy="drop",
)

# Fit the recommender on the training split
model = als.fit(trainDF)

## Evaluate the model by computing the RMSE on the test data

In [8]:
# RMSE evaluator comparing the "prediction" column against the "target" label
evaluator = RegressionEvaluator(
    labelCol="target",
    predictionCol="prediction",
    metricName="rmse",
)

# Score the test split with the fitted model, then measure the error
predictions = model.transform(testDF)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.4760644035693233
