# WSDM - KKBox's Music Recommendation Challenge

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load train data

In [2]:
# Read the gzip-compressed training CSV into a Spark DataFrame
from pyspark.sql import SQLContext

# NOTE(review): `sc` is not defined in the visible notebook; presumably the
# SparkContext injected by the PySpark kernel/shell. Confirm before Run All.
sqlContext = SQLContext(sc)

# First line is treated as the header and column types are inferred
df = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true')
    .load('./process/train.csv.gz')
)

# Preview the first five rows
df.show(5)

+-----+-------+-----------------+-------------------+---------------+------+
| msno|song_id|source_system_tab| source_screen_name|    source_type|target|
+-----+-------+-----------------+-------------------+---------------+------+
| 9176| 474849|          explore|            Explore|online-playlist|     1|
|19273|1425656|       my library|Local playlist more| local-playlist|     1|
|19273| 768950|       my library|Local playlist more| local-playlist|     1|
|19273| 150624|       my library|Local playlist more| local-playlist|     1|
| 9176| 210388|          explore|            Explore|online-playlist|     1|
+-----+-------+-----------------+-------------------+---------------+------+
only showing top 5 rows



### Let's first check whether the dataset is balanced
- The result shows that positive and negative examples occur in almost equal proportions
- So we don't need to rebalance the data

In [3]:
# Row counts: overall, and per value of the binary `target` label
total = df.count()
positive = df.where(df.target == 1).count()
negative = df.where(df.target == 0).count()

# Report each class as a fraction of all rows
print(
    "Positive: {} \nNegative: {}".format(
        float(positive) / total,
        float(negative) / total,
    )
)

Positive: 0.5035170841614234 
Negative: 0.49648291583857657


### Randomly split the data into train and eval sets
- Train: 0.8
- Eval : 0.2

In [4]:
# Split 80% / 20% into train and eval. The seed is fixed so that the same
# partition (and therefore the same evaluation metric) is produced on every run.
trainDF, evalDF = df.randomSplit([0.8, 0.2], seed=42)

## Load test data

In [5]:
# Read the gzip-compressed test CSV (header row present, column types inferred)
testDF = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true')
    .load('./process/test.csv.gz')
)

---
# Method 1: Collaborative Filtering

### We use only three columns to build the model
- msno: user_id
- song_id
- target: score

In [35]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# Build and train the model.
# The seed is fixed so that repeated runs train the same model.
als = ALS(maxIter=15, regParam=0.01, userCol="msno", itemCol="song_id",
          ratingCol="target", seed=42)
model = als.fit(trainDF)

# Predict on the held-out eval split (not the test set)
predictions = model.transform(evalDF)

# Replace NaN predictions with the neutral score 0.5.
# `subset` restricts the fill to the prediction column; without it every
# numeric column with missing values (ids, target) would be overwritten too.
predictions = predictions.fillna(0.5, subset=["prediction"])

# Evaluation: RMSE between the 0/1 target and the predicted score
evaluator = RegressionEvaluator(metricName="rmse", labelCol="target",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.47410587523568726


### Do the prediction

In [42]:
# Score the test set with the trained ALS model (no evaluation happens here:
# the rows are only scored, reduced to id/prediction and ordered by id).
resultDF_1 = (
    model.transform(testDF)
    .select(["id", "prediction"])
    # Rows the model could not score come back as NaN; use the same neutral
    # 0.5 fallback as the evaluation cell so the final result has no NaN.
    .fillna(0.5, subset=["prediction"])
    .sort('id')
)
resultDF_1.show()

+---+------------+
| id|  prediction|
+---+------------+
|  0|  0.38287437|
|  1|  0.26514986|
|  2| -0.21284494|
|  3|4.9685687E-4|
|  4|  0.16973872|
|  5|  0.45950592|
|  6| 0.120891124|
|  7|  0.60329115|
|  8|   0.4835725|
|  9|  0.79836446|
| 10|  0.83793026|
| 11|  0.25893503|
| 12|  0.24025439|
| 13|  0.42867026|
| 14|   0.2588826|
| 15|   0.2154228|
| 16|  0.26846784|
| 17|   0.4095208|
| 18|   0.7259852|
| 19|   0.8428296|
+---+------------+
only showing top 20 rows



---
# Method 2: Classification and Regression 

### Select and merge all user information from all data frames
- train, eval and test
    - msno
    - song_id
    - source_system_tab
    - source_screen_name
    - source_type
    - target
- user: 
    - msno
    - city
    - bd
    - gender
    - registered_via
- song: 
    - song_id
    - song_length : Processed by dividing to convert to minutes
    - genre_ids
    - artist_name
    - composer
    - lyricist
    - language
- song_extra:
    - song_id          
    - name      
    - isrc : Processed to extract the country code