# **WELCOME TO THIS NOTEBOOK**

In [29]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Let's install pyspark

In [30]:
!pip install pyspark



Importing the modules

In [31]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, desc , col, max
from pyspark.ml.feature import  StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

Creating the spark session


In [32]:
spark = SparkSession.builder.appName("lastfm").getOrCreate()

# Loading the dataset

In [33]:
file_path = '/content/drive/MyDrive/datafiles/dataset/listenings.csv'
df_listening = spark.read.format('csv').option('header',True).option('inferSchema',True).load(file_path)
df_listening.show()

+-----------+-------------+--------------------+---------------+--------------------+
|    user_id|         date|               track|         artist|               album|
+-----------+-------------+--------------------+---------------+--------------------+
|000Silenced|1299680100000|           Price Tag|       Jessie J|         Who You Are|
|000Silenced|1299679920000|Price Tag (Acoust...|       Jessie J|           Price Tag|
|000Silenced|1299679440000|Be Mine! (Ballad ...|          Robyn|            Be Mine!|
|000Silenced|1299679200000|            Acapella|          Kelis|            Acapella|
|000Silenced|1299675660000|   I'm Not Invisible|      The Tease|   I'm Not Invisible|
|000Silenced|1297511400000|Bounce (Feat NORE...|       MSTRKRFT|         Fist of God|
|000Silenced|1294498440000|Don't Stop The Mu...|        Rihanna|Addicted 2 Bassli...|
|000Silenced|1292438340000|               ObZen|      Meshuggah|               ObZen|
|000Silenced|1292437740000|   Yama's Messengers|      


# Cleaning tables 

In [34]:
df_listening = df_listening.drop('date')
df_listening.show()

+-----------+--------------------+---------------+--------------------+
|    user_id|               track|         artist|               album|
+-----------+--------------------+---------------+--------------------+
|000Silenced|           Price Tag|       Jessie J|         Who You Are|
|000Silenced|Price Tag (Acoust...|       Jessie J|           Price Tag|
|000Silenced|Be Mine! (Ballad ...|          Robyn|            Be Mine!|
|000Silenced|            Acapella|          Kelis|            Acapella|
|000Silenced|   I'm Not Invisible|      The Tease|   I'm Not Invisible|
|000Silenced|Bounce (Feat NORE...|       MSTRKRFT|         Fist of God|
|000Silenced|Don't Stop The Mu...|        Rihanna|Addicted 2 Bassli...|
|000Silenced|               ObZen|      Meshuggah|               ObZen|
|000Silenced|   Yama's Messengers|         Gojira|The Way of All Flesh|
|000Silenced|On the Brink of E...|   Napalm Death|Time Waits For No...|
|000Silenced|On the Brink of E...|   Napalm Death|Time Waits For

In [35]:
df_listening =df_listening.na.drop()
df_listening.show()

+-----------+--------------------+---------------+--------------------+
|    user_id|               track|         artist|               album|
+-----------+--------------------+---------------+--------------------+
|000Silenced|           Price Tag|       Jessie J|         Who You Are|
|000Silenced|Price Tag (Acoust...|       Jessie J|           Price Tag|
|000Silenced|Be Mine! (Ballad ...|          Robyn|            Be Mine!|
|000Silenced|            Acapella|          Kelis|            Acapella|
|000Silenced|   I'm Not Invisible|      The Tease|   I'm Not Invisible|
|000Silenced|Bounce (Feat NORE...|       MSTRKRFT|         Fist of God|
|000Silenced|Don't Stop The Mu...|        Rihanna|Addicted 2 Bassli...|
|000Silenced|               ObZen|      Meshuggah|               ObZen|
|000Silenced|   Yama's Messengers|         Gojira|The Way of All Flesh|
|000Silenced|On the Brink of E...|   Napalm Death|Time Waits For No...|
|000Silenced|On the Brink of E...|   Napalm Death|Time Waits For


# Let's Perform some aggregation
to see how many times each user has listened to specific track


In [36]:
row_number=df_listening.count()
col_number=len(df_listening.columns)
print(row_number,col_number)

13758905 4


In [37]:
df_listening_agg = df_listening.select('user_id','track').groupby('user_id','track').agg(count('*').alias('count')).orderBy('user_id')
df_listening_agg.show()

+-------+--------------------+-----+
|user_id|               track|count|
+-------+--------------------+-----+
| --Seph|Chelsea Hotel - L...|    1|
| --Seph|               Leloo|    1|
| --Seph|          Paris 2004|    7|
| --Seph|     The Way We Were|    1|
| --Seph|        Window Blues|    1|
| --Seph|   Summa for Strings|    1|
| --Seph|         The Embrace|    1|
| --Seph|       Life On Mars?|    1|
| --Seph|Hungarian Rhapsod...|    1|
| --Seph| Air on the G String|    1|
| --Seph|Vestido Estampado...|    1|
| --Seph|Belina (Original ...|    1|
| --Seph|Hungarian Dance No 5|    1|
| --Seph|       Phantom Pt II|    1|
| --Seph|              Monday|    1|
| --Seph| White Winter Hymnal|    3|
| --Seph|Airplanes [feat H...|    1|
| --Seph|  California Waiting|    1|
| --Seph|      Hour for magic|    2|
| --Seph|Virus (Luke Fair ...|    1|
+-------+--------------------+-----+
only showing top 20 rows



In [38]:
row_numbers =df_listening_agg.count()
column_numbers =len(df_listening_agg.columns)
print(row_numbers,column_numbers)

9930128 3


In [39]:
df_listening_agg = df_listening_agg.limit(20000)

# Let's convert the user id and track columns into unique integers




In [40]:
indexer = [StringIndexer(inputCol=col,outputCol = col+'_index').fit(df_listening_agg)for col in list(set(df_listening_agg.columns)-set(['count']))]
pipline =Pipeline(stages=indexer)
data=pipline.fit(df_listening_agg).transform(df_listening_agg)
data.show()

+-------+--------------------+-----+-------------+-----------+
|user_id|               track|count|user_id_index|track_index|
+-------+--------------------+-----+-------------+-----------+
| --Seph| White Winter Hymnal|    3|         69.0|       59.0|
| --Seph|Virus (Luke Fair ...|    1|         69.0|    15896.0|
| --Seph|Airplanes [feat H...|    1|         69.0|      519.0|
| --Seph|Belina (Original ...|    1|         69.0|     3278.0|
| --Seph|              Monday|    1|         69.0|      334.0|
| --Seph|Hungarian Dance No 5|    1|         69.0|     7558.0|
| --Seph|       Life On Mars?|    1|         69.0|     1161.0|
| --Seph|  California Waiting|    1|         69.0|      197.0|
| --Seph|       Phantom Pt II|    1|         69.0|     1377.0|
| --Seph|   Summa for Strings|    1|         69.0|    13739.0|
| --Seph|      Hour for magic|    2|         69.0|     7495.0|
| --Seph|Hungarian Rhapsod...|    1|         69.0|     7559.0|
| --Seph|     The Way We Were|    1|         69.0|    1

In [41]:
data =data.select('user_id_index','track_index','count').orderBy('user_id_index')
data.show()

+-------------+-----------+-----+
|user_id_index|track_index|count|
+-------------+-----------+-----+
|          0.0|    10943.0|    1|
|          0.0|    11628.0|    2|
|          0.0|     1349.0|    1|
|          0.0|      381.0|    1|
|          0.0|     8692.0|    1|
|          0.0|     6899.0|    1|
|          0.0|    14044.0|    1|
|          0.0|    15513.0|    1|
|          0.0|    11978.0|    2|
|          0.0|    15176.0|    1|
|          0.0|     8305.0|    1|
|          0.0|    13722.0|    1|
|          0.0|    10620.0|    1|
|          0.0|     4424.0|    1|
|          0.0|    16732.0|    1|
|          0.0|    10630.0|    1|
|          0.0|    12169.0|    1|
|          0.0|     4117.0|    1|
|          0.0|    10336.0|    1|
|          0.0|    16829.0|    1|
+-------------+-----------+-----+
only showing top 20 rows



# Train and Test data

In [42]:
(training,testing)=data.randomSplit([0.5,0.5])

# Let's Create our Model

In [43]:
USERID='user_id_index'

TRACK ='track_index'
COUNT='count'
als=ALS(maxIter=5,regParam=0.01,userCol=USERID,itemCol=TRACK,ratingCol=COUNT)
model=als.fit(training)

predictions=model.transform(testing)


# Generate top 10 Track recommendations for each user

In [44]:
recs = model.recommendForAllUsers(10)

In [45]:
recs.show()

+-------------+--------------------+
|user_id_index|     recommendations|
+-------------+--------------------+
|          148|[[16909, 17.74880...|
|           31|[[11941, 37.65725...|
|           85|[[11941, 27.35186...|
|          137|[[11941, 34.99787...|
|           65|[[11941, 35.65268...|
|           53|[[11941, 14.79691...|
|          133|[[8496, 15.997536...|
|           78|[[568, 6.87129], ...|
|          108|[[14301, 8.297548...|
|           34|[[11941, 8.73956]...|
|          101|[[16628, 8.97987]...|
|          115|[[11941, 19.74026...|
|          126|[[568, 13.980038]...|
|           81|[[568, 8.82521], ...|
|           28|[[11941, 39.91378...|
|           76|[[309, 9.459379],...|
|           26|[[235, 11.07777],...|
|           27|[[16909, 11.47072...|
|           44|[[16909, 14.72031...|
|          103|[[16909, 17.60501...|
+-------------+--------------------+
only showing top 20 rows



In [46]:
recs.take(1)

[Row(user_id_index=148, recommendations=[Row(track_index=16909, rating=17.748804092407227), Row(track_index=9500, rating=13.607417106628418), Row(track_index=4619, rating=8.573165893554688), Row(track_index=9557, rating=8.573165893554688), Row(track_index=10645, rating=8.573165893554688), Row(track_index=8496, rating=8.573165893554688), Row(track_index=11878, rating=8.573165893554688), Row(track_index=3150, rating=7.501521110534668), Row(track_index=8821, rating=7.230241298675537), Row(track_index=9101, rating=7.178316116333008)])]