In [95]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import Row
import numpy as np
np.set_printoptions(threshold=np.inf)

In [2]:
spark = SparkSession \
.builder \
.appName("Python Spark SQL basic example") \
.config("spark.some.config.option", "some-value") \
.getOrCreate()

(a)

In [28]:
sc = spark.sparkContext

schema_usr_artist = ["userid", "artistid", "playcount"]
#schemaType_usr_artist = [StringType(), StringType(), IntegerType()]

lines = sc.textFile("datasets/user_artist_data_small.txt")
parts = lines.map(lambda l: l.split(" ")).filter(lambda x: len(x) == len(schema_usr_artist))
parts = parts.map(lambda p: [p[i].strip() for i in range(len(schema_usr_artist))])
fields = [StructField(field_name, StringType(), True) for field_name in schema_usr_artist]
schema = StructType(fields)

df_usr_artist = spark.createDataFrame(parts, schema)
df_usr_artist = df_usr_artist.withColumn("playcount", col("playcount").cast(IntegerType()))

In [31]:
print(df_usr_artist.count())
df_usr_artist.show(5)

49481
+-------+--------+---------+
| userid|artistid|playcount|
+-------+--------+---------+
|1059637| 1000010|      238|
|1059637| 1000049|        1|
|1059637| 1000056|        1|
|1059637| 1000062|       11|
|1059637| 1000094|        1|
+-------+--------+---------+
only showing top 5 rows



In [35]:
schema_artist_alias = ["badid", "goodid"]
lines = sc.textFile("datasets/artist_alias_small.txt")
parts = lines.map(lambda l: l.split("\t")).filter(lambda x: len(x) == len(schema_artist_alias))
parts = parts.map(lambda p: [p[i].strip() for i in range(len(schema_artist_alias))])
fields = [StructField(field_name, StringType(), True) for field_name in schema_artist_alias]
schema = StructType(fields)

df_artist_alias = spark.createDataFrame(parts, schema)
print(df_artist_alias.count())
df_artist_alias.show(5)

587
+-------+-------+
|  badid| goodid|
+-------+-------+
|1027859|1252408|
|1017615|    668|
|6745885|1268522|
|1018110|1018110|
|1014609|1014609|
+-------+-------+
only showing top 5 rows



In [74]:
cond = [df_artist_alias.badid == df_usr_artist.artistid]
df_artist_alias_new = df_artist_alias.join(df_usr_artist, cond, 'inner').select("userid", "goodid", "playcount")\
    .withColumnRenamed("goodid", "artistid")

In [75]:
df_artist_alias_new = df_artist_alias_new.toPandas()
df_artist_alias_new

Unnamed: 0,userid,artistid,playcount
0,2023686,1056296,2
1,1052461,1341919,7
2,2010008,1326,1
3,2010008,1256115,6
4,2010008,1238269,9
...,...,...,...
736,2010008,1238478,3
737,2023686,2114258,3
738,1026084,2114258,5
739,1048402,153,1


In [140]:
import pandas as pd
rows = list(df_artist_alias_new["artistid"].unique())
counts = list(df_artist_alias_new["userid"].unique())

In [141]:
utility_matrix = np.zeros((len(rows), len(counts)))

In [142]:
for i in range(len(df_artist_alias_new.index)):
    col = counts.index(df_artist_alias_new.iloc[i][0])
    row = rows.index(df_artist_alias_new.iloc[i][1])
    utility_matrix[row][col] = df_artist_alias_new.iloc[i][2]
    

In [143]:
df_utility_matrix_item_user = pd.DataFrame(data=utility_matrix, index=rows, columns=counts) 
df_utility_matrix_item_user

Unnamed: 0,2023686,1052461,2010008,1031009,1024631,1041919,1026084,1059245,1007308,1035511,...,1047812,1021501,2000668,2069889,2064012,2017397,2070757,1070641,2102019,1076906
1056296,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1341919,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1326,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1256115,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1238269,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1006837,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1238056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1238478,2.0,0.0,3.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2114258,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [145]:
def k_neighborhood(column1, k):
    df_utility_matrix_item_user.corr()[column1].sort_values(ascending=False)[1:k + 1]

In [156]:
S = ["1240105", "1240113", "1240132", "6776115", "1030848"]
usr = df_utility_matrix_item_user["1029563"]

1056296    0.0
1341919    0.0
1326       0.0
1256115    0.0
1238269    0.0
          ... 
1006837    0.0
1238056    1.0
1238478    0.0
2114258    0.0
153        0.0
Name: 1029563, Length: 486, dtype: float64
