In [20]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pyarrow as pa

In [21]:
DATA_DIR = 'datasets/lastfm-dataset-360K/'

In [22]:
df = pd.read_csv(DATA_DIR + 'usersha1-artmbid-artname-plays.tsv', sep='\t', header=None)

In [23]:
df.head()

Unnamed: 0,0,1,2,3
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706


In [24]:
import findspark
findspark.init() 

In [25]:
from pyspark.sql import SparkSession

spark = (
    SparkSession
    .builder
    .config('spark.driver.memory', '8G')
    .config('spark.sql.analyzer.failAmbiguousSelfJoin', 'False')
    .master('local[*]')
    .getOrCreate()
)

In [26]:
import os
import pyspark.sql.functions as sql_func

plays = (
    spark
    .read
    .csv(
        os.path.join(DATA_DIR, 'usersha1-artmbid-artname-plays.tsv'),
        header=False,
        inferSchema=True,
        sep='\t'
    )
    .sample(withReplacement=False, fraction=1.0, seed=42)
    # имена колонкам можно узнать из файла README.txt
    .toDF('user', 'artist', 'artist_name', 'plays')
    .where('LENGTH(artist) >= 35')
    .where('LENGTH(user) == 40')
    .where('plays IS NOT NULL')
    .drop('artist_name')
)

In [27]:
# построим индексы пользователей и исполнителей
user_index = (
    plays
    .select('user')
    .distinct()
    .coalesce(1)
    .select(
        'user',
        sql_func.monotonically_increasing_id().alias('user_id')
    )
)
artist_index = (
    plays
    .select('artist')
    .distinct()
    .coalesce(1)
    .select(
        'artist',
        sql_func.monotonically_increasing_id().alias('artist_id')
    )
)

In [28]:
# перекодируем наш набор данных с использованием
# числовых инлексов вместо строчных
triples = (
    plays
    .join(user_index, 'user')
    .join(artist_index, 'artist')
    .select(
        'user_id',
        'artist_id',
        sql_func.log(
            1 + sql_func.col('plays')
        ).alias('plays')
    )
    .cache()
)
triples.show(n=5)

+-------+---------+-----------------+
|user_id|artist_id|            plays|
+-------+---------+-----------------+
|   3932|      221|5.303304908059076|
|   9048|      221|4.543294782270004|
|  15430|      221|5.049856007249537|
|  16664|      221|6.037870919922137|
|  22832|      221|4.990432586778736|
+-------+---------+-----------------+
only showing top 5 rows



In [29]:
# сох раняем тройки на диск для дальнейшего использования
(
    triples
    .write
    .mode('overwrite')
    .parquet('models/user_item_lastfm.parquet')
)

Py4JJavaError: An error occurred while calling o227.parquet.
: java.lang.UnsatisfiedLinkError: 'void org.apache.hadoop.io.nativeio.NativeIO$Windows.createDirectoryWithMode0(java.lang.String, int)'
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.createDirectoryWithMode0(Native Method)
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.createDirectoryWithMode(NativeIO.java:560)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkOneDirWithMode(RawLocalFileSystem.java:534)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:587)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:559)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:586)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:559)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:586)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:559)
	at org.apache.hadoop.fs.ChecksumFileSystem.mkdirs(ChecksumFileSystem.java:705)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.setupJob(FileOutputCommitter.java:354)
	at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.setupJob(HadoopMapReduceCommitProtocol.scala:163)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:173)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:178)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:108)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:106)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:131)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:127)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:126)
	at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:962)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:767)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:962)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:414)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:398)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:287)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:847)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:64)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:564)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:832)


In [30]:
# соебрем индексы в виде python объектов
i = triples.select('user_id').rdd.map(lambda row: row.user_id).collect()
j = triples.select('artist_id').rdd.map(lambda row: row.artist_id).collect()
data = triples.select('plays').rdd.map(lambda row: row.plays).collect()
# больше Spark не нужен
spark.stop()

In [31]:
print('номера строк [индексы пользователей]:', i[:5])
print('номера столбцов [индексы исполнителей]:', j[:5])
print('Элементы матрицы [количество прослушиваний]:', data[:5])

номера строк [индексы пользователей]: [3932, 9048, 15430, 16664, 22832]
номера столбцов [индексы исполнителей]: [221, 221, 221, 221, 221]
Элементы матрицы [количество прослушиваний]: [5.303304908059076, 4.543294782270004, 5.049856007249537, 6.037870919922137, 4.990432586778736]


In [32]:
from scipy.sparse import coo_matrix
import numpy as np
import gc

In [33]:
# соберем разреженную матрицу из полученных значений
user_item_matrix = coo_matrix((data, (i, j)), dtype=np.float16)
# от самых данных мы освободимся, чтобы не занимать лишнюю память
del i, j, data
gc.collect()

368

In [34]:
# разреженные матрицы долстаточно удобный объект
print('размерность матрицы', user_item_matrix.shape)
print('количество ненудевых элементов', user_item_matrix.nnz)
print('доля ненулевых элементов:',
      user_item_matrix.nnz / user_item_matrix.shape[0] / user_item_matrix.shape[1])

размерность матрицы (359337, 160163)
количество ненудевых элементов 17332977
доля ненулевых элементов: 0.00030116814091700147
