## Calcular modelo de `Who X also X`
Cálculo do modelo de estudo baseado na interação de visualização de produtos num e-commerce, criando assim um modelo de recomendação `quem-viu-também-viu`.

In [1]:
# Import used to load Spark; this cell runs before any pyspark import in the notebook
import findspark
findspark.init()

### Importação das bibliotecas


In [2]:
import pandas as pd

from pyspark.sql.functions import *
from pyspark.sql import SparkSession
 
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window

# Configure the session builder (master "local", fixed application name),
# then create the session or reuse one that already exists.
session_builder = SparkSession.builder.master("local").appName("bart-calcullate-wxax")
spark = session_builder.getOrCreate()

### Calculando Recomendações

In [3]:
# Constants
# Value of `action_id` that selects the interactions used to build the model.
action_type = 1
# Column(s) the recommendations are partitioned and grouped by.
attributes = ['source_item_id']
# Co-occurrence threshold: an item pair is kept only when its count is
# strictly greater than this value.
min_occurrence = 2
# Upper bound on the `rank` of a recommendation kept within each partition.
max_recommendations = 200

In [4]:
# Load the dataset: read the CSV with pandas, hand the resulting frame to
# Spark, then print the schema Spark derived for it.
interactions_pdf = pd.read_csv("./interactions.csv")
actions = spark.createDataFrame(interactions_pdf)
actions.printSchema()

root
 |-- Unnamed: 0: long (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- action_id: long (nullable = true)
 |-- timestamp: double (nullable = true)



In [5]:
# Inputs
# Both sides of the self-join are the same filtered interactions; only the
# alias differs, so the two product_id columns can be told apart afterwards.
selected_actions = actions.where(actions.action_id == action_type)
source_actions = selected_actions.alias('source')
target_actions = selected_actions.alias('target')

# Pair every two different products that share a customer_id.
product_pairs = source_actions.join(
    target_actions, on='customer_id', how='inner'
).where('source.product_id <> target.product_id')

# Number of joined rows for each (source product, target product) pair.
pair_counts = product_pairs.groupBy(
    'source.product_id', 'target.product_id'
).count()

# NOTE(review): the comparison is strict, so pairs whose count equals
# min_occurrence are dropped -- confirm this is the intended threshold.
raw_recommendations = pair_counts.where(
    col('count') > min_occurrence
).selectExpr(
    'source.product_id as source_item_id',
    'target.product_id as recommended_item_id',
    'count as score',
)

### Recomendação de Conteúdo para o Usuário

In [6]:
# Filter recommendations
# Keep at most `max_recommendations` rows per `attributes` partition, best
# score first.
#
# row_number() is used instead of dense_rank(): dense_rank() assigns the same
# rank to tied scores, so `rank <= max_recommendations` would let through an
# unbounded number of rows whenever scores tie (scores here are counts, so
# ties are expected). row_number() numbers rows 1..N within the partition,
# which makes the cap exact.
#
# The secondary sort key on recommended_item_id makes the choice among tied
# scores deterministic across runs.
window = Window.partitionBy(*attributes).orderBy(
    raw_recommendations.score.cast(DoubleType()).desc(),
    raw_recommendations.recommended_item_id.asc(),
)
limit_recommendations = raw_recommendations.withColumn(
    'rank', row_number().over(window)
).filter(f'rank <= {max_recommendations}')

In [7]:
# Group recommendations
# Each recommendation is packed as a (recommended_item_id, score) struct, with
# the score cast to double.
recommendation_entry = struct(
    limit_recommendations['recommended_item_id'],
    limit_recommendations['score'].cast(DoubleType()).alias('score'),
)

# One output row per `attributes` key, carrying the set of its recommendations.
# NOTE(review): collect_set does not guarantee element order -- confirm that
# consumers do not rely on the entries being sorted by score.
group_by = limit_recommendations.groupBy(*attributes)
group_recommendations = group_by.agg(
    collect_set(recommendation_entry).alias('recommendations')
)

In [8]:
# Preview the first rows of the grouped recommendations
preview_rows = 5
group_recommendations.show(preview_rows)

+--------------+--------------------+
|source_item_id|     recommendations|
+--------------+--------------------+
|      Dce730CE|[[39D90E8F, 1351....|
|      a827F4CD|[[2E305BAe, 1547....|
|      24d1E961|[[4b15bf94, 1517....|
|      aFaAe1F4|[[CCcd595A, 1464....|
|      3cDfe90c|[[FA1CacBa, 1465....|
+--------------+--------------------+
only showing top 5 rows

