In [1]:
import pandas as pd
import numpy as np
import tqdm
import json
from pyspark.sql import SparkSession
import pyspark.sql.functions as spf
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances

In [2]:
# Start (or attach to an already running) Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("dnikanorova")
    .getOrCreate()
)

In [3]:
# Show the SparkContext backing the session created above.
# The bare name `sc` is never assigned in this notebook, so it only resolves
# when the kernel happens to inject it; reading it from `spark` always works.
spark.sparkContext

## User-based Collaborative Filtering

#### Основная идея: 
Рекомендовать пользователю треки, которые понравились похожим на него пользователям

$$\hat r_{ui} = h^{-1} \left( \frac{\sum_{v \in N_i(u)} w_{uv} h(r_{vi})}{\sum_{v \in N_i(u)} w_{uv}} \right)$$

$N_i(u)$ - соседи пользователя $u$, которые оценили айтем $i$, 
$w_{uv}, w_{ij}$ - веса соседей, 
$h$ - функция нормализации



**Нормализация**: В качестве функции нормализации используем среднее время прослушивания

**Веса**: Похожих пользователей будем искать по *cosine similarity*

**Отсутствующие данные** заполним средним временем прослушивания по пользователю


In [157]:
# Output directory prefix; it is string-concatenated with a file name when the
# recommendations are written, so the trailing slash is required.
DATA_DIR = "../data/"

In [158]:
# Load the JSON event log and print the schema Spark inferred for it.
data = spark.read.format("json").load("/user/dnikanorova/data/top_pop_25k_2k/")
data.printSchema()

root
 |-- experiments: struct (nullable = true)
 |    |-- TOP_POP: string (nullable = true)
 |    |-- USER_BASED: string (nullable = true)
 |-- latency: double (nullable = true)
 |-- message: string (nullable = true)
 |-- recommendation: long (nullable = true)
 |-- time: double (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- track: long (nullable = true)
 |-- user: long (nullable = true)



In [160]:
# Keep events from the T1/T2/T3 arms of the TOP_POP experiment, treat the
# recommended track as the item, and keep a single row per (user, track)
# before collecting the result into pandas.
in_top_pop_arm = spf.col("experiments.TOP_POP").isin("T1", "T2", "T3")
recommended_track = spf.col("recommendation").cast("int").alias("track")

df = (
    data
    .where(in_top_pop_arm)
    .select(spf.col("user"), spf.col("time"), recommended_track)
    .where(spf.col("track").isNotNull())
    .dropDuplicates(["user", "track"])
    .toPandas()
)
df.head()

Unnamed: 0,user,time,track
0,4234,0.65,32663
1,4222,0.1,1275
2,1687,0.0,42691
3,6887,0.0,31110
4,2072,0.04,5104


### Подготовка данных
На этом этапе соберем несколько вспомогательных датасетов:

1) ***norm*** - датасет с нормализованными данными

2) ***interactions_raw*** - матрица взаимодействий user-item 

3) ***interactions*** - матрица взаимодействий user-item с заполненными значениями

4) ***user_similarity_cosine*** - матрица похожести пользователей

5) ***sim_user_30_u*** - топ-30 ближайших соседей для пользователя

6) ***tracks_by_user*** - треки, прослушанные пользователями

In [161]:
class UserBased:
    """User-based collaborative filtering on mean-centred interaction values.

    All intermediate artefacts are built eagerly in ``__init__`` and kept as
    attributes:

    * ``norm``                -- copy of ``df`` with the per-user mean (``avg``)
                                 and the mean-centred value (``<value_col>_adj``)
    * ``user_avg``            -- per-user mean of ``value_col``, indexed by user
    * ``interactions``        -- user x item matrix of centred values (NaN = no
                                 interaction)
    * ``interactions_filled`` -- same matrix with NaN replaced by 0
    * ``cosine_similarity``   -- user x user similarity, diagonal forced to 0
    * ``tracks_by_user``      -- comma-joined item ids per user (kept for
                                 inspection; ``predict`` does not rely on it)
    * ``neighbours``          -- ids of the ``n_neighbours`` most similar users

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format interactions with one row per (user, item).
    value_col, user_col, item_col : str
        Column names of the interaction value, the user id and the item id.
    n_neighbours : int, default 30
        Number of nearest neighbours stored per user.
    """

    def __init__(self, df, value_col, user_col, item_col, n_neighbours=30):
        self.df = df
        self.user_col = user_col
        self.value_col = value_col
        self.value_adj_col = self.value_col + "_adj"
        self.item_col = item_col

        self.norm = self._normalize()
        self.user_avg = self._collect_user_avg()
        self.interactions = self._create_interactions()  # raw
        self.interactions_filled = self._fill_na()  # filled
        self.cosine_similarity = self._calculate_cosine_similarity()
        self.tracks_by_user = self._collect_tracks_by_user()
        self.neighbours = self._find_n_neighbours(n_neighbours)  # param

    def _normalize(self):
        """Return a copy of ``df`` with the per-user mean and centred value added.

        The centred column is named ``self.value_adj_col`` (``<value_col>_adj``)
        instead of a hard-coded ``'time_adj'``, so any value column works.
        """
        norm = self.df.copy()
        norm['avg'] = norm.groupby(self.user_col)[self.value_col].transform('mean')
        norm[self.value_adj_col] = norm[self.value_col] - norm['avg']
        return norm

    def _collect_user_avg(self):
        """Return the per-user mean of ``value_col`` as a Series indexed by user."""
        # 'avg' is constant within a user, so the first row of each group is enough.
        return self.norm.groupby(self.user_col)['avg'].first()

    def _create_interactions(self):
        """Pivot the centred values into a user x item matrix (NaN = no data)."""
        interactions = pd.pivot_table(
            self.norm,
            values=self.value_adj_col,
            index=self.user_col,
            columns=self.item_col,
        )

        print("Interaction matrix consists of {} users and {} items".format(interactions.shape[0], interactions.shape[1]))
        return interactions

    def _fill_na(self):
        """Return the interaction matrix with missing entries set to 0.

        In centred space 0 is the user's own mean, i.e. a neutral value.
        """
        return self.interactions.fillna(0)

    def _calculate_cosine_similarity(self):
        """Return the user x user cosine similarity with a zeroed diagonal."""
        filled = self.interactions_filled

        similarity_matrix = cosine_similarity(filled)
        # A user must not be picked as their own best neighbour.
        np.fill_diagonal(similarity_matrix, 0)

        return pd.DataFrame(similarity_matrix, index=filled.index, columns=filled.index)

    # collect tracks listened by users
    def _collect_tracks_by_user(self):
        """Return, per user, the comma-joined string of item ids they have."""
        items_as_text = self.df[self.item_col].astype(str)
        return items_as_text.groupby(self.df[self.user_col]).apply(lambda x: ','.join(x))

    # find most similar users
    def _find_n_neighbours(self, n):
        """Return a DataFrame (users x ``top1..topK``) of the most similar users.

        ``K`` is ``n`` capped at the number of users. Ties are broken by the
        column order of the similarity matrix (stable sort), so the result is
        deterministic.
        """
        similarity = self.cosine_similarity
        k = min(n, similarity.shape[1])

        # Negate so that an ascending stable sort yields descending similarity.
        order = np.argsort(-similarity.values, axis=1, kind="stable")[:, :k]
        neighbour_ids = similarity.columns.values[order]

        return pd.DataFrame(
            neighbour_ids,
            index=similarity.index,
            columns=['top{}'.format(i) for i in range(1, k + 1)],
        )

    # check interests
    def get_user_similar_tracks(self, user1, user2):
        """Return the rows of ``df`` for items that both users interacted with.

        The result is an inner merge on the item column, so the remaining
        columns carry pandas' ``_x`` (user1) / ``_y`` (user2) suffixes.
        """
        users = self.df[self.user_col]
        common_tracks = self.df[users == user1].merge(
            self.df[users == user2],
            on=self.item_col,
            how="inner",
        )

        return common_tracks

    # score tracks
    def predict(self, user, n):
        """Return up to ``n`` item ids recommended for ``user``.

        Candidates are items that at least one neighbour interacted with and
        ``user`` did not. The returned numpy array is ordered by ascending
        score, i.e. the best item is LAST.
        NOTE(review): confirm that consumers expect best-last ordering.

        Raises ``KeyError`` when ``user`` is not in the interaction matrix.
        """
        # get tracks already listened by user
        not_listened = self.interactions.loc[user, :].isna()

        # get similar users
        similar_users = self.neighbours.loc[user, :].values.tolist()

        # only take tracks from similar users that were not listened by the user
        rated_by_neighbours = self.interactions.loc[similar_users, :].notna().any(axis=0)
        candidates = self.interactions.columns[(rated_by_neighbours & not_listened).values]

        if n <= 0:
            # `[-0:]` would select everything, so handle "no items" explicitly.
            return candidates.values[:0]

        scores = [self._score_track(item, similar_users, user) for item in candidates]

        top = candidates.values[np.argsort(scores)[-n:]]

        return top

    def _score_track(self, item, similar_users, user):
        """Predict ``user``'s value for ``item`` from the neighbours' values.

        score = avg(user) + sum(w_uv * centred_value_vi) / sum(w_uv), taken over
        the neighbours ``v`` that have a value for ``item``. When the weights sum
        to zero there is no evidence, and the user's own average is returned
        instead of NaN/inf (NaN would otherwise sort last in ``argsort`` and be
        picked as a "top" item).
        NOTE(review): weights may be negative; the denominator follows the
        notebook's formula and is not taken in absolute value.
        """
        item_ratings_by_similars = self.interactions.loc[similar_users, item].dropna()
        similars_rated_item = item_ratings_by_similars.index.values.tolist()
        weights = self.cosine_similarity.loc[user, similars_rated_item]
        avg_user = self.user_avg.loc[user]

        denominator = weights.sum()
        if denominator == 0:
            return avg_user

        numerator = (item_ratings_by_similars * weights).sum()
        return avg_user + (numerator / denominator)

    

In [None]:
%%time
recommender = UserBased(df, 
    value_col='time',
    user_col="user" ,
    item_col="track",
    n_neighbours=30)

Interaction matrix consists of 7110 users and 2000 items


In [152]:
# Pull the recommender's intermediate artefacts into notebook-level names.
norm, tracks_by_user, neighbours = (
    recommender.norm,
    recommender.tracks_by_user,
    recommender.neighbours,
)
# Raw matrix keeps NaN for missing interactions; the filled one has them replaced.
interactions_raw, interactions = (
    recommender.interactions,
    recommender.interactions_filled,
)
user_similarity_cosine = recommender.cosine_similarity

In [153]:
# Preview the nearest-neighbour table (one row per user, top1..topN columns).
neighbours.head(n=5)

Unnamed: 0_level_0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10,...,top21,top22,top23,top24,top25,top26,top27,top28,top29,top30
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,9164,1432,442,4528,186,5136,632,8487,6123,4866,...,5099,7771,9791,4195,4992,4485,1010,4512,9366,6841
2,426,3480,738,9738,5489,2480,9871,7095,7231,9613,...,6333,4330,5907,3653,2050,9289,1627,757,1075,2874
5,698,416,8229,9652,1970,3443,165,2562,2212,6135,...,7380,7363,7114,5593,847,2950,7539,1043,8343,6841
8,6744,3383,3994,5033,7787,77,7512,5394,6820,9169,...,6262,1523,5367,5195,4357,3478,3456,3484,7901,2072
10,7789,1812,9385,8579,1680,9856,4608,6557,9655,4943,...,1117,4482,6629,4088,6682,377,2651,7086,6056,5620


In [154]:
# Sanity check: tracks shared by user 0 and 9164, the first neighbour listed for user 0.
sim_tracks = recommender.get_user_similar_tracks(user1=0, user2=9164)

In [155]:
# Show the first shared tracks; *_x columns belong to user1, *_y to user2.
sim_tracks.head(n=5)

Unnamed: 0,user_x,time_x,track,user_y,time_y
0,0,1.0,18307,9164,0.86
1,0,1.0,22020,9164,1.0


### Построение рекомендаций
На этом этапе рассчитаем скоры айтемов по формуле 

$$\hat r_{ui} = h^{-1} \left( \frac{\sum_{v \in N_i(u)} w_{uv} h(r_{vi})}{\sum_{v \in N_i(u)} w_{uv}} \right)$$

$N_i(u)$ - соседи пользователя $u$, которые оценили айтем $i$, 
$w_{uv}, w_{ij}$ - веса соседей, 
$h$ - функция нормализации

In [None]:
# Score every user and dump one JSON object per line: {"user": ..., "tracks": [...]}.
users = df['user'].unique()
output_path = DATA_DIR + "recommendations_2k.json"

with open(output_path, "w") as rf:
    for user in tqdm.tqdm(users):
        top = recommender.predict(user, 100)
        # numpy scalars are not JSON serialisable, hence int() / tolist().
        recommendation = dict(user=int(user), tracks=top.tolist())
        rf.write(json.dumps(recommendation) + "\n")
