In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from faker import Faker

# custom
import utils

# imports re for text cleaning
import re
from datetime import datetime, timedelta, date

# we will ignore pandas warning
import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

## Generating dataframes

In [5]:
# all lightfm imports 
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm import cross_validation
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

from sklearn.model_selection import train_test_split

In [7]:
df, inns, inn2group, group2name = utils.make_interactions_dataset(
    num_transactions=5000,
    num_inns=80,
    num_groups=10
)

df_train, df_test = train_test_split(df, test_size=0.2)

# dropping duplicate interactions from test which are present in train
train_kt_dt_set = set(zip(df_train['inn_kt'], df_train['inn_dt']))
df_test = df_test[~df_test[['inn_kt', 'inn_dt']].apply(tuple, axis=1).isin(train_kt_dt_set)]
df_test = df_test.reset_index(drop=True)

df.head()

Unnamed: 0,id_trans,inn_kt,inn_dt,c_sum,date,nazn,kt_group_num,dt_group_num,kt_group_name,dt_group_name
0,1,4153186486,8766161799,179954.68,2021-09-25,овощи,10,10,analysis,analysis
1,2,7807062498,4826977470,56154.63,2021-11-02,фрукты,1,1,yet,yet
2,3,2489620450,9411708365,38962.65,2022-04-12,серсо,8,7,half,resource
3,4,7387179590,3301277896,248720.61,2023-09-26,серсо,4,4,couple,couple
4,5,3745811688,3567683946,51635.07,2021-08-12,колесо,3,9,school,fall


In [8]:
kt_feature_list = utils.generate_feature_list(df_train, ["kt_group_num", "kt_group_name"])
dt_feature_list = utils.generate_feature_list(df_test, ["dt_group_num", "dt_group_name"])

In [9]:
# creating dataset
dataset = Dataset()
dataset.fit(
    users=set(inns),
    items=set(inns),
    user_features=kt_feature_list,
    item_features=dt_feature_list
)

interactions, weights = dataset.build_interactions(
    data=list(zip(df_train["inn_kt"], df_train["inn_dt"]))
)

test_interactions, test_weights = dataset.build_interactions(
    data=list(zip(df_test["inn_kt"], df_test["inn_dt"]))
)

# now we are building our questions and professionals features
# in a way that lightfm understand.
# we are using lightfm build in method for building
# questions and professionals features 
kt_features = dataset.build_user_features(
    utils.create_features(df_train, ["kt_group_num", "kt_group_name"], "inn_kt"),
    normalize=True
)

dt_features = dataset.build_item_features(
    utils.create_features(df_train, ["dt_group_num", "dt_group_name"], "inn_dt"),
    normalize=True
)

Columns in `dt_features` correspond to `dataset.mapping()` tuple of dictionaries for item features.

**NOTE**: `dataset.mapping()` returns a tuple of 4 dictionaries:

`_user_id_mapping`,

`_user_feature_mapping`,

`_item_id_mapping`,

`_item_feature_mapping`

**For example**, `_item_feature_mapping["Pharmacologist"]` returns an index of `"Pharmacologist"` feature in `dt_features` matrix, so `dt_features[:, index]` shows the "Pharmacologist" feature value for all items.

# Fitting the model

In [98]:
model = LightFM(
    no_components=150,
    learning_rate=0.05,
    loss='warp',
    random_state=2024)

model.fit(
    interactions,
    item_features=dt_features,
    user_features=kt_features, sample_weight=weights,
    epochs=5, num_threads=1, verbose=True)

print(utils.calculate_auc_score(model, test_interactions, kt_features, dt_features))
print(utils.calculate_precision_at_k(model, test_interactions, kt_features, dt_features))

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
0.46253723
0.021428572


# Making embedding mappings for dt and kt

In [16]:
kt_biases, kt_embeds = model.get_user_representations(kt_features)
dt_biases, dt_embeds = model.get_item_representations(dt_features)
user_id_mapping, user_feature_mapping, item_id_mapping, item_feature_mapping = dataset.mapping()

embeddings = {}
for inn in inns:
    # e.g.
    # user_id_mapping[inn] - index of user in kt_embeds
    embeddings[inn] = {
        "kt_embed": kt_embeds[user_id_mapping[inn]],
        "dt_embed": dt_embeds[item_id_mapping[inn]],
        "kt_bias": kt_biases[user_id_mapping[inn]],
        "dt_bias": dt_biases[item_id_mapping[inn]]
    }

# Obtaining predictions manually

In [54]:
from typing import Dict
# раз ВСЕХ, то и себя тоже что ли?
kt_scores: Dict[int, Dict[int, float]] = {}
for inn_kt in df_test["inn_kt"]:
    inn_idx = user_id_mapping[inn_kt]
    kt_embed = embeddings[inn_kt]["kt_embed"]
    kt_bias = embeddings[inn_kt]["kt_bias"]
    scores = {}
    for inn_dt in inns:
        dt_idx = item_id_mapping[inn_dt]
        dt_embed = embeddings[inn_dt]["dt_embed"]
        dt_bias = embeddings[inn_dt]["dt_bias"]

        score = (dt_embed @ kt_embed) + kt_bias + dt_bias
        scores[inn_dt] = score

    kt_scores[inn_kt] = scores

Полуаем все взаимодействия (и на трейне и на тесте (?))

In [99]:
test_kt_dt_set = set(zip(df_test['inn_kt'], df_test['inn_dt']))
train_kt_dt_set = set(zip(df_train['inn_kt'], df_train['inn_dt']))
all_set = train_kt_dt_set | test_kt_dt_set

Считаем map

Попробуем рекомендовать строго тех с кем ещё не взаимодействовали, поскольку их заведемо не будет в тестовом наборе

In [112]:
# kt датафрейм с предиктами
kt_preds = pd.DataFrame(kt_scores)

# считаем мапы
average_precisions = {}
average_precisions_no_index = []
top_k = 20
for inn_kt in kt_scores:
    # составляем тех с кем ещё не взаимодействовали в трейне
    data = kt_preds[inn_kt].sort_values(ascending=False)
    data = data[[(inn_kt, inn_dt) not in train_kt_dt_set for inn_dt in data.index]][:top_k]
    top_k_dt = data.index
    top_k_scores = data.values

    targets = np.array([(inn_kt, inn_dt) in test_kt_dt_set for inn_dt in top_k_dt])
    total_ones = sum(targets)
    
    precisions = (targets.cumsum() / np.arange(1, top_k + 1)) * targets
    if total_ones == 0:
        average_precision = 0
    else:
        average_precision = precisions.sum() / total_ones
    
    average_precisions[inn_kt] = average_precision
    average_precisions_no_index.append(average_precision)

average_precisions_no_index = np.array(average_precisions_no_index)


print(average_precisions_no_index.mean())

0.1416732881657141


Модель обучена