In [336]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from faker import Faker

# custom
import utils

# imports re for text cleaning
import re
from datetime import datetime, timedelta, date

# we will ignore pandas warning
import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Generating dataframes

In [337]:
# all lightfm imports 
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm import cross_validation
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

from sklearn.model_selection import train_test_split

In [338]:
df, inns, inn2group, group2name = utils.make_interactions_dataset(
    num_transactions=20000,
    num_inns=700,
    num_groups=10
)

df_train, df_test = train_test_split(df, test_size=0.2)
df.head()

Unnamed: 0,id_trans,inn_kt,inn_dt,c_sum,date,nazn,kt_group_num,dt_group_num,kt_group_name,dt_group_name
0,1,7410522978,5524004328,76218.2,2021-08-19,масло,10,9,American,attention
1,2,5020559351,2146746793,100915.16,2022-12-03,сок,5,5,night,night
2,3,8349320862,7337073023,21492.65,2022-08-07,чай,5,10,night,American
3,4,1632244652,7410522978,29088.24,2023-10-27,кофе,10,10,American,American
4,5,9308544856,8651614206,95320.85,2023-01-16,масло,2,4,detail,off


In [339]:
kt_feature_list = utils.generate_feature_list(df_train, ["kt_group_num", "kt_group_name"])
dt_feature_list = utils.generate_feature_list(df_test, ["dt_group_num", "dt_group_name"])

In [340]:
# creating dataset
dataset = Dataset()
dataset.fit(
    users=set(inns),
    items=set(inns),
    user_features=kt_feature_list,
    item_features=dt_feature_list
)

interactions, weights = dataset.build_interactions(
    data = list(zip(df_train["inn_kt"], df_train["inn_dt"]))
)

# now we are building our questions and professionals features
# in a way that lightfm understand.
# we are using lightfm build in method for building
# questions and professionals features 
kt_features = dataset.build_user_features(
    utils.create_features(df_train, ["kt_group_num", "kt_group_name"], "inn_kt"),
    normalize=True
)

dt_features = dataset.build_item_features(
    utils.create_features(df_train, ["dt_group_num", "dt_group_name"], "inn_dt"),
    normalize=True
)

Columns in `dt_features` correspond to `dataset.mapping()` tuple of dictionaries for item features.

**NOTE**: `dataset.mapping()` returns a tuple of 4 dictionaries:

`_user_id_mapping`,

`_user_feature_mapping`,

`_item_id_mapping`,

`_item_feature_mapping`

**For example**, `_item_feature_mapping["Pharmacologist"]` returns an index of `"Pharmacologist"` feature in `dt_features` matrix, so `dt_features[:, index]` shows the "Pharmacologist" feature value for all items.

# Fitting the model

In [346]:
model = LightFM(
    no_components=150,
    learning_rate=0.05,
    loss='warp',
    random_state=2024)

model.fit(
    interactions,
    item_features=dt_features,
    user_features=kt_features, sample_weight=weights,
    epochs=10, num_threads=1, verbose=True)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9


<lightfm.lightfm.LightFM at 0x140187fd0>

# Making embedding mappings for dt and kt

In [347]:
kt_biases, kt_embeds = model.get_user_representations(kt_features)
dt_biases, dt_embeds = model.get_item_representations(dt_features)
user_id_mapping, user_feature_mapping, item_id_mapping, item_feature_mapping = dataset.mapping()

embeddings = {}
for inn in inns:
    # e.g.
    # user_id_mapping[inn] - index of user in kt_embeds
    embeddings[inn] = {
        "kt_embed": kt_embeds[user_id_mapping[inn]],
        "dt_embed": dt_embeds[item_id_mapping[inn]],
        "kt_bias": kt_biases[user_id_mapping[inn]],
        "dt_bias": dt_biases[item_id_mapping[inn]]
    }

# Obtaining predictions

In [350]:
utils.calculate_auc_score(model, interactions, kt_features, dt_features)

np.float32(0.46168375)

In [351]:
utils.calculate_precision_at_k(model, interactions, kt_features, dt_features)

np.float32(0.069571435)