In [1]:
import pandas as pd

In [2]:
events_data = pd.read_csv("data/events.csv")
events_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,1505,4,0
1,0,3669,3,1
2,0,584,4,2
3,0,3390,3,3
4,0,2885,4,4


In [3]:
# REMOVE LAST 10 FOR VALIDATION IF NEEDED

# Assuming df is your initial DataFrame with 'userid', 'itemid', and 'timestamp' columns

# Sort by 'userid' and 'timestamp' in descending order
# df = events_data.sort_values(['user_id', 'timestamp'], ascending=[True, False])
# 
# # Select the last 10 items for each user
# top_10_per_user = df.groupby('user_id').head(10)
# events_data = df[~df.index.isin(top_10_per_user.index)].reset_index(drop=True)

In [4]:
items_features_data = pd.read_csv("data/item_features.csv")
items_features_data.head()

Unnamed: 0,item_id,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17
0,0,0,1,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
4,4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [5]:
user_features_data = pd.read_csv("data/user_features.csv").sort_values(by="user_id", axis=0).reset_index(drop=True)
user_features_data.head()

Unnamed: 0,user_id,gender,age
0,0,M,35
1,1,M,18
2,2,M,25
3,3,M,18
4,4,M,18


In [6]:
user_features_data['user_id'].nunique() == len(user_features_data)

True

In [7]:
def cast_age_to_bin(x: int):
    if 0 <= x < 12:
        return "young"
    elif 12 <= x < 25:
        return "middle"
    elif 25 <= x < 40:
        return "senior"
    elif 40 <= x:
        return "old"
    else:
        return "other"

user_features_data["age"] = user_features_data['age'].apply(cast_age_to_bin)
user_features_data.head()

Unnamed: 0,user_id,gender,age
0,0,M,senior
1,1,M,middle
2,2,M,senior
3,3,M,middle
4,4,M,middle


In [8]:
def _collect_features_with_feature_names(
    df: pd.DataFrame,
    columns: list[str],
) -> pd.Series:
    features = df[columns].apply(
        lambda row: ",".join(
            [",".join(f"{col}_{value.strip()}" for value in str(row[col]).split(",")) for col in columns],
        ),
        axis=1,
    )
    return features

def _generate_feature_list(
    df: pd.DataFrame,
    columns = None,
) -> list[str]:
    """Generate the list of features of corresponding columns to list
    in order to fit the lightfm Dataset
    """
    if columns is None:
        columns = df.columns

    features = _collect_features_with_feature_names(df, columns)
    features = features.str.split(",")
    features = features.apply(pd.Series).stack().reset_index(drop=True).to_list()
    return features

def _prepare_user_features(
    df: pd.DataFrame,
    columns = None,
) -> list[tuple[str, list[str]]]:
    """Prepare the corresponding feature formats for
    the lightfm.dataset's build_item_features function
    """
    if columns is None:
        columns = df.columns

    features = _collect_features_with_feature_names(df, columns)
    features = features.str.split(",")
    features = list(zip(df.index.to_list(), features))
    return features

In [9]:
all_user_features = _generate_feature_list(user_features_data, ["gender", "age"])
len(set(all_user_features))

6

In [10]:
import lightfm
from lightfm.data import Dataset



In [11]:
dataset = Dataset()

In [12]:
dataset.fit(
    users=user_features_data["user_id"].to_list(),
    items=items_features_data["item_id"].to_list(),
    user_features=list(set(all_user_features)),
    item_features=items_features_data.columns[1:]
)

In [13]:
lightfm_client_features = _prepare_user_features(user_features_data, ["gender", "age"])
lightfm_client_features[:10]

[(0, ['gender_M', 'age_senior']),
 (1, ['gender_M', 'age_middle']),
 (2, ['gender_M', 'age_senior']),
 (3, ['gender_M', 'age_middle']),
 (4, ['gender_M', 'age_middle']),
 (5, ['gender_F', 'age_senior']),
 (6, ['gender_F', 'age_young']),
 (7, ['gender_M', 'age_senior']),
 (8, ['gender_F', 'age_middle']),
 (9, ['gender_M', 'age_senior'])]

In [14]:
lightfm_client_features = dataset.build_user_features(lightfm_client_features)
lightfm_client_features

<6040x6046 sparse matrix of type '<class 'numpy.float32'>'
	with 18120 stored elements in Compressed Sparse Row format>

In [15]:
item_features = []
for x in items_features_data.iloc:
    temp_res = []
    for i, value in enumerate(x[1:]):
        if value != 0:
            temp_res.append(f"genre_{i}")
    item_features.append([x.to_list()[0], temp_res])
item_features[:10]

[[0, ['genre_1', 'genre_3', 'genre_4', 'genre_8', 'genre_13']],
 [1, ['genre_7']],
 [2, ['genre_7']],
 [3, ['genre_7', 'genre_13']],
 [4, ['genre_7']],
 [5, ['genre_14']],
 [6, ['genre_7']],
 [7, ['genre_0', 'genre_5', 'genre_7']],
 [8, ['genre_4']],
 [9, ['genre_7']]]

In [16]:
lightfm_item_features = dataset.build_item_features(item_features)
lightfm_item_features

<3706x3724 sparse matrix of type '<class 'numpy.float32'>'
	with 9898 stored elements in Compressed Sparse Row format>

In [17]:
%%time

(interactions, weights) = dataset.build_interactions(
    [x['user_id'], x['item_id'], x['rating']] for x in events_data.iloc
)

CPU times: user 12 s, sys: 110 ms, total: 12.1 s
Wall time: 12.1 s


In [18]:
interactions.toarray().sum()

np.int64(894149)

In [19]:
from lightfm import LightFM

model = LightFM(
    no_components=20,
    loss="warp",
    learning_rate=2e-2,
    learning_schedule='adagrad',
    random_state=42,
    # k=15
)
model.fit(
    interactions=interactions,
    sample_weight=weights,
    item_features=lightfm_item_features,
    user_features=lightfm_client_features,
    verbose=True,
    epochs=100, # ? хз
    num_threads=20,
)

Epoch: 100%|██████████| 100/100 [01:18<00:00,  1.27it/s]


<lightfm.lightfm.LightFM at 0x148fc49d0>

In [20]:
def _prepare_user_item_pairs(
    cnum_ids: list[int],
    product_ids: list[int],
) -> (list[int], list[int]):
    num_products = len(product_ids)

    users, items = [], []
    for cnum_id in cnum_ids:
        users.extend([cnum_id] * num_products)
        items.extend(product_ids)

    return users, items

In [21]:
client_ids, item_ids = _prepare_user_item_pairs(user_features_data['user_id'].to_list(), items_features_data['item_id'].to_list())

In [22]:
raw_model_predictions = model.predict(
    client_ids,
    item_ids,
    user_features=lightfm_client_features,
    item_features=lightfm_item_features,
).reshape(len(user_features_data), len(items_features_data))

In [23]:
import numpy as np

def _mask_and_order_predictions(
    raw_model_predictions: np.ndarray,
    existing_interactions: np.ndarray,
) -> np.ndarray:

    filtered_predictions = np.where(~existing_interactions, raw_model_predictions, -np.inf)
    product_id_predictions = np.argsort(-filtered_predictions, axis=1)
    # Filter out those who were -np.inf to -1
    # TODO: this seems like a bad idea but for now it passes the tests and
    #   I don't have a good idea how to change this.
    n_relevant_per_cnum = (filtered_predictions != -np.inf).sum(axis=1)
    column_indices = np.arange(product_id_predictions.shape[1])
    mask = column_indices >= n_relevant_per_cnum[:, np.newaxis]
    product_id_predictions[mask] = -1
    # print(product_id_predictions)

    return product_id_predictions

In [24]:
matrix_to_exclude = interactions.toarray().astype(bool)
res = _mask_and_order_predictions(raw_model_predictions, matrix_to_exclude)[:, :10]
res[:10]

array([[1811, 2732,  472, 2342, 2256,  331, 1001, 2630, 1086, 2688],
       [3529,  463, 1699, 1039,  208,   36, 2908, 1030, 3153,  106],
       [ 640, 1831, 2210, 1956, 1039, 2603, 2342, 2630, 3694, 1211],
       [3022, 2194, 2342, 1617,  802, 3327, 2603, 3435, 1546, 3005],
       [1583, 2297, 1337, 2646, 1560, 1039, 3022, 2402, 1086, 2210],
       [1315,  472, 2054,  530, 3350,  406, 3367, 3013, 1223, 3438],
       [2636, 3677,  472, 2281, 2402,  467, 2528, 1240,  863, 2646],
       [ 584, 2210,  476, 1956, 2630,  463,  331, 1746, 2646, 3238],
       [1240, 2528, 1848, 2587,  169, 2119, 3227, 2335,  472,  663],
       [3013,   36, 1560, 2297, 1505,  989,  124, 2054, 2664, 1030]])

In [25]:
# VALIDATION

# from tqdm import tqdm
# 
# total = 0
# for i, predictions in enumerate(tqdm(res)):
#     relevant = top_10_per_user[top_10_per_user['user_id'] == i]['item_id'].to_list()
#     for item in predictions:
#         if item in relevant:
#             total += 1
# 
# print(total / 10 / len(res))

In [26]:
predictions = [" ".join([str(y) for y in x]) for x in res]

In [27]:
output = pd.DataFrame(
    {
        'user_id': user_features_data['user_id'].to_list(),
        "item_id": predictions
    }
)
output.head()

Unnamed: 0,user_id,item_id
0,0,1811 2732 472 2342 2256 331 1001 2630 1086 2688
1,1,3529 463 1699 1039 208 36 2908 1030 3153 106
2,2,640 1831 2210 1956 1039 2603 2342 2630 3694 1211
3,3,3022 2194 2342 1617 802 3327 2603 3435 1546 3005
4,4,1583 2297 1337 2646 1560 1039 3022 2402 1086 2210


In [28]:
output.iloc[-1]

user_id                                               6039
item_id    3022 640 1831 3529 2630 2067 2342 476 2784 1505
Name: 6039, dtype: object

In [29]:
output.to_csv("result.csv", index=None)