In [1]:
!pip install lightfm

import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k
from sklearn.model_selection import train_test_split

# === 1. Load Data ===
# All four tables are CSV files read from the same Kaggle input directory.
transactions = pd.read_csv("/kaggle/input/dataset3/transactions.csv")
limit_prices = pd.read_csv("/kaggle/input/dataset3/limit_prices.csv")
customers = pd.read_csv("/kaggle/input/dataset3/customer_information.csv")
assets = pd.read_csv("/kaggle/input/dataset3/asset_information.csv")

# === 2. Filter buy transactions ===
# Keep "Buy" rows only, then keep a single row per (customerID, ISIN) pair.
is_buy = transactions["transactionType"] == "Buy"
buy_data = transactions.loc[is_buy].drop_duplicates(subset=["customerID", "ISIN"])

# === 3. Merge data ===
# One profile row per customer: the last row after sorting by "timestamp".
customers_latest = customers.sort_values("timestamp").drop_duplicates(subset="customerID", keep="last")
asset_attributes = assets[["ISIN", "assetCategory", "sector", "industry"]]
# Both merges use pandas' default join type, so rows without a match are dropped.
merged = buy_data.merge(customers_latest, on="customerID")
merged = merged.merge(asset_attributes, on="ISIN")
filtered = merged.copy()

# === 4. Filter rarely traded assets ===
# An asset is kept when it occurs in more than 5 of the deduplicated buy rows.
purchase_counts = buy_data["ISIN"].value_counts()
popular_assets = purchase_counts[purchase_counts > 5].index
filtered = filtered[filtered["ISIN"].isin(popular_assets)]

# === 5. Split ===
# Row-level split: 10% of the rows go to the test set, with a fixed seed.
split_parts = train_test_split(filtered, test_size=0.1, random_state=42)
train_data, test_data = split_parts

# === 6. Encode IDs and features ===
dataset = Dataset()
user_feature_fields = ["riskLevel", "customerType", "investmentCapacity"]

# Missing profile values become the literal "Unknown"; every value is
# stringified so it can be embedded in a feature name.
users_list = customers_latest[user_feature_fields].fillna("Unknown").astype(str)
profiles_with_ids = users_list.join(customers_latest["customerID"])

# One (customerID, [feature names]) pair per customer profile row.
user_feature_tuples = []
for _, profile in profiles_with_ids.iterrows():
    profile_features = [
        f"riskLevel:{profile['riskLevel']}",
        f"type:{profile['customerType']}",
        f"capacity:{profile['investmentCapacity']}",
    ]
    user_feature_tuples.append((profile["customerID"], profile_features))

# Restrict the feature tuples to customers that appear in the filtered interactions.
valid_customers = set(filtered["customerID"])
filtered_user_feature_tuples = [
    pair for pair in user_feature_tuples if pair[0] in valid_customers
]

# Collect every distinct feature name that the retained customers use.
feature_vocabulary = set()
for _, customer_features in filtered_user_feature_tuples:
    for feature_name in customer_features:
        feature_vocabulary.add(feature_name)

dataset.fit(
    users=filtered["customerID"],
    items=filtered["ISIN"],
    user_features=feature_vocabulary,
)

# (customerID, ISIN) pairs for each split, turned into interaction matrices.
train_pairs = list(zip(train_data["customerID"], train_data["ISIN"]))
test_pairs = list(zip(test_data["customerID"], test_data["ISIN"]))
train_interactions, _ = dataset.build_interactions(train_pairs)
test_interactions, _ = dataset.build_interactions(test_pairs)
user_features = dataset.build_user_features(filtered_user_feature_tuples)

# === 7. Train Model ===
# LightFM with the "warp-kos" loss, fitted for 10 epochs on 4 threads
# using the user feature matrix built above.
model = LightFM(loss="warp-kos")
model.fit(
    train_interactions,
    user_features=user_features,
    epochs=10,
    num_threads=4,
)

# === 8. Evaluate Precision@10 ===
# The mean of precision_at_k's result over the test interactions is reported.
# NOTE(review): train_interactions is not passed here, so items a customer
# already bought in the training split are not excluded from the ranking —
# confirm this is intended.
per_user_precision = precision_at_k(
    model,
    test_interactions,
    user_features=user_features,
    k=10,
)
precision = per_user_precision.mean()
print(f"Precision@10: {precision:.4f}")

# === 9. Inverse Mapping ===
# The 1st and 3rd mappings returned by dataset.mapping() are inverted so that
# their values (used below as internal indices) look up the original IDs.
user_id_map, _, item_id_map, _ = dataset.mapping()
inv_user_id_map = dict(zip(user_id_map.values(), user_id_map.keys()))
inv_item_id_map = dict(zip(item_id_map.values(), item_id_map.keys()))

# === 10. Threshold ===
def auto_select_threshold(profitability_series):
    """Choose an ROI cut-off from the distribution of a profitability series.

    The rules are applied in order:

    1. If the value range (max - min) is greater than ten times the absolute
       median, return the third quartile.
    2. Otherwise, if the median is positive, return the median.
    3. Otherwise fall back to the constant 0.01.

    Args:
        profitability_series: pandas Series whose ``describe()`` output
            contains the "min", "50%", "75%" and "max" entries.

    Returns:
        The selected threshold value.
    """
    summary = profitability_series.describe()
    midpoint = summary["50%"]
    upper_quartile = summary["75%"]
    spread = summary["max"] - summary["min"]

    # A range that dwarfs the median is taken as a sign of a widely spread
    # distribution; the stricter upper quartile is used in that case.
    if spread > 10 * abs(midpoint):
        return upper_quartile
    if midpoint > 0:
        return midpoint
    return 0.01

# Derive the ROI cut-off from the "profitability" column of limit_prices and report it.
profitability_values = limit_prices["profitability"]
roi_threshold = auto_select_threshold(profitability_values)
print(f"ROI Threshold: {roi_threshold:.4f}")

# === 11. Recommend + Metrics ===
def recommend_with_metrics(model, interactions, top_k=10, sample_users=10):
    """Print mean ROI@k and nDCG@k for the filtered top-k lists of the first few users.

    For each of the first ``sample_users`` internal user indices, every item
    is scored with ``model.predict``. Items are then walked from the highest
    to the lowest score, and the first ``top_k`` items whose profitability is
    strictly greater than ``roi_threshold`` are kept. Items whose ISIN has no
    row in ``limit_prices`` are skipped.

    nDCG uses each kept item's profitability as its relevance; the ideal
    ordering is the same kept items sorted by profitability, descending.
    Users with no kept item contribute to neither metric.

    Reads module-level state: ``user_features``, ``inv_item_id_map``,
    ``limit_prices`` and ``roi_threshold``.

    Args:
        model: Fitted model exposing
            ``predict(user_id, item_ids, user_features=...)``.
        interactions: Matrix-like object; only ``interactions.shape``
            (unpacked as ``n_users, n_items``) is read.
        top_k: Maximum number of assets kept per user; also used in the
            printed metric labels.
        sample_users: Number of leading internal user indices to evaluate.

    Returns:
        None. The averaged metrics are printed; NaN is printed when no user
        produced any recommendation.
    """
    n_users, n_items = interactions.shape

    # Build the ISIN -> profitability lookup once instead of scanning the
    # DataFrame for every candidate item. keep="first" matches taking the
    # first matching row when an ISIN appears more than once.
    first_rows = limit_prices.drop_duplicates(subset="ISIN", keep="first")
    roi_by_isin = dict(zip(first_rows["ISIN"].values, first_rows["profitability"].values))

    item_ids = np.arange(n_items)
    roi_at_k_list, ndcg_list = [], []

    for uid in range(min(sample_users, n_users)):
        scores = model.predict(uid, item_ids, user_features=user_features)

        # Walk items from best to worst score, keeping those above the threshold.
        roi_values = []
        for iid in np.argsort(-scores):
            roi = roi_by_isin.get(inv_item_id_map[iid])
            if roi is None or roi <= roi_threshold:
                continue
            roi_values.append(roi)
            if len(roi_values) >= top_k:
                break

        if not roi_values:
            continue

        roi_at_k_list.append(np.mean(roi_values))

        ideal = sorted(roi_values, reverse=True)
        dcg = sum((2**r - 1) / np.log2(idx + 2) for idx, r in enumerate(roi_values))
        idcg = sum((2**r - 1) / np.log2(idx + 2) for idx, r in enumerate(ideal))
        ndcg_list.append(dcg / idcg if idcg != 0 else 0)

    # Avoid numpy's "mean of empty slice" warning when nothing was recommended.
    mean_roi = np.mean(roi_at_k_list) if roi_at_k_list else float("nan")
    mean_ndcg = np.mean(ndcg_list) if ndcg_list else float("nan")
    print(f"ROI@{top_k}: {mean_roi:.4f}")
    print(f"nDCG@{top_k}: {mean_ndcg:.4f}")

# Run the ROI / nDCG report with the function's defaults (top_k=10, sample_users=10).
recommend_with_metrics(model, test_interactions)


Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 316.4/316.4 kB 7.5 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... done
  Created wheel for lightfm: filename=lightfm-1.17-cp311-cp311-linux_x86_64.whl size=829306 sha256=f6c949cfb3b7618aeb033986919cded51764867da594dc87ba248b13746c0d55
  Stored in directory: /root/.cache/pip/wheels/b9/0d/8a/0729d2e6e3ca2a898ba55201f905da7db3f838a33df5b3fcdd
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17
Precision@10: 0.0650
ROI Threshold: 0.2200
ROI@10: 1.0015
nDCG@10: 0.5070
