In [None]:
import os
import pickle
from collections import defaultdict
from pathlib import Path
from typing import Tuple, Dict

import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

In [None]:
# Input interaction files, given relative to the notebook's working directory.
# Each line is parsed downstream as "<user_id> <item_id> [<item_id> ...]".
TRAIN_PATH = "data_k1_train.txt"
TEST_PATH = "data_k1_test.txt"

In [None]:
def load_data_and_create_matrices(  train_file: Path,
                                    test_file: Path ) -> Tuple[sp.csr_matrix, sp.csr_matrix, Dict[int, int], Dict[int, int]]:
    """
    Read the train/test interaction files and build two aligned sparse matrices.

    Each line is expected to hold whitespace-separated integers:
    ``<user_id> <item_id> [<item_id> ...]``. Lines with fewer than two tokens
    are skipped.

    User and item ids from BOTH files are mapped to one shared, sorted,
    0-based index space, so both matrices have the same shape and row/column
    ``i`` refers to the same user/item in each of them.

    Args:
        train_file: Path (``Path`` or ``str``) of the training interactions.
        test_file: Path (``Path`` or ``str``) of the test interactions.

    Returns:
        ``(train_matrix, test_matrix, user_map, item_map)`` where the matrices
        are float32 CSR of shape ``(n_users, n_items)`` and the maps translate
        an internal index back to the original id.

    Raises:
        FileNotFoundError: If either file does not exist.
        ValueError: If the two files together contain no interactions, or a
            token is not an integer (raised by ``int``).
    """

    def parse_file(filepath: Path) -> Tuple[list, list]:
        """Return parallel lists (user ids, item ids), one entry per interaction."""
        filepath = Path(filepath)  # also accept plain strings
        if not filepath.exists():
            raise FileNotFoundError(f"File not found: {filepath}")

        users: list = []
        items: list = []
        with filepath.open("r") as f:
            for line in f:
                parts = line.split()
                if len(parts) < 2:
                    continue

                u_id = int(parts[0])
                item_ids = [int(tok) for tok in parts[1:]]

                # Two flat lists instead of one dict per interaction keeps the
                # memory footprint small for multi-million-row inputs.
                users.extend([u_id] * len(item_ids))
                items.extend(item_ids)
        return users, items

    train_users, train_items = parse_file(train_file)
    test_users, test_items = parse_file(test_file)
    n_train = len(train_users)  # interactions [0, n_train) belong to the train split

    all_users = np.asarray(train_users + test_users, dtype=np.int64)
    all_items = np.asarray(train_items + test_items, dtype=np.int64)

    if all_users.size == 0:
        raise ValueError(
            f"No interactions found in {train_file} or {test_file}; "
            "expected lines of the form '<user_id> <item_id> ...'."
        )

    print(f"Loaded {all_users.size:,} total interactions.")

    # Sorted unique ids -> contiguous indices shared by both splits.
    user_ids, user_idx = np.unique(all_users, return_inverse=True)
    item_ids, item_idx = np.unique(all_items, return_inverse=True)

    # Internal index -> original ID
    user_map: Dict[int, int] = dict(enumerate(user_ids.tolist()))
    item_map: Dict[int, int] = dict(enumerate(item_ids.tolist()))

    n_users = len(user_map)
    n_items = len(item_map)

    print(f"Matrix dimensions: {n_users:,} users x {n_items:,} items")

    def build_csr(rows: np.ndarray, cols: np.ndarray) -> sp.csr_matrix:
        # NOTE: scipy sums duplicate (row, col) pairs, so an interaction that is
        # repeated inside one file ends up with a value greater than 1.
        data = np.ones(len(rows), dtype=np.float32)
        return sp.csr_matrix((data, (rows, cols)), shape=(n_users, n_items))

    train_matrix = build_csr(user_idx[:n_train], item_idx[:n_train])
    test_matrix = build_csr(user_idx[n_train:], item_idx[n_train:])

    print(f"Train nnz: {train_matrix.nnz:,} | Test nnz: {test_matrix.nnz:,}")

    return train_matrix, test_matrix, user_map, item_map

In [None]:
# Build the shared-index train/test matrices and the index -> original-id maps.
train_matrix, test_matrix, user_map, item_map = load_data_and_create_matrices(
    train_file=Path(TRAIN_PATH),
    test_file=Path(TEST_PATH),
)

Loaded 2,380,730 total interactions.
Matrix dimensions: 52,643 users x 91,599 items
Train nnz: 1,924,739 | Test nnz: 455,991


In [None]:
class ItemKNNRecommender:
    """
    Item-Based Collaborative Filtering.
    Logic: "Users who liked Item A also liked Item B."

    Every item is represented by its column of the user x item training
    matrix. For a user, the cosine neighbours of each item in their history are
    looked up and the similarities are summed per candidate item.

    Neighbour lists depend only on the fitted item vectors, so they are
    computed once per item and cached; later users that share an item reuse the
    cached result instead of repeating the brute-force search.
    """
    def __init__(self, n_neighbors=20):
        self.n_neighbors = n_neighbors
        self.model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=n_neighbors, n_jobs=-1)
        self.train_matrix = None
        self.item_vectors = None
        # Neighbour cache; (re)initialised by fit().
        self._k = None
        self._cached = None
        self._nbr_dist = None
        self._nbr_idx = None

    def fit(self, train_matrix):
        """
        Trains the KNN model.
        We want to calculate distance between ITEMS (rows).

        Args:
            train_matrix: scipy sparse user x item matrix.
        """
        print(f"Training Item-KNN (k={self.n_neighbors})...")
        # predict() reads a user's row via `.indices`, which needs CSR.
        self.train_matrix = train_matrix.tocsr()

        # Transposing CSR yields CSC; convert so that selecting item rows in
        # predict() is a cheap row slice.
        self.item_vectors = self.train_matrix.T.tocsr()

        # Fit the model on the Item Vectors
        self.model.fit(self.item_vectors)

        n_items = self.item_vectors.shape[0]
        # Asking for more neighbours than there are items is an error in
        # scikit-learn, so clamp the effective k.
        self._k = min(self.n_neighbors, n_items)
        self._cached = np.zeros(n_items, dtype=bool)
        self._nbr_dist = None  # allocated on first query, using the model's dtypes
        self._nbr_idx = None

    def _neighbors_for(self, item_indices):
        """
        Return (distances, neighbor_indices), one row per entry of
        `item_indices`, querying the model only for items not cached yet.
        """
        missing = item_indices[~self._cached[item_indices]]
        if missing.size:
            dist, idx = self.model.kneighbors(self.item_vectors[missing], n_neighbors=self._k)
            if self._nbr_idx is None:
                n_items = self.item_vectors.shape[0]
                self._nbr_dist = np.empty((n_items, self._k), dtype=dist.dtype)
                self._nbr_idx = np.empty((n_items, self._k), dtype=idx.dtype)
            self._nbr_dist[missing] = dist
            self._nbr_idx[missing] = idx
            self._cached[missing] = True
        return self._nbr_dist[item_indices], self._nbr_idx[item_indices]

    def predict(self, user_idx, top_k=20):
        """
        Predicts items based on the user's history.

        Args:
            user_idx: Internal row index of the user in the training matrix.
            top_k: Maximum number of item indices to return.

        Returns:
            Integer array of at most `top_k` internal item indices, best first,
            excluding items already in the user's training history. Empty when
            the user has no training history or no unseen candidates.

        Raises:
            RuntimeError: If called before fit().
        """
        if self.train_matrix is None:
            raise RuntimeError("ItemKNNRecommender.predict() called before fit().")

        user_history_indices = self.train_matrix[user_idx].indices

        if len(user_history_indices) == 0:
            return np.array([], dtype=np.int64)

        distances, neighbor_indices = self._neighbors_for(user_history_indices)

        # Sum cosine similarity (1 - cosine distance) per candidate item.
        candidate_scores = {}
        for row_dist, row_idx in zip(distances, neighbor_indices):
            for dist, neighbor_idx in zip(row_dist, row_idx):
                candidate_scores[neighbor_idx] = candidate_scores.get(neighbor_idx, 0) + (1.0 - dist)

        # Never recommend something the user already interacted with.
        for seen_idx in user_history_indices:
            candidate_scores.pop(seen_idx, None)

        sorted_candidates = sorted(candidate_scores.items(), key=lambda x: x[1], reverse=True)
        top_indices = [idx for idx, _ in sorted_candidates[:top_k]]

        return np.array(top_indices, dtype=np.int64)

In [None]:
def dcg_at_k(ranked_items, relevant_items, k=20):
    """
    Discounted cumulative gain of the first `k` ranked items with binary gains.

    ranked_items: 1D array/list of item indices (internal item_idx), best first
    relevant_items: container supporting `in` (e.g. a set) of relevant item indices

    A hit at 1-based rank r contributes 1 / log2(r + 1).
    """
    total = 0.0
    head = ranked_items[:k]
    for position in range(len(head)):
        if head[position] in relevant_items:
            # position is 0-based, so rank + 1 == position + 2
            total += 1.0 / np.log2(position + 2)
    return total


def ndcg_for_user(model, user_idx, test_matrix, k=20):
    """
    NDCG@k of `model`'s recommendations for one user.

    model: object exposing predict(user_idx, top_k=...) returning item indices
    user_idx: internal row index of the user
    test_matrix: sparse user x item matrix; the stored column indices of row
                 `user_idx` are the relevant items

    Returns None (meaning "not evaluable", callers skip it) when the user has no
    test items, when the model returns no predictions, or when the ideal DCG is
    not positive.
    """
    relevant = set(test_matrix[user_idx].indices)
    if not relevant:
        return None

    # Predicted top-K items
    recommended = model.predict(user_idx, top_k=k)
    if recommended is None or len(recommended) == 0:
        return None

    # Ideal ranking: all relevant items (at most k of them) at the top.
    cutoff = min(len(relevant), k)
    ideal = sum(1.0 / np.log2(rank + 1) for rank in range(1, cutoff + 1))
    if not ideal > 0:
        return None

    return dcg_at_k(recommended, relevant, k=k) / ideal


def evaluate_item_knn_segmented(model,
                                train_matrix,
                                test_matrix,
                                user_map,
                                user_segment,
                                k=20):
    """
    Compute overall NDCG@k and NDCG@k per user segment:
    - regular
    - heavy
    - extreme

    model: fitted recommender, passed through to ndcg_for_user
    train_matrix: only its number of rows is used (users to iterate over)
    test_matrix: sparse user x item matrix holding the relevant items
    user_map: dict[user_idx -> raw_user_id]
    user_segment: dict[raw_user_id -> "regular"/"heavy"/"extreme"]

    Users for which ndcg_for_user returns None are left out of every average.
    Users without a segment (or missing from user_map) only count towards the
    overall score.

    Returns (overall_ndcg, seg_ndcg, seg_scores): the overall mean (0.0 when no
    user was evaluated), the per-segment mean (None for an empty segment) and
    the raw per-segment score lists.
    """
    n_users = train_matrix.shape[0]

    overall_scores = []
    seg_scores = {
        "regular": [],
        "heavy": [],
        "extreme": []
    }

    for u_idx in tqdm(range(n_users), desc="Evaluating Item-KNN"):
        ndcg_u = ndcg_for_user(model, u_idx, test_matrix, k=k)
        if ndcg_u is None:
            continue

        overall_scores.append(ndcg_u)

        # .get() so that an index absent from user_map is treated as
        # "no segment" rather than aborting the whole evaluation.
        raw_u = user_map.get(u_idx)
        seg = user_segment.get(raw_u, None)
        if seg in seg_scores:
            seg_scores[seg].append(ndcg_u)

    overall_ndcg = float(np.mean(overall_scores)) if overall_scores else 0.0
    seg_ndcg = {
        seg: (float(np.mean(vals)) if len(vals) > 0 else None)
        for seg, vals in seg_scores.items()
    }

    print(f"\nOverall NDCG@{k}: {overall_ndcg:.4f}")
    for seg in ["regular", "heavy", "extreme"]:
        v = seg_ndcg[seg]
        if v is None:
            print(f"  {seg:8s}: no users evaluated")
        else:
            print(f"  {seg:8s}: {v:.4f} (n={len(seg_scores[seg])})")

    return overall_ndcg, seg_ndcg, seg_scores

In [None]:
def ndcg_at_k(predicted_items, true_items, k=20):
    """
    NDCG@k with binary relevance.

    predicted_items: sliceable sequence of item_idx predicted for the user, best first
    true_items: sized iterable (list/array/set) of relevant item_idx from test

    Returns 0.0 when there are no relevant items or the ideal DCG is not
    positive; otherwise DCG of the first k predictions divided by the DCG of a
    ranking that places min(len(relevant), k) relevant items at the top.
    """
    if len(true_items) == 0:
        return 0.0

    relevant = set(true_items)

    gained = 0.0
    for position, candidate in enumerate(predicted_items[:k]):
        if candidate in relevant:
            # 0-based position -> discount log2(rank + 1) with rank = position + 1
            gained += 1.0 / np.log2(position + 2)

    n_ideal = min(len(relevant), k)
    best = sum(1.0 / np.log2(position + 2) for position in range(n_ideal))

    if best > 0:
        return gained / best
    return 0.0


In [None]:
# Number of training interactions per internal user index.
# Counted in one vectorised pass instead of a Python loop over every nonzero.
train_users, train_items = train_matrix.nonzero()
_users_with_train, _train_counts = np.unique(train_users, return_counts=True)
# defaultdict so that users without any training interaction read as 0.
user_train_counts = defaultdict(int, zip(_users_with_train.tolist(), _train_counts.tolist()))

def get_user_segment(count):
    """
    Map a user's number of training interactions to an activity segment.

    more than 200 -> "extreme", 51..200 -> "heavy", 11..50 -> "regular";
    any other value (e.g. 10 or fewer) -> None.
    """
    if count > 200:
        return "extreme"
    if 51 <= count <= 200:
        return "heavy"
    if 11 <= count <= 50:
        return "regular"
    return None

# raw user id -> segment label; users whose count maps to no segment are omitted.
user_segment = {}

for u_idx, original_id in user_map.items():
    label = get_user_segment(user_train_counts[u_idx])
    if label is None:
        continue
    user_segment[original_id] = label

print("Segmentation built:")
segment_labels = list(user_segment.values())
for segment_name in ["regular", "heavy", "extreme"]:
    print(segment_name, segment_labels.count(segment_name))


Segmentation built:
regular 43985
heavy 7904
extreme 754


In [None]:
def knn_pipeline(
    train_matrix,
    test_matrix,
    user_map,
    item_map,
    top_k=20,
    n_neighbors=20,
    user_segment=None
):
    """
    Orchestrates the training and evaluation of the Item-KNN model.
    Returns: output_df, avg_ndcg_score

    Args:
        train_matrix: sparse user x item matrix used to fit the model.
        test_matrix: sparse user x item matrix with the held-out items; only
            users with at least one nonzero entry here are evaluated.
        user_map: dict[user_idx -> raw user id]; missing indices fall back to
            the label "User_<idx>".
        item_map: dict[item_idx -> raw item id]; missing indices fall back to
            the label "Item_<idx>".
        top_k: number of recommendations per user and NDCG cut-off.
        n_neighbors: neighbours per item used by ItemKNNRecommender.
        user_segment: optional dict[raw user id -> segment label].

    If user_segment is provided (raw_user_id -> 'regular'/'heavy'/'extreme'),
    also prints NDCG@k per segment.

    Returns:
        (output_df, avg_ndcg): a DataFrame with columns 'user_id' and
        'recommended_items' (list of raw item ids), and the mean NDCG@top_k as
        a float (0.0 when no user was evaluated).
    """

    model = ItemKNNRecommender(n_neighbors=n_neighbors)
    model.fit(train_matrix)

    ndcg_scores = []
    output_rows = []

    seg_scores = defaultdict(list) if user_segment is not None else None

    test_users = np.unique(test_matrix.nonzero()[0])
    n_test_users = len(test_users)

    print(f"Evaluating {n_test_users:,} users")

    for i, user_idx in tqdm(enumerate(test_users, 1), total=n_test_users, desc="Predicting"):
        # Check for relevant items first so that no prediction is computed for
        # a user that is skipped anyway.
        true_items = test_matrix[user_idx].indices
        if len(true_items) == 0:
            continue

        top_indices = model.predict(user_idx, top_k=top_k)

        score = ndcg_at_k(top_indices, true_items, k=top_k)
        ndcg_scores.append(score)

        real_user_id = user_map.get(user_idx, f"User_{user_idx}")
        real_item_ids = [item_map.get(idx, f"Item_{idx}") for idx in top_indices]

        output_rows.append({
            'user_id': real_user_id,
            'recommended_items': real_item_ids
        })

        if seg_scores is not None:
            seg = user_segment.get(real_user_id, None)
            if seg is not None:
                seg_scores[seg].append(score)

        if i % 5000 == 0:
            current_avg = np.mean(ndcg_scores)
            tqdm.write(f"Processed {i} users. Current Avg NDCG@{top_k}: {current_avg:.4f}")

    # Plain float in both branches so callers always get the same type.
    avg_ndcg = float(np.mean(ndcg_scores)) if ndcg_scores else 0.0

    print("\nRESULTS:")
    print("=" * 40)
    print(f"Evaluated Users: {len(ndcg_scores):,}")
    print(f"Average NDCG@{top_k}: {avg_ndcg:.4f}")

    if seg_scores is not None:
        print("\nSegment-wise NDCG:")
        for seg in ["regular", "heavy", "extreme"]:
            vals = seg_scores.get(seg, [])
            if len(vals) == 0:
                print(f"  {seg:8s}: no users")
            else:
                print(f"  {seg:8s}: {np.mean(vals):.4f} (n={len(vals)})")

    return pd.DataFrame(output_rows), avg_ndcg

In [None]:
def save_recommendations(df: pd.DataFrame, filename_base: str = "recommendations", output_dir: str = "."):
    """
    Write the recommendations to `<output_dir>/<filename_base>.csv` and `.txt`.

    Args:
        df: DataFrame with a 'user_id' column and a 'recommended_items' column
            holding an iterable of item ids per row.
        filename_base: File name without extension, shared by both outputs.
        output_dir: Target directory; created if it does not exist.

    The CSV is a plain `DataFrame.to_csv` dump. The TXT file has one line per
    user: the user id followed by the space-separated recommended item ids.
    """

    os.makedirs(output_dir, exist_ok=True)

    csv_path = os.path.join(output_dir, f"{filename_base}.csv")
    df.to_csv(csv_path, index=False)

    txt_path = os.path.join(output_dir, f"{filename_base}.txt")

    with open(txt_path, 'w', encoding="utf-8") as f:
        # itertuples avoids building a Series per row (as iterrows does) and
        # yields nothing for an empty frame, so an empty file is written.
        for row in tqdm(df.itertuples(index=False), total=len(df), desc="Writing TXT"):
            items_str = " ".join(map(str, row.recommended_items))
            f.write(f"{row.user_id} {items_str}\n")

    print("Saved results as TXT")

In [None]:
# Fit Item-KNN on the training matrix and score every user that has test
# interactions; per-segment NDCG is printed because user_segment is passed.
# n_neighbors=10 overrides knn_pipeline's default of 20.
output_df, avg_ndcg = knn_pipeline(
    train_matrix=train_matrix,
    test_matrix=test_matrix,
    user_map=user_map,
    item_map=item_map,
    top_k=20,
    n_neighbors=10,
    user_segment=user_segment
)

Training Item-KNN (k=10)...
Evaluating 52,643 users


Predicting:  10%|▉         | 5004/52643 [04:50<38:18, 20.73it/s]

Processed 5000 users. Current Avg NDCG@20: 0.0811


Predicting:  19%|█▉        | 10004/52643 [09:10<37:08, 19.13it/s]

Processed 10000 users. Current Avg NDCG@20: 0.0795


Predicting:  28%|██▊       | 15003/52643 [13:20<31:10, 20.12it/s]

Processed 15000 users. Current Avg NDCG@20: 0.0805


Predicting:  38%|███▊      | 20004/52643 [17:33<23:19, 23.32it/s]

Processed 20000 users. Current Avg NDCG@20: 0.0857


Predicting:  47%|████▋     | 25001/52643 [21:32<26:03, 17.68it/s]

Processed 25000 users. Current Avg NDCG@20: 0.0916


Predicting:  57%|█████▋    | 30003/52643 [25:36<16:14, 23.22it/s]

Processed 30000 users. Current Avg NDCG@20: 0.0986


Predicting:  66%|██████▋   | 35001/52643 [29:28<14:37, 20.10it/s]

Processed 35000 users. Current Avg NDCG@20: 0.1062


Predicting:  76%|███████▌  | 40002/52643 [33:19<09:22, 22.47it/s]

Processed 40000 users. Current Avg NDCG@20: 0.1137


Predicting:  85%|████████▌ | 45003/52643 [37:07<05:51, 21.71it/s]

Processed 45000 users. Current Avg NDCG@20: 0.1199


Predicting:  95%|█████████▍| 50003/52643 [40:53<01:56, 22.67it/s]

Processed 50000 users. Current Avg NDCG@20: 0.1259


Predicting: 100%|██████████| 52643/52643 [42:47<00:00, 20.50it/s]


RESULTS:
Evaluated Users: 52,643
Average NDCG@20: 0.1283

Segment-wise NDCG:
  regular : 0.1255 (n=43985)
  heavy   : 0.1378 (n=7904)
  extreme : 0.1921 (n=754)





In [None]:
# Writes results.csv and results.txt into the default output_dir (".").
save_recommendations(output_df, filename_base="results")

Writing TXT: 100%|██████████| 52643/52643 [00:01<00:00, 30348.10it/s]

Saved results as TXT



