In [1]:
import gzip
import os
import datetime 
import struct
import numpy as np
from collections import defaultdict, Counter
from typing import Dict, Iterable, Tuple, List, Any

# Relative paths to the Behance data files used throughout the notebook.
# Binary image-feature file, parsed by readImageFeatures.
IMAGE_FEATURES_DATA_PATH = "data/Behance_Image_Features.b"
# Gzip text file; the first two whitespace-separated fields of a row are read as (item, owner).
ITEMS_TO_OWNERS_DATA_PATH = "data/Behance_Item_to_Owners.gz"
# Gzip text file; rows are read as (user, item, timestamp).
APPRECIATE_DATA_PATH = "data/Behance_appreciate_1M.gz"
# Maximum number of feature records whose IDs are loaded during the EDA.
IMAGE_FEATURE_LIMIT = 70000  # set to None to load all features

# Assignment 2 – Behance Like Prediction

## 1. Predictive Task & Evaluation

## 2. Exploratory Data Analysis (EDA)

In [2]:
def process_gzipped_text_file(path: str) -> Iterable[Tuple[str, ...]]:
    """Lazily yield the whitespace-separated fields of each line in a gzip text file.

    Blank (or whitespace-only) lines are skipped, so every yielded tuple has
    at least one field. Callers that index or unpack rows therefore never
    receive an empty tuple.

    Args:
        path: Location of a gzip-compressed, UTF-8 encoded text file.

    Yields:
        One tuple of string fields per non-blank line, in file order.

    Notes:
        Errors are reported on stdout rather than raised. A missing file
        yields nothing; a failure part-way through the file (for example a
        corrupt archive) ends iteration after the rows already yielded.
    """
    try:
        with gzip.open(path, "rt", encoding="utf-8") as f:
            for line in f:
                # str.split() with no separator already ignores leading and
                # trailing whitespace, so a separate strip() is unnecessary.
                segments = tuple(line.split())
                if not segments:
                    # Blank line: nothing useful to hand to the caller.
                    continue
                yield segments
    except FileNotFoundError:
        print(f"Error: File not found at {path}")
    except Exception as e:
        # Best-effort by design: report the problem and stop iterating.
        print(f"An error occurred: {e}")

def readImageFeatures(path: str, feature_dim: int = 4096):
    """Lazily yield (item_id, feature) records from a binary feature file.

    Each record is read as an 8-byte item ID followed by ``feature_dim``
    floats in the platform's native ``struct`` "f" format.

    Args:
        path: Location of the binary feature file.
        feature_dim: Number of floats per record. Defaults to 4096.

    Yields:
        ``(item_id, feature)`` where ``item_id`` is the raw 8-byte ID and
        ``feature`` is a tuple of ``feature_dim`` floats.

    Notes:
        Iteration stops at end of file. A trailing record that is cut short,
        in either the ID or the feature payload, is treated as end of data
        instead of raising ``struct.error``. The file is closed when the
        generator is exhausted, closed, or garbage collected.
    """
    id_size = 8
    # Compile the record layout once instead of re-parsing it per record.
    feature_struct = struct.Struct(f"{feature_dim}f")
    with open(path, "rb") as f:
        while True:
            itemId = f.read(id_size)
            if len(itemId) < id_size:
                # Clean EOF (empty read) or a truncated ID.
                break
            payload = f.read(feature_struct.size)
            if len(payload) < feature_struct.size:
                # Truncated feature payload: handle like a truncated ID.
                break
            yield itemId, feature_struct.unpack(payload)

def _decode_item_id(raw_id):
    if isinstance(raw_id, (bytes, bytearray)):
        return raw_id.decode("utf-8")
    return str(raw_id)

In [3]:
# Lazy generators over the three data files; nothing is read until they are iterated.
# NOTE(review): g1 is created here but not consumed anywhere in this cell.
g1 = readImageFeatures(path=IMAGE_FEATURES_DATA_PATH)
g2 = process_gzipped_text_file(path=ITEMS_TO_OWNERS_DATA_PATH)
g3 = process_gzipped_text_file(path=APPRECIATE_DATA_PATH)

# Item ownership lookups, built from the first two fields of each ownership row.
item_to_owner: Dict[str, str] = {}  # item -> owner (a later row for the same item overwrites an earlier one)
owner_to_items: Dict[str, set] = defaultdict(set)  # owner -> set of items
for row in g2:
    # Rows with fewer than two fields raise IndexError here.
    item, owner = row[0], row[1]
    item_to_owner[item] = owner
    owner_to_items[owner].add(item)

# Interaction histories, stored in file order (no sorting is applied here).
user_to_items: Dict[str, List[Tuple[str, Any]]] = defaultdict(list)  # user -> list of (item, timestamp)
item_to_users: Dict[str, List[Tuple[str, Any]]] = defaultdict(list)  # item -> list of (user, timestamp)
# The unpacking requires exactly three fields per row; any other length raises ValueError.
for user_id, item_id, ts in g3:
    # Digit-only timestamps become int; anything else is kept as the raw string,
    # so stored timestamps may be either int or str.
    ts_int = int(ts) if ts.isdigit() else ts
    user_to_items[user_id].append((item_id, ts_int))
    item_to_users[item_id].append((user_id, ts_int))

In [None]:
def summarize_counter(counter: Counter) -> Dict[str, float]:
    """Summarize the distribution of the values held in ``counter``.

    Returns a dict with the keys ``count`` (number of keys), ``mean``,
    ``median``, ``p90``, ``p99`` and ``max``. An empty counter produces an
    all-zero summary.
    """
    summary: Dict[str, float] = {"count": len(counter)}
    if not counter:
        summary.update(mean=0.0, median=0.0, p90=0.0, p99=0.0, max=0)
        return summary

    counts = np.fromiter(counter.values(), dtype=np.int64)
    summary["mean"] = float(counts.mean())
    for label, quantile in (("median", 50), ("p90", 90), ("p99", 99)):
        summary[label] = float(np.percentile(counts, quantile))
    summary["max"] = int(counts.max())
    return summary

def load_feature_ids(path: str, limit: int = None) -> set:
    """Collect the decoded item IDs found in the image-feature file at ``path``.

    Reading stops once ``limit`` records have been consumed. A falsy
    ``limit`` (``None`` or ``0``) means every record is read.
    """
    collected = set()
    consumed = 0
    for raw_id, _features in readImageFeatures(path):
        collected.add(_decode_item_id(raw_id))
        consumed += 1
        if limit and consumed >= limit:
            break
    return collected

def compute_eda(
    ownership_path: str = ITEMS_TO_OWNERS_DATA_PATH,
    interaction_path: str = APPRECIATE_DATA_PATH,
    feature_path: str = IMAGE_FEATURES_DATA_PATH,
    feature_id_limit: int = None,
) -> Dict[str, Any]:
    """Compute summary statistics over the ownership, interaction and feature files.

    Args:
        ownership_path: Gzip text file whose second field per row is the owner.
        interaction_path: Gzip text file with (user, item, integer timestamp) rows.
        feature_path: Binary image-feature file.
        feature_id_limit: Passed to ``load_feature_ids`` as ``limit``.

    Returns:
        A dict with the keys ``ownership``, ``interactions``,
        ``image_feature_ids_loaded``, ``image_feature_count_filesize`` and
        ``interaction_items_missing_features``.

    Notes:
        Rows with too few fields are skipped (fewer than 2 for ownership,
        fewer than 3 for interactions). A non-integer timestamp raises
        ``ValueError``.
    """
    seconds_per_day = 86400

    # Ownership: how many items each owner has.
    items_per_owner = Counter()
    with gzip.open(ownership_path, "rt", encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.strip().split()
            if len(fields) < 2:
                continue
            items_per_owner[fields[1]] += 1
    ownership_summary = summarize_counter(items_per_owner)

    # Interactions: per-user, per-item and per-day volumes plus the time span.
    likes_per_user = Counter()
    likes_per_item = Counter()
    likes_per_day = Counter()
    earliest = latest = None
    with gzip.open(interaction_path, "rt", encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.strip().split()
            if len(fields) < 3:
                continue
            user, item = fields[0], fields[1]
            stamp = int(fields[2])
            likes_per_user[user] += 1
            likes_per_item[item] += 1
            likes_per_day[stamp // seconds_per_day] += 1
            if earliest is None or stamp < earliest:
                earliest = stamp
            if latest is None or stamp > latest:
                latest = stamp

    user_totals = list(likes_per_user.values())
    item_totals = list(likes_per_item.values())
    cold_start = {
        "users_eq1": sum(total == 1 for total in user_totals),
        "items_eq1": sum(total == 1 for total in item_totals),
        "users_le3": sum(total <= 3 for total in user_totals),
        "items_le3": sum(total <= 3 for total in item_totals),
    }

    interaction_summary = {
        "total_interactions": int(sum(user_totals)),
        "users": summarize_counter(likes_per_user),
        "items": summarize_counter(likes_per_item),
        "timestamp_range": (earliest, latest),
        "cold_start": cold_start,
        "top_days": likes_per_day.most_common(5),
    }

    # Feature coverage: interacted items whose ID is absent from the loaded IDs.
    feature_ids = load_feature_ids(feature_path, limit=feature_id_limit)
    items_without_features = set(likes_per_item) - feature_ids
    # One record is an 8-byte ID followed by 4096 four-byte floats.
    bytes_per_record = 8 + 4096 * 4
    records_in_file = os.path.getsize(feature_path) // bytes_per_record

    return {
        "ownership": ownership_summary,
        "interactions": interaction_summary,
        "image_feature_ids_loaded": len(feature_ids),
        "image_feature_count_filesize": int(records_in_file),
        "interaction_items_missing_features": len(items_without_features),
    }

def print_eda_report(eda: Dict[str, Any]) -> None:
    """Print a human-readable report for a dict shaped like ``compute_eda``'s result.

    Args:
        eda: Mapping with the keys ``ownership``, ``interactions``,
            ``image_feature_ids_loaded``, ``image_feature_count_filesize``
            and ``interaction_items_missing_features``.

    Notes:
        When the interaction timestamp range has no upper bound (``None``,
        which is what an empty interaction file produces), the span is
        printed as "n/a" instead of failing inside
        ``datetime.fromtimestamp``. Every other input prints exactly the
        same text as before.
    """

    def utc_date(epoch_seconds):
        # Calendar date (UTC) of a Unix timestamp given in seconds.
        return datetime.datetime.fromtimestamp(
            epoch_seconds, tz=datetime.timezone.utc
        ).date()

    print("\n=== Exploratory Data Analysis ===")
    own = eda["ownership"]
    print(
        f"Ownership: {int(own.get('count', 0)):,} owners, "
        f"avg items/owner={own.get('mean', 0):.2f}, median={own.get('median', 0):.0f}, "
        f"p90={own.get('p90', 0):.0f}, p99={own.get('p99', 0):.0f}, max={own.get('max', 0):,}"
    )

    inter = eda["interactions"]
    print(f"\nInteractions: {inter['total_interactions']:,}")
    u = inter["users"]
    print(
        f"Users: {int(u.get('count', 0)):,}; "
        f"avg={u.get('mean', 0):.2f}, median={u.get('median', 0):.0f}, "
        f"p90={u.get('p90', 0):.0f}, p99={u.get('p99', 0):.0f}, max={u.get('max', 0):,}"
    )
    it = inter["items"]
    print(
        f"Items: {int(it.get('count', 0)):,}; "
        f"avg={it.get('mean', 0):.2f}, median={it.get('median', 0):.0f}, "
        f"p90={it.get('p90', 0):.0f}, p99={it.get('p99', 0):.0f}, max={it.get('max', 0):,}"
    )

    ts_min, ts_max = inter["timestamp_range"]
    if ts_max is None:
        # No interactions were seen, so there is no span to report.
        print("Timestamps span: n/a")
    else:
        print(f"Timestamps span: {ts_min} to {ts_max} "
              f"({utc_date(ts_max)})")

    cold = inter["cold_start"]
    print(
        f"Cold-start: users with 1 interaction={cold['users_eq1']:,}, <=3={cold['users_le3']:,}; "
        f"items with 1 interaction={cold['items_eq1']:,}, <=3={cold['items_le3']:,}"
    )

    # top_days holds (day index, count) pairs; a day index is epoch seconds // 86400.
    top_days = [
        f"{utc_date(day * 86400)}: {count:,}"
        for day, count in inter["top_days"]
    ]
    print("Top 5 days by interactions: " + ("; ".join(top_days) if top_days else "n/a"))

    print(
        f"Image features loaded: {eda['image_feature_ids_loaded']:,} "
        f"(from file size est: {eda['image_feature_count_filesize']:,}); "
        f"interaction items missing features: {eda['interaction_items_missing_features']:,}"
    )

In [None]:
# Compute the EDA statistics with the default data paths, loading at most
# IMAGE_FEATURE_LIMIT feature IDs, then print the report. Because the number of
# loaded feature IDs is capped, the "missing features" count is measured
# against that capped set only.
eda_stats = compute_eda(feature_id_limit=IMAGE_FEATURE_LIMIT)  
print_eda_report(eda_stats)


=== Exploratory Data Analysis ===
Ownership: 51,487 owners, avg items/owner=3.62, median=2, p90=8, p99=21, max=153

Interactions: 1,000,000
Users: 63,497; avg=15.75, median=4, p90=32, p99=197, max=2,260
Items: 178,788; avg=5.59, median=2, p90=8, p99=67, max=1,793
Timestamps span: 1307583271 to 1321254674 (2011-11-14)
Cold-start: users with 1 interaction=15,938, <=3=29,875; items with 1 interaction=84,996, <=3=136,787
Top 5 days by interactions: 2011-11-08: 9,862; 2011-11-02: 9,485; 2011-10-24: 9,482; 2011-11-09: 9,453; 2011-10-03: 9,340
Image features loaded: 70,000 (from file size est: 178,787); interaction items missing features: 108,788


## 3. Modeling

## 4. Evaluation & Results

## 5. Related Work