# Integration of Synthetic data

Now that we've created some synthetic data, we can start integrating it.

## Some Initial Checks
Before integrating, run a few sanity checks to confirm that the synthetic data is consistent with the history and news files.

---
### Are all user IDs in the history file present in the synthetic data (and vice versa)?

In [18]:
import pandas as pd
import ast

# Dataset locations: every file path below is derived from the base
# directory plus the dataset variant name.
data_path_base = "/app/datasets/"
MIND_type = "MINDsmall"
data_path = f"{data_path_base}{MIND_type}/"

# Input files
history_file = f"{data_path}history.tsv"
new_behaviors_file = f"{data_path}analysts_behavior.tsv"
news_df_file = f"{data_path}train/news.tsv"

# Read the two tab-separated tables (history first, then the synthetic behaviors)
history_df, synthetic_df = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (history_file, new_behaviors_file)
)


In [2]:
# The file is read with header=None, so the column labels are assigned
# explicitly after loading.
news_columns = [
    "news_id",
    "category",
    "subcategory",
    "title",
    "abstract",
    "url",
    "title_entities",
    "abstract_entities",
]
news_df = pd.read_csv(news_df_file, sep="\t", header=None)
news_df.columns = news_columns


In [4]:
# Show the column labels of both tables: history first, then synthetic.
display(history_df.columns, synthetic_df.columns)

Index(['uid', 'history'], dtype='object')

Index(['impression_id', 'user_id', 'time', 'history', 'impressions'], dtype='object')

In [5]:
# Users that are listed in history.tsv but do NOT appear in the synthetic data
missing_users = set(history_df["uid"]) - set(synthetic_df["user_id"])

print(f"Missing users in synthetic data: {len(missing_users)}")
if missing_users:
    # A set has no defined iteration order, so sort before sampling to print
    # the same examples on every run. key=str keeps the sort from raising if
    # the IDs are of mixed types (e.g. strings plus a missing value).
    print("Examples:", sorted(missing_users, key=str)[:10])


Missing users in synthetic data: 0


In [6]:
# Reverse direction: users that appear in the synthetic data but are NOT
# listed in history.tsv
missing_users = set(synthetic_df["user_id"]) - set(history_df["uid"])

print(f"Missing users in history: {len(missing_users)}")
if missing_users:
    # A set has no defined iteration order, so sort before sampling to print
    # the same examples on every run. key=str keeps the sort from raising if
    # the IDs are of mixed types (e.g. strings plus a missing value).
    print("Examples:", sorted(missing_users, key=str)[:10])


Missing users in history: 0


---
### Do history values refer to valid News IDs?

In [7]:
# Every news ID known to the news table
valid_news_ids = set(news_df["news_id"])


def _parse_history(raw):
    """Return the list of news IDs stored in one `history` cell.

    Accepts a Python list literal (e.g. "['N1', 'N2']") or a
    whitespace-separated string (e.g. "N1 N2"). A missing value (NaN/None)
    is treated as an empty history.

    Raises whatever `ast.literal_eval` raises for a malformed list literal.
    """
    if pd.isna(raw):
        return []
    text = str(raw).strip()
    if text.startswith("["):
        return ast.literal_eval(text)
    return text.split()


# Step 1: Parse every history cell (one list per row, same order as history_df)
parsed_lists = [_parse_history(raw) for raw in history_df["history"]]

# Step 2: Validate each history against the known news IDs
invalid_count = 0
for i, sublist in enumerate(parsed_lists):
    invalid_items = [item for item in sublist if item not in valid_news_ids]
    if invalid_items:
        invalid_count += 1
        print(f"List at index {i} has invalid items: {invalid_items}")

# Always print a summary so a clean run is distinguishable from a cell that never ran
print(f"Histories with invalid news IDs: {invalid_count}")


---

## Now let's combine some data 

In [30]:
import pandas as pd
from collections import defaultdict
import ast
# Step 1: Collect clicked news IDs per user.
# Each whitespace-separated impression token has the form "<news_id>-<label>";
# only tokens whose label is "1" (clicked) are recorded.
clicked_map = defaultdict(set)

for uid, impressions in zip(synthetic_df["user_id"], synthetic_df["impressions"]):
    if pd.isna(impressions):
        continue
    for item in impressions.split():
        # Split on the LAST hyphen so that a news ID which itself contains
        # "-" does not break the two-way unpacking. Tokens without any
        # hyphen are skipped (sep is empty).
        news_id, sep, label = item.rpartition("-")
        if sep and label == "1":  # clicked
            clicked_map[uid].add(news_id)

# Step 2: Compare against history.tsv.
# For every user, list the history items that never show up as a click.
mismatch_report = []

for uid, raw_history in zip(history_df["uid"], history_df["history"]):
    if pd.isna(raw_history):
        # No recorded history means there is nothing to verify for this user.
        continue
    try:
        text = raw_history.strip()
        # History is either a Python list literal or a whitespace-separated string.
        history_items = ast.literal_eval(text) if text.startswith("[") else text.split()
    except Exception as e:
        # Best effort: report the unparsable row and move on to the next user.
        print(f"Parse error for uid {uid}: {e}")
        continue

    # .get() (rather than indexing) avoids inserting empty entries into the defaultdict
    user_clicked = clicked_map.get(uid, set())
    missing = [nid for nid in history_items if nid not in user_clicked]

    if missing:
        mismatch_report.append((uid, missing))

# Step 3: Report (at most the first 10 mismatching users)
print(f"\nUsers with mismatches: {len(mismatch_report)}")
for uid, missing in mismatch_report[:10]:
    print(f"User {uid} has {len(missing)} history items not found in their clicks: {missing}")


Users with mismatches: 0



Users with mismatches: 0
