## MIND Data Exploration

In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
from datetime import datetime
import os
import warnings
warnings.filterwarnings("ignore")

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

In [2]:
# ============================================================================
# 1. LOAD AND UNDERSTAND DATA STRUCTURE
# ============================================================================

# Define data paths
TRAIN_DIR = '../data/raw/train/'
DEV_DIR = '../data/raw/dev/'

# List available files
print("\nFiles in training directory:")
train_files = os.listdir(TRAIN_DIR)
for f in sorted(train_files):
    size_mb = os.path.getsize(os.path.join(TRAIN_DIR, f)) / (1024*1024)
    print(f"  {f:<30} {size_mb:>10.2f} MB")


Files in training directory:
  __placeholder__                      0.00 MB
  behaviors.tsv                     1310.20 MB
  entity_embedding.vec                38.44 MB
  news.tsv                            80.95 MB
  relation_embedding.vec               1.00 MB


In [4]:
# ============================================================================
# 2. LOAD NEWS ARTICLES
# ============================================================================
news_columns = ['news_id', 'category', 'subcategory', 'title', 'abstract', 
                'url', 'title_entities', 'abstract_entities']

news_train = pd.read_csv(
    os.path.join(TRAIN_DIR, 'news.tsv'),
    sep='\t',
    names=news_columns,
    encoding='utf-8'
)

print(f"\nTotal news articles: {len(news_train):,}")
print(f"\nFirst few articles:")
print(news_train.head())


Total news articles: 101,527

First few articles:
  news_id   category               subcategory  \
0  N88753  lifestyle           lifestyleroyals   
1  N45436       news  newsscienceandtechnology   
2  N23144     health                weightloss   
3  N86255     health                   medical   
4  N93187       news                 newsworld   

                                               title  \
0  The Brands Queen Elizabeth, Prince Charles, an...   
1    Walmart Slashes Prices on Last-Generation iPads   
2                      50 Worst Habits For Belly Fat   
3  Dispose of unwanted prescription drugs during ...   
4  The Cost of Trump's Aid Freeze in the Trenches...   

                                            abstract  \
0  Shop the notebooks, jackets, and more that the...   
1  Apple's new iPad releases bring big deals on l...   
2  These seemingly harmless habits are holding yo...   
3                                                NaN   
4  Lt. Ivan Molchanets peeked o

In [8]:
print(f"Data types:\n{news_train.dtypes}")
print(f"\nMissing values:\n{news_train.isnull().sum()}")

Data types:
news_id              object
category             object
subcategory          object
title                object
abstract             object
url                  object
title_entities       object
abstract_entities    object
dtype: object

Missing values:
news_id                 0
category                0
subcategory             0
title                   0
abstract             5415
url                     0
title_entities          3
abstract_entities       6
dtype: int64


In [9]:
# ============================================================================
# 3. ANALYZE NEWS CATEGORIES
# ============================================================================
# Category distribution
category_counts = news_train['category'].value_counts()
print(f"\nNumber of unique categories: {len(category_counts)}")
print(f"\nTop 10 categories:")
print(category_counts.head(10))

# Subcategory distribution
subcategory_counts = news_train['subcategory'].value_counts()
print(f"\nNumber of unique subcategories: {len(subcategory_counts)}")
print(f"\nTop 10 subcategories:")
print(subcategory_counts.head(10))


Number of unique categories: 18

Top 10 categories:
category
sports          32020
news            30478
finance          5916
travel           4955
lifestyle        4570
video            4569
foodanddrink     4418
weather          4255
autos            3071
health           2929
Name: count, dtype: int64

Number of unique subcategories: 285

Top 10 subcategories:
subcategory
newsus               14467
football_nfl         11813
newspolitics          5145
weathertopstories     4253
newscrime             3676
baseball_mlb          3617
football_ncaa         3450
news                  3351
basketball_nba        3226
more_sports           2801
Name: count, dtype: int64


In [10]:
# ============================================================================
# 4. ANALYZE TEXT CHARACTERISTICS
# ============================================================================
# Title lengths
news_train['title_length'] = news_train['title'].fillna('').apply(len)
news_train['title_word_count'] = news_train['title'].fillna('').apply(
    lambda x: len(x.split())
)

print(f"\nTitle statistics:")
print(f"  Mean length: {news_train['title_length'].mean():.1f} characters")
print(f"  Mean words: {news_train['title_word_count'].mean():.1f}")
print(f"  Max length: {news_train['title_length'].max()} characters")

# Abstract lengths (many might be missing)
news_train['abstract_length'] = news_train['abstract'].fillna('').apply(len)
print(f"\nAbstract statistics:")
print(f"  Non-empty abstracts: {(news_train['abstract_length'] > 0).sum():,} "
      f"({(news_train['abstract_length'] > 0).mean()*100:.1f}%)")
print(f"  Mean length (when present): "
      f"{news_train[news_train['abstract_length'] > 0]['abstract_length'].mean():.1f} characters")


Title statistics:
  Mean length: 66.0 characters
  Mean words: 10.7
  Max length: 554 characters

Abstract statistics:
  Non-empty abstracts: 96,112 (94.7%)
  Mean length (when present): 228.7 characters


In [None]:
# ============================================================================
# 5. LOAD USER BEHAVIORS (IMPRESSIONS)
# ============================================================================
# Behaviors.tsv columns: [Impression ID, User ID, Time, History, Impressions]
# History: Space-separated news IDs that user clicked before
# Impressions: Space-separated News-Label pairs (News-1 means clicked, News-0 means not)

behaviors_columns = ['impression_id', 'user_id', 'time', 'history', 'impressions']

# Load first 100k rows for exploration (full dataset is large)
behaviors_train = pd.read_csv(
    os.path.join(TRAIN_DIR, 'behaviors.tsv'),
    sep='\t',
    names=behaviors_columns,
    nrows=100000
)

print(f"\nTotal impressions (first 100k): {len(behaviors_train):,}")
print(f"\nFirst few behaviors:")
print(behaviors_train.head())


Total impressions (first 100k): 100,000

First few behaviors:
   impression_id  user_id                    time  \
0              1   U87243  11/10/2019 11:30:54 AM   
1              2  U598644   11/12/2019 1:45:29 PM   
2              3  U532401  11/13/2019 11:23:03 AM   
3              4  U593596  11/12/2019 12:24:09 PM   
4              5  U239687   11/14/2019 8:03:01 PM   

                                             history  \
0  N8668 N39081 N65259 N79529 N73408 N43615 N2937...   
1  N56056 N8726 N70353 N67998 N83823 N111108 N107...   
2  N128643 N87446 N122948 N9375 N82348 N129412 N5...   
3  N31043 N39592 N4104 N8223 N114581 N92747 N1207...   
4  N65250 N122359 N71723 N53796 N41663 N41484 N11...   

                                         impressions  
0  N78206-0 N26368-0 N7578-0 N58592-0 N19858-0 N5...  
1  N47996-0 N82719-0 N117066-0 N8491-0 N123784-0 ...  
2              N103852-0 N53474-0 N127836-0 N47925-1  
3  N38902-0 N76434-0 N71593-0 N100073-0 N108736-0...  
4  N76

In [12]:
# ============================================================================
# 6. ANALYZE USER BEHAVIORS
# ============================================================================
# Number of unique users
n_users = behaviors_train['user_id'].nunique()
print(f"\nUnique users in sample: {n_users:,}")

# Parse impressions to count clicks
def parse_impressions(imp_str):
    """Parse impression string to get clicked and non-clicked counts"""
    if pd.isna(imp_str):
        return 0, 0
    
    impressions = imp_str.split()
    clicked = sum(1 for imp in impressions if imp.endswith('-1'))
    not_clicked = sum(1 for imp in impressions if imp.endswith('-0'))
    return clicked, not_clicked

behaviors_train[['clicked', 'not_clicked']] = behaviors_train['impressions'].apply(
    lambda x: pd.Series(parse_impressions(x))
)

behaviors_train['total_shown'] = behaviors_train['clicked'] + behaviors_train['not_clicked']
behaviors_train['ctr'] = behaviors_train['clicked'] / behaviors_train['total_shown']

print(f"\nImpression statistics:")
print(f"  Mean articles shown per impression: {behaviors_train['total_shown'].mean():.2f}")
print(f"  Mean clicks per impression: {behaviors_train['clicked'].mean():.2f}")
print(f"  Overall CTR: {behaviors_train['ctr'].mean()*100:.2f}%")

# Analyze user history lengths
behaviors_train['history_length'] = behaviors_train['history'].fillna('').apply(
    lambda x: len(x.split()) if x else 0
)

print(f"\nUser history statistics:")
print(f"  Users with history: {(behaviors_train['history_length'] > 0).sum():,} "
      f"({(behaviors_train['history_length'] > 0).mean()*100:.1f}%)")
print(f"  Mean history length: {behaviors_train[behaviors_train['history_length'] > 0]['history_length'].mean():.1f} articles")
print(f"  Median history length: {behaviors_train[behaviors_train['history_length'] > 0]['history_length'].median():.0f} articles")
print(f"  Max history length: {behaviors_train['history_length'].max()}")



Unique users in sample: 90,101

Impression statistics:
  Mean articles shown per impression: 37.41
  Mean clicks per impression: 1.52
  Overall CTR: 10.76%

User history statistics:
  Users with history: 97,936 (97.9%)
  Mean history length: 33.7 articles
  Median history length: 20 articles
  Max history length: 801


In [13]:
# ============================================================================
# 7. TEMPORAL ANALYSIS
# ============================================================================
# Parse timestamps
behaviors_train['timestamp'] = pd.to_datetime(
    behaviors_train['time'], 
    format='%m/%d/%Y %I:%M:%S %p'
)

behaviors_train['hour'] = behaviors_train['timestamp'].dt.hour
behaviors_train['day_of_week'] = behaviors_train['timestamp'].dt.dayofweek
behaviors_train['date'] = behaviors_train['timestamp'].dt.date

print(f"\nTime range:")
print(f"  Start: {behaviors_train['timestamp'].min()}")
print(f"  End: {behaviors_train['timestamp'].max()}")
print(f"  Duration: {(behaviors_train['timestamp'].max() - behaviors_train['timestamp'].min()).days} days")

# Impressions per day
daily_impressions = behaviors_train.groupby('date').size()
print(f"\nDaily impressions:")
print(f"  Mean: {daily_impressions.mean():.0f}")
print(f"  Std: {daily_impressions.std():.0f}")



Time range:
  Start: 2019-11-09 00:00:25
  End: 2019-11-14 23:59:54
  Duration: 5 days

Daily impressions:
  Mean: 16667
  Std: 5944


In [14]:
# ============================================================================
# 8. ANALYZE CLICK PATTERNS
# ============================================================================
# Extract all clicked news IDs
all_clicked_news = []
for imp_str in behaviors_train['impressions'].dropna():
    impressions = imp_str.split()
    clicked = [imp.split('-')[0] for imp in impressions if imp.endswith('-1')]
    all_clicked_news.extend(clicked)

clicked_news_counts = Counter(all_clicked_news)
print(f"\nTotal clicks in sample: {len(all_clicked_news):,}")
print(f"Unique news articles clicked: {len(clicked_news_counts):,}")

# Most popular articles
print(f"\nTop 10 most clicked articles:")
for news_id, count in clicked_news_counts.most_common(10):
    if news_id in news_train['news_id'].values:
        article = news_train[news_train['news_id'] == news_id].iloc[0]
        print(f"  {news_id}: {count:>4} clicks - "
              f"[{article['category']}] {article['title'][:60]}...")

# Click distribution
click_counts = list(clicked_news_counts.values())
print(f"\nClick distribution:")
print(f"  Mean clicks per article: {np.mean(click_counts):.2f}")
print(f"  Median clicks per article: {np.median(click_counts):.0f}")
print(f"  Max clicks per article: {np.max(click_counts)}")



Total clicks in sample: 152,347
Unique news articles clicked: 6,642

Top 10 most clicked articles:
  N98178: 2714 clicks - [sports] Charles Rogers, former Michigan State football, Detroit Lion...
  N30899: 2138 clicks - [news] College gymnast dies following training accident in Connecti...
  N32154: 2110 clicks - [news] Porsche launches into second story of New Jersey building, k...
  N47257: 1833 clicks - [tv] Rip Taylor's Cause of Death Revealed, Memorial Service Sched...
  N7937: 1639 clicks - [finance] Dean Foods files for bankruptcy...
  N31174: 1487 clicks - [music] Broadway Actress Laurel Griggs Dies at Age 13...
  N3664: 1447 clicks - [music] Broadway Star Laurel Griggs Suffered Asthma Attack Before Sh...
  N47925: 1347 clicks - [news] Three school workers charged in death of special needs stude...
  N53474: 1220 clicks - [news] Rep. Tim Ryan endorses Biden in Democratic primary...
  N76665: 1176 clicks - [lifestyle] Prince Harry and Meghan Markle just shared a never-before-se

In [15]:
# ============================================================================
# 9. CATEGORY-LEVEL CLICK ANALYSIS
# ============================================================================
# Get categories of clicked articles
clicked_categories = []
for news_id in all_clicked_news:
    if news_id in news_train['news_id'].values:
        cat = news_train[news_train['news_id'] == news_id].iloc[0]['category']
        clicked_categories.append(cat)

category_click_counts = Counter(clicked_categories)
print(f"\nMost clicked categories:")
for cat, count in category_click_counts.most_common(10):
    print(f"  {cat:>20}: {count:>6} clicks ({count/len(all_clicked_news)*100:>5.2f}%)")


Most clicked categories:
                  news:  44900 clicks (29.47%)
                sports:  17773 clicks (11.67%)
             lifestyle:  17220 clicks (11.30%)
               finance:  13218 clicks ( 8.68%)
                 music:  10327 clicks ( 6.78%)
                    tv:   9286 clicks ( 6.10%)
                health:   7131 clicks ( 4.68%)
          foodanddrink:   7079 clicks ( 4.65%)
         entertainment:   6867 clicks ( 4.51%)
                travel:   5366 clicks ( 3.52%)


In [16]:
# ============================================================================
# 10. USER ENGAGEMENT ANALYSIS
# ============================================================================
# Impressions per user
user_impressions = behaviors_train.groupby('user_id').size()
print(f"\nImpressions per user:")
print(f"  Mean: {user_impressions.mean():.2f}")
print(f"  Median: {user_impressions.median():.0f}")
print(f"  Max: {user_impressions.max()}")

# Clicks per user
user_clicks = behaviors_train.groupby('user_id')['clicked'].sum()
print(f"\nClicks per user:")
print(f"  Mean: {user_clicks.mean():.2f}")
print(f"  Median: {user_clicks.median():.0f}")
print(f"  Users with 0 clicks: {(user_clicks == 0).sum()} "
      f"({(user_clicks == 0).mean()*100:.1f}%)")

# User CTR distribution
user_total_shown = behaviors_train.groupby('user_id')['total_shown'].sum()
user_ctr = user_clicks / user_total_shown
print(f"\nUser-level CTR:")
print(f"  Mean: {user_ctr.mean()*100:.2f}%")
print(f"  Median: {user_ctr.median()*100:.2f}%")
print(f"  Std: {user_ctr.std()*100:.2f}%")


Impressions per user:
  Mean: 1.11
  Median: 1
  Max: 7

Clicks per user:
  Mean: 1.69
  Median: 1
  Users with 0 clicks: 0 (0.0%)

User-level CTR:
  Mean: 10.50%
  Median: 5.56%
  Std: 12.55%


In [17]:
# ============================================================================
# 11. COLD START ANALYSIS
# ============================================================================
# Users with no history
cold_start_users = (behaviors_train['history_length'] == 0).sum()
print(f"\nCold-start users (no history): {cold_start_users:,} "
      f"({cold_start_users/len(behaviors_train)*100:.1f}%)")

# Compare CTR between cold-start and warm users
cold_start_ctr = behaviors_train[behaviors_train['history_length'] == 0]['ctr'].mean()
warm_start_ctr = behaviors_train[behaviors_train['history_length'] > 0]['ctr'].mean()

print(f"\nCTR comparison:")
print(f"  Cold-start users: {cold_start_ctr*100:.2f}%")
print(f"  Warm users: {warm_start_ctr*100:.2f}%")
print(f"  Difference: {(warm_start_ctr - cold_start_ctr)*100:.2f} percentage points")


Cold-start users (no history): 2,064 (2.1%)

CTR comparison:
  Cold-start users: 10.72%
  Warm users: 10.76%
  Difference: 0.03 percentage points


Based on the data exploration, here are key considerations for your RL system:

1. STATE REPRESENTATION:
   - User history length varies significantly (0 to 50+ articles)
   - Need to handle cold-start users (~10-20% have no history)
   - Temporal features matter (time of day, recency)

2. ACTION SPACE:
   - Start with K=100 candidates as proposed
   - Article popularity follows power law distribution
   - Categories: ~15-20 distinct categories to consider

3. REWARD DESIGN:
   - Overall CTR is low (~5-10%), indicating sparse rewards
   - Need reward shaping or intrinsic motivation
   - Consider dwell time if available in full dataset

4. DIVERSITY CONCERNS:
   - Some categories dominate clicks (news, sports, lifestyle)
   - Need diversity metrics to avoid filter bubbles
   - Balance popularity bias vs. personalization

5. TEMPORAL DYNAMICS:
   - Dataset spans multiple days/weeks
   - Article relevance decays over time
   - User preferences may shift

6. EVALUATION CONSIDERATIONS:
   - Need to handle position bias in clicks
   - Off-policy evaluation critical (logged data)
   - Split data chronologically (not randomly)