In [None]:
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

from timeit import default_timer
from datetime import timedelta


class Timer(object):
    """Timer class.

    Measures elapsed time with ``timeit.default_timer``. It can be driven
    explicitly through ``start()``/``stop()`` or used as a context manager.

    `Original code <https://github.com/miguelgfierro/pybase/blob/2298172a13fb4a243754acbc6029a4a2dcf72c20/log_base/timer.py>`_.

    Examples:
        >>> import time
        >>> t = Timer()
        >>> t.start()
        >>> time.sleep(1)
        >>> t.stop()
        >>> t.interval >= 1
        True
        >>> with Timer() as t:
        ...   time.sleep(1)
        >>> t.interval >= 1
        True
        >>> "Time elapsed {}".format(t) #doctest: +ELLIPSIS
        'Time elapsed 1...'
    """

    def __init__(self):
        self._timer = default_timer  # clock function; called to read the current time
        self._interval = 0  # last measured interval in seconds
        self.running = False  # True between start() and stop()

    def __enter__(self):
        """Start timing when entering a ``with`` block and return the timer."""
        self.start()
        return self

    def __exit__(self, *args):
        """Stop timing when leaving the ``with`` block (exceptions propagate)."""
        self.stop()

    def __str__(self):
        """Return the measured interval in seconds with four decimals."""
        return "{:0.4f}".format(self.interval)

    def start(self):
        """Start the timer."""
        self.init = self._timer()
        self.running = True

    def stop(self):
        """Stop the timer. Calculate the interval in seconds.

        Raises:
            ValueError: If ``start()`` was never called on this timer.
        """
        # Read the clock first so the validation below does not inflate the
        # measured interval.
        end = self._timer()
        try:
            begin = self.init
        except AttributeError:
            # ``init`` only exists once start() has run. Nothing is recorded
            # on the instance in this case.
            raise ValueError(
                "Timer has not been initialized: use start() or the contextual form with Timer() as t:"
            ) from None
        self.end = end
        self._interval = end - begin
        self.running = False

    @property
    def interval(self):
        """Get time interval in seconds.

        Returns:
            float: Seconds.

        Raises:
            ValueError: If the timer is still running.
        """
        if self.running:
            raise ValueError("Timer has not been stopped, please use stop().")
        return self._interval

In [25]:
import pandas as pd
import numpy as np
import time
from collections import deque

# Load the behaviors log of the MINDsmall dev split (tab-separated, no header row)
behaviors_columns = ["Impression ID", "User ID", "Time", "History", "Impressions"]
behaviors_df = pd.read_csv("../data/raw/MINDsmall_dev/behaviors.tsv", sep='\t', names=behaviors_columns, header=None)

# Clean data: drop rows with missing 'History' or 'Impressions' (NaN values).
# The explicit copy gives an independent frame for the column assignments below.
behaviors_df = behaviors_df.dropna(subset=["History", "Impressions"]).copy()

# Convert 'Time' to datetime and derive the timestamp (seconds since the epoch).
# Vectorized datetime arithmetic replaces a Python-level call per row.
# NOTE(review): assumes the parsed 'Time' values are timezone-naive - confirm.
behaviors_df['Time'] = pd.to_datetime(behaviors_df['Time'])
behaviors_df['Timestamp'] = (behaviors_df['Time'] - pd.Timestamp("1970-01-01")) / pd.Timedelta(seconds=1)

# Get the most recent timestamp from the data
latest_timestamp = behaviors_df['Timestamp'].max()

# Define the 7-day time window in seconds
TIME_WINDOW = 7 * 24 * 60 * 60  # 7 days in seconds
BUFFER_SIZE = 10000  # Maximum number of articles to store in the buffer

# Recency-weighted model parameters
DECAY_FACTOR = 0.9  # Factor by which recency decays

# Keep only interactions from the last 7 days, measured from the newest record
behaviors_df = behaviors_df[behaviors_df['Timestamp'] >= (latest_timestamp - TIME_WINDOW)]

# Recency-weighted recommender over a bounded buffer of article IDs
class RecencyWeightedRecommender:
    """Rank articles by click count discounted by how long ago they were seen.

    Articles are held in a bounded FIFO buffer. For every buffered article the
    recommender keeps its accumulated click value and the timestamp of its
    most recent interaction. An article leaves the buffer either because the
    buffer is full (the oldest-inserted article is dropped) or because its
    last interaction is more than ``time_window`` seconds older than the
    interaction currently being added.

    Args:
        buffer_size: Maximum number of distinct articles to retain
            (``None`` means unbounded, as for ``collections.deque``).
        decay_factor: Multiplier applied to an article's clicks per
            ``time_window`` of age.
        time_window: Age limit in seconds; also the unit of the decay
            exponent. Defaults to seven days.
    """

    def __init__(self, buffer_size, decay_factor, time_window=7 * 24 * 60 * 60):
        self.buffer_size = buffer_size
        self.decay_factor = decay_factor
        self.time_window = time_window
        self.buffer = deque(maxlen=buffer_size)  # article IDs in insertion order
        self.click_counts = {}  # accumulated click value per buffered article
        self.timestamps = {}  # timestamp of the last interaction per buffered article
        # Lower bound on the smallest stored timestamp; lets _apply_decay skip
        # the buffer scan when no article can possibly be outside the window.
        self._oldest = None
        # Largest timestamp ever passed to add_interaction; default reference
        # time for recommend().
        self._latest = None

    def add_interaction(self, article_id, clicked, timestamp):
        """Record one interaction and evict articles that fell out of range.

        Args:
            article_id: Identifier of the article interacted with.
            clicked: Click value added to the article's running total.
            timestamp: Time of the interaction, in seconds.
        """
        capacity = self.buffer.maxlen
        if capacity == 0:
            # A zero-capacity buffer can never hold an article; storing counts
            # for it would leave the dictionaries out of sync with the buffer.
            return

        # The dictionaries are kept in sync with the buffer, so a dict lookup
        # answers "is this article buffered?" without scanning the deque.
        if article_id in self.timestamps:
            self.click_counts[article_id] += clicked
        else:
            if capacity is not None and len(self.buffer) >= capacity:
                # deque(maxlen=...) would drop the oldest ID silently; evict
                # it explicitly so its counts and timestamp go with it.
                dropped = self.buffer.popleft()
                del self.click_counts[dropped]
                del self.timestamps[dropped]
            self.buffer.append(article_id)
            self.click_counts[article_id] = clicked

        # Update timestamp for recency weighting
        self.timestamps[article_id] = timestamp
        if self._oldest is None or timestamp < self._oldest:
            self._oldest = timestamp
        if self._latest is None or timestamp > self._latest:
            self._latest = timestamp

        # Evict articles that are now outside the time window
        self._apply_decay(timestamp)

    def _apply_decay(self, timestamp):
        """Drop every article last seen more than ``time_window`` before ``timestamp``."""
        if self._oldest is None or timestamp - self._oldest <= self.time_window:
            # Even the oldest possible entry is inside the window: nothing to do.
            return

        survivors = []
        for article in self.buffer:
            if timestamp - self.timestamps[article] > self.time_window:
                del self.click_counts[article]
                del self.timestamps[article]
            else:
                survivors.append(article)

        # Rebuild in place (one pass) instead of calling deque.remove per article.
        self.buffer.clear()
        self.buffer.extend(survivors)
        # Tighten the bound to the exact minimum of what is left.
        self._oldest = min(self.timestamps.values(), default=None)

    def recommend(self, now=None):
        """Return buffered article IDs ordered by recency-weighted clicks.

        Args:
            now: Reference time in seconds used to compute each article's age.
                Defaults to the latest timestamp passed to
                ``add_interaction``, so ages are measured on the same clock
                as the stored timestamps.

        Returns:
            list: Article IDs, highest score first. Empty when nothing is
            buffered.
        """
        if now is None:
            now = self._latest

        scores = {}
        for article in self.buffer:
            age = now - self.timestamps[article]
            # Clicks shrink by decay_factor for every time_window of age
            scores[article] = self.click_counts[article] * (self.decay_factor ** (age / self.time_window))

        # Sort articles by the weighted score
        return sorted(scores, key=scores.get, reverse=True)

# Initialize the recommender
recommender = RecencyWeightedRecommender(buffer_size=BUFFER_SIZE, decay_factor=DECAY_FACTOR)

# Expand each row's History into (article_id, clicked, timestamp) interactions.
# The three columns are iterated in lockstep, which avoids the per-row Series
# that DataFrame.iterrows() has to build.
interactions = []
for history, impressions, timestamp in zip(
    behaviors_df['History'], behaviors_df['Impressions'], behaviors_df['Timestamp']
):
    # 'Impressions' holds whitespace-separated 'NewsID-label' items.
    # NOTE(review): only the label of the FIRST impression is read, and it is
    # applied to every article in the row's History - confirm this is intended.
    clicked = int(impressions.split()[0].split('-')[1])

    # str.split() with no separator never yields empty tokens, so the article
    # IDs need no additional blank filtering.
    interactions.extend((article_id, clicked, timestamp) for article_id in history.split())

# Feed every interaction to the recommender
for article_id, clicked, timestamp in interactions:
    recommender.add_interaction(article_id, clicked, timestamp)

# Get top recommendations
recommended_articles = recommender.recommend()

# Output top 10 recommended articles
print("Top 10 Recommended Articles based on Recent Interactions:")
print(recommended_articles[:10])




KeyboardInterrupt: 

In [None]:
import pandas as pd
import time

# Define decay parameters
DECAY_FACTOR = 0.9       # decay multiplier per TIME_WINDOW
TIME_WINDOW = 604800     # one week in seconds


# Load datasets (both files are tab-separated and have no header row)
behaviors_file = "../data/raw/MINDsmall_dev/behaviors.tsv"
news_file = "../data/raw/MINDsmall_dev/news.tsv"
behaviors_df = pd.read_csv(behaviors_file, sep='\t', header=None, names=["Impression ID", "User ID", "Time", "History", "Impressions"])
news_df = pd.read_csv(news_file, sep='\t', header=None, names=["News ID", "Category", "SubCategory", "Title", "Abstract", "URL", "Title Entities", "Abstract Entities"])

# Drop rows with missing Impressions. The explicit copy gives an independent
# frame for the column assignments below.
behaviors_df = behaviors_df.dropna(subset=["Impressions"]).copy()

# Convert 'Time' to datetime and compute the epoch timestamp in seconds.
# Vectorized datetime arithmetic replaces a Python-level call per row.
# NOTE(review): assumes the parsed 'Time' values are timezone-naive - confirm.
behaviors_df['Time'] = pd.to_datetime(behaviors_df['Time'])
behaviors_df['Timestamp'] = (behaviors_df['Time'] - pd.Timestamp("1970-01-01")) / pd.Timedelta(seconds=1)

# Determine the most recent interaction timestamp (across all users)
latest_time = behaviors_df['Timestamp'].max()

# Filter the dataset to only include interactions from the past week
behaviors_df = behaviors_df[behaviors_df['Timestamp'] >= (latest_time - TIME_WINDOW)]

# Function to parse the Impressions string and extract clicked news IDs (where label is "1")
def parse_impressions(impressions_str):
    """Return the news IDs whose click label is "1".

    Args:
        impressions_str: Whitespace-separated items of the form
            "NewsID-label".

    Returns:
        list: News IDs labelled "1", in their original order. Items that
        carry no "-label" suffix are skipped.
    """
    clicked = []
    for item in impressions_str.split():
        # Split on the LAST dash only: an item without a label, or an ID that
        # itself contains a dash, must not break the two-way unpacking.
        news_id, dash, click_label = item.rpartition('-')
        if dash and click_label == "1":
            clicked.append(news_id)
    return clicked

# Build a per-row list of clicked article IDs from the raw Impressions strings
behaviors_df['Clicked_Articles'] = behaviors_df['Impressions'].map(parse_impressions)


def collect_to_list(df=None, limit=100):
    """Flatten the clicked articles of the first rows into one list.

    Args:
        df: DataFrame with a "Clicked_Articles" column holding a list of
            article IDs per row. Defaults to the module-level
            ``behaviors_df``.
        limit: Number of leading rows to read. Defaults to 100.

    Returns:
        list: All clicked article IDs from the selected rows, in row order.
        The list is also printed.
    """
    if df is None:
        df = behaviors_df
    read_articles = [
        article
        for clicked in df["Clicked_Articles"].head(limit)
        for article in clicked
    ]
    print(read_articles)
    return read_articles

collect_to_list()

# NOTE: recommended_articles is not computed in this cell; it must already
# exist in the session (it is assigned by the recommender cell).
# Slice to ten entries so the output matches the header.
print("Top 10 Recommended Articles based on recent one-week interactions and dynamic recency decay:")
print(recommended_articles[:10])




['N31958']
['N23513']
['N5940']
['N15347']
['N5940', 'N31958']
['N24802', 'N31958']
['N42767']
['N20477', 'N57818']
['N60939']
['N28640', 'N24802']
['N24802', 'N15347']
['N11390', 'N51470', 'N24802']
['N31958']
['N36940']
['N6400']
['N13854', 'N5507']
['N31958']
['N51008']
['N6400']
['N23513']
['N62365']
['N32536']
['N46976', 'N31958']
['N37204', 'N496']
['N62365']
['N42767']
['N20477']
['N23513']
['N54911']
['N43277']
['N47383']
['N38620']
['N29862']
['N12409', 'N52492']
['N20036']
['N42844']
['N1567']
['N31958']
['N30290', 'N48426', 'N49285', 'N14223']
['N24802']
['N31958']
['N50775']
['N55237']
['N3168']
['N26063', 'N3788']
['N23513', 'N31958']
['N52492']
['N58748']
['N58748', 'N42844', 'N13270', 'N30290']
['N42844']
['N20036']
['N30290', 'N19990', 'N5940', 'N29393']
['N20036', 'N5051']
['N60762']
['N17672']
['N16118', 'N42180', 'N46279']
['N36779']
['N57560']
['N6916', 'N58098']
['N6400']
['N24802', 'N55237', 'N11390']
['N53572']
['N10083', 'N23675']
['N18708']
['N9407']
['N31958',