## Crafting User Profiles from the original data

The idea is to load the original datasets (the behaviors log and the news metadata) and shape them into per-user profiles, the "analysts".

In [2]:
import json
import time
from collections import Counter

import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm

# Which MIND variant to use; it selects the sub-directory under the data root.
MIND_type = 'MINDsmall'

# Root folder of the datasets and the folder of the selected variant.
data_path_base = "/app/datasets/"
data_path = f"{data_path_base}{MIND_type}/"

# Location of the training behaviors log.
behaviors_file = f"{data_path}train/behaviors.tsv"
print(f"Behaviors File {behaviors_file}")

# Location of the training news metadata; column names are supplied explicitly.
news_file = f"{data_path}train/news.tsv"
news_df = pd.read_csv(
    news_file,
    sep="\t",
    names=[
        "news_id",
        "category",
        "subcategory",
        "title",
        "abstract",
        "url",
        "title_entities",
        "abstract_entities",
    ],
)
print(f"News file {news_file}")

# Load the behaviors data
columns = ["impression_id", "user_id", "time", "history", "impressions"]
behaviors_df = pd.read_csv(behaviors_file, sep="\t", names=columns)

# Display basic statistics and data sample
#print(behaviors_df.info())


def print_elapsed_time(start_time):
    """
    Report how much wall-clock time has passed since `start_time`.

    The duration is broken down into whole hours, minutes and seconds
    (fractions of a second are truncated) and printed on one line.

    Args:
        start_time (float): Reference timestamp, as returned by time.time().
    """
    duration = time.time() - start_time
    # Peel off whole hours first, then split what is left into minutes/seconds.
    whole_hours, leftover = divmod(duration, 3600)
    whole_minutes, leftover_seconds = divmod(leftover, 60)
    h = int(whole_hours)
    m = int(whole_minutes)
    s = int(leftover_seconds)
    print(f"Elapsed Time: {h}h {m}m {s}s")



Behaviors File /app/datasets/MINDsmall/train/behaviors.tsv
News file /app/datasets/MINDsmall/train/news.tsv


In [None]:
# Preview the first rows of the behaviors frame as loaded from behaviors.tsv.
behaviors_df.head()

In [None]:
# Preview the first rows of the news metadata frame as loaded from news.tsv.
news_df.head()

In [None]:
# Build 'history_articles' from the raw 'history' column.
# MIND stores a user's click history as one whitespace-separated string of
# news IDs; users without any history have NaN there, which becomes [].
# Deriving the column from 'history' (instead of reading a pre-existing
# 'history_articles' column) keeps this cell working on a fresh kernel and
# makes it safe to re-run.
behaviors_df['history_articles'] = behaviors_df['history'].apply(
    lambda history: history.split() if isinstance(history, str) else []
)

# Count session length (history length)
behaviors_df['session_length'] = behaviors_df['history_articles'].apply(len)


In [None]:
# Parse impressions and clicks
def parse_impressions(impressions_str):
    """Turn a raw impressions string into a list of ``(news_id, clicked)`` pairs.

    Args:
        impressions_str (str): Whitespace-separated tokens of the form
            ``"<news_id>-<label>"``, e.g. ``"N1-1 N2-0"``.

    Returns:
        list[tuple[str, int]]: One ``(news_id, label)`` tuple per token, in
        input order. A blank string yields an empty list.

    Raises:
        IndexError: If a token contains no ``-`` separator.
        ValueError: If the label part of a token is not an integer.
    """
    # split() without an argument ignores leading/trailing/repeated whitespace,
    # so stray spaces no longer produce empty tokens that crash the parser.
    token_parts = (token.split("-") for token in impressions_str.split())
    return [(parts[0], int(parts[1])) for parts in token_parts]

# Expand every raw impressions string into its (news_id, clicked) pairs.
behaviors_df['impressions_parsed'] = behaviors_df['impressions'].apply(parse_impressions)


def _session_ctr(parsed_impressions):
    """Share of the impressions in one session whose click label is set."""
    total_clicks = sum([label for _, label in parsed_impressions])
    return total_clicks / len(parsed_impressions)


# Calculate CTR per session
behaviors_df['ctr'] = behaviors_df['impressions_parsed'].apply(_session_ctr)


In [None]:
# Re-inspect the behaviors frame now that the derived columns
# (history_articles, session_length, impressions_parsed, ctr) exist.
behaviors_df.head()

In [None]:
# Group by user to aggregate data
def _concat_histories(histories):
    """Flatten one user's per-session article lists into a single list.

    Does explicitly, in one pass, what ``'sum'`` over list objects did
    implicitly through repeated list concatenation.
    """
    return [article for session in histories for article in session]


# One row per user: mean session length, mean CTR, and all history articles.
# NOTE(review): if a user's history is repeated on each of their sessions, the
# combined list contains those articles once per session - confirm whether
# de-duplication is wanted before using raw counts.
user_profiles = behaviors_df.groupby('user_id').agg({
    'session_length': 'mean',
    'ctr': 'mean',
    'history_articles': _concat_histories,  # Combine history across sessions
}).reset_index()


In [None]:
# Map history articles to topics using news.tsv
news_topic_mapping = news_df.set_index('news_id')['category'].to_dict()


def _topic_distribution(articles):
    """Return ``{category: share}`` for one user's history articles.

    Articles whose ID is not in ``news_topic_mapping`` (or whose category is
    missing) are ignored, and shares are normalised over the remaining
    articles, matching ``value_counts(normalize=True)``. An empty or fully
    unmapped history yields an empty dict.
    """
    topic_counts = Counter(
        topic
        for topic in (news_topic_mapping.get(article) for article in articles)
        if pd.notna(topic)
    )
    total = sum(topic_counts.values())
    # most_common() keeps the most frequent topics first.
    return {topic: count / total for topic, count in topic_counts.most_common()}


# Plain .apply is used on purpose: the .swifter accessor only exists after
# `import swifter`, which this notebook never does, so it failed on a fresh
# kernel. Counting with Counter avoids building a pandas Series per user.
user_profiles['topics'] = user_profiles['history_articles'].apply(_topic_distribution)

user_profiles.head()

In [None]:
# Show the first rows of the news metadata frame again for reference.
news_df.head()