In [25]:
import pandas as pd
import os
import json

In [26]:
# Locations of the raw input files
behaviors_path = r"data/MINDsmall_train/behaviors.tsv"
news_path = r"data/MINDsmall_train/news.tsv"

# Column names are supplied explicitly; header=None makes pandas treat the
# first line of each file as data rather than as a header row.
behaviors_columns = ['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']
news_columns = ['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'TitleEntities', 'AbstractEntities']

# Read both tab-separated files into DataFrames
behaviors = pd.read_csv(behaviors_path, sep='\t', header=None, names=behaviors_columns)
news = pd.read_csv(news_path, sep='\t', header=None, names=news_columns)

In [27]:
# Flatten the Impressions column: each space-separated "<NewsID>-<Clicked>"
# token ends up on its own row, split into two columns.
impression_tokens = behaviors['Impressions'].apply(lambda raw: raw.split(' '))
behaviors['Impressions'] = impression_tokens
behaviors = behaviors.explode('Impressions')

# Split every token on '-' into the news id and the click flag
news_and_click = behaviors['Impressions'].str.split('-', expand=True)
behaviors[['NewsID', 'Clicked']] = news_and_click

# The combined token is redundant once it has been split
behaviors = behaviors.drop(columns=['Impressions'])

In [28]:
# Transform time column: parse the timestamps, then derive calendar features
behaviors['Time'] = pd.to_datetime(behaviors['Time'])
behaviors['Date'] = behaviors['Time'].dt.date
behaviors['Hour'] = behaviors['Time'].dt.hour
# day_name() returns English weekday names by default, whereas strftime('%A')
# follows the process locale and could emit non-English names.
behaviors['DayOfWeek'] = behaviors['Time'].dt.day_name()
# dayofweek numbers Monday=0 ... Sunday=6, so values 5 and 6 are the weekend.
# Using the numeric weekday keeps this flag independent of any name spelling.
behaviors['IsWeekend'] = behaviors['Time'].dt.dayofweek >= 5

In [None]:
# Transform TitleEntities & AbstractEntities

def ExtractLabels(entity_json, threshold = 0.9):
    """Return the labels of entities whose confidence reaches ``threshold``.

    Parameters
    ----------
    entity_json : str
        JSON text expected to decode to a list of entity objects, each with
        a 'Label' and a numeric 'Confidence' key.
    threshold : float, default 0.9
        Minimum 'Confidence' (inclusive) an entity needs to be kept.

    Returns
    -------
    list
        Labels of the qualifying entities, in input order. An empty list is
        returned when the input is not a string (e.g. NaN/None), is not
        valid JSON, or does not decode to a JSON array.

    Notes
    -----
    Entities that are not objects, or that lack 'Label' or a numeric
    'Confidence', are skipped individually instead of discarding every
    label in the row.
    """
    try:
        entities = json.loads(entity_json)
    except (TypeError, ValueError):
        # TypeError: non-string input such as NaN or None.
        # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
        return []

    if not isinstance(entities, list):
        return []

    labels = []
    for entity in entities:
        if not isinstance(entity, dict) or 'Label' not in entity:
            continue
        confidence = entity.get('Confidence')
        if isinstance(confidence, (int, float)) and confidence >= threshold:
            labels.append(entity['Label'])
    return labels

# Parse each raw entity JSON column into a list of entity labels
title_labels = news['TitleEntities'].apply(ExtractLabels)
abstract_labels = news['AbstractEntities'].apply(ExtractLabels)
news['TitleEntityLabels'] = title_labels
news['AbstractEntityLabels'] = abstract_labels

# Fan the lists out so that every label sits on its own row
news_title_exploded = news.explode(column='TitleEntityLabels')
news_abstract_exploded = news.explode(column='AbstractEntityLabels')

In [None]:
# Trim each exploded frame: drop the article attributes and the entity
# columns that belong to the other side (title vs. abstract).
article_columns = ['Category','SubCategory','Title','Abstract','URL']
news_title_exploded = news_title_exploded.drop(
    columns = article_columns + ['AbstractEntities','AbstractEntityLabels']
)
news_abstract_exploded = news_abstract_exploded.drop(
    columns = article_columns + ['TitleEntities','TitleEntityLabels']
)

# Write the flattened tables to CSV (the index is written too, as to_csv does by default)
csv_outputs = (
    (news_title_exploded, 'data/news_title_labels.csv'),
    (news_abstract_exploded, 'data/news_abstract_labels.csv'),
    (behaviors, 'data/behaviors_flatten.csv'),
)
for frame, target in csv_outputs:
    frame.to_csv(target)

In [32]:
# Build the news dimension table: every news column except the raw entity
# JSON and the derived label lists, then save it as CSV.
entity_columns = ['TitleEntities', 'AbstractEntities','AbstractEntityLabels','TitleEntityLabels']
news_dim = news.drop(columns = entity_columns)
news_dim.to_csv('data/news_dim.csv')