In [None]:
import pandas as pd
import numpy as np
import os
import json

In [2]:
# Locations of the raw input files, relative to the notebook's working directory
behaviors_path = r"data/MINDsmall_train/behaviors.tsv"
news_path = r"data/MINDsmall_train/news.tsv"

# Impression log: read as tab-separated text without a header row, so the
# column names are supplied explicitly.
behaviors = pd.read_csv(
    behaviors_path,
    sep='\t',
    header=None,
    names=['ImpressionID', 'UserID', 'Time', 'History', 'Impressions'],
)

# News catalogue: read the same way (tab-separated, no header row).
news = pd.read_csv(
    news_path,
    sep='\t',
    header=None,
    names=[
        'NewsID', 'Category', 'SubCategory', 'Title', 'Abstract',
        'URL', 'TitleEntities', 'AbstractEntities',
    ],
)

In [4]:
# Number of distinct UserID values; dropna=False keeps a missing value in the count,
# matching len(unique()).
behaviors['UserID'].nunique(dropna=False)

50000

In [21]:
# Transform Impressions column
# Each raw Impressions value is split on spaces into tokens, every token gets its
# own row, and each token is then split on its last '-' into NewsID and Clicked
# (both are kept as strings).
# The guard keeps the cell re-runnable: once Impressions has been dropped there
# is nothing left to transform, and re-running would otherwise raise a KeyError.
if 'Impressions' in behaviors.columns:
    # The vectorized .str.split leaves missing values as NaN instead of raising
    # the way a Python-level x.split(' ') does on a non-string.
    behaviors['Impressions'] = behaviors['Impressions'].str.split(' ')
    behaviors = behaviors.explode('Impressions')
    # Splitting once from the right yields at most two columns, so the
    # two-column assignment cannot fail if an ID itself contains '-'.
    behaviors[['NewsID', 'Clicked']] = behaviors['Impressions'].str.rsplit('-', n=1, expand=True)
    behaviors = behaviors.drop(columns=['Impressions'])

In [22]:
# Transform time column
# Parse the raw timestamp values once, then derive calendar features from the
# parsed datetimes.
behaviors['Time'] = pd.to_datetime(behaviors['Time'])
behaviors['Date'] = behaviors['Time'].dt.date
behaviors['Hour'] = behaviors['Time'].dt.hour
# day_name() returns English weekday names by default, whereas strftime('%A')
# follows the process locale. The weekend check below matches English names,
# so the weekday column must not depend on the locale.
behaviors['DayOfWeek'] = behaviors['Time'].dt.day_name()
behaviors['IsWeekend'] = behaviors['DayOfWeek'].isin(['Saturday', 'Sunday'])

In [23]:
# Transform TitleEntities & AbstractEntities

def ExtractLabels(entity_json, threshold = 0.9):
    """Return the labels of the entities whose confidence is at least `threshold`.

    Parameters
    ----------
    entity_json : str
        JSON text expected to decode to a list of entity objects, each with a
        'Label' and a numeric 'Confidence'. Input that cannot be decoded
        (missing values such as None/NaN, malformed JSON) or that does not
        decode to a list yields an empty list.
    threshold : float, default 0.9
        Minimum 'Confidence' (inclusive) an entity needs for its label to be kept.

    Returns
    -------
    list
        The qualifying labels in their original order; empty when none qualify.
    """
    try:
        entities = json.loads(entity_json)
    except (TypeError, ValueError):
        # TypeError: the value is not str/bytes (e.g. a NaN cell);
        # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
        return []

    if not isinstance(entities, list):
        return []

    labels = []
    for entity in entities:
        # Skip only the malformed entry rather than discarding every label of
        # the row because one entity lacks a field.
        if not isinstance(entity, dict) or 'Label' not in entity:
            continue
        confidence = entity.get('Confidence')
        if isinstance(confidence, (int, float)) and confidence >= threshold:
            labels.append(entity['Label'])
    return labels

# Run ExtractLabels over each raw entity-JSON column; every row ends up with a
# list of labels for its title and another for its abstract.
news['TitleEntityLabels'] = news.TitleEntities.apply(ExtractLabels)
news['AbstractEntityLabels'] = news.AbstractEntities.apply(ExtractLabels)

# One row per label: explode repeats the other columns for every list element,
# and an empty list becomes a single row whose label is missing.
news_title_exploded = news.explode(column='TitleEntityLabels')
news_abstract_exploded = news.explode(column='AbstractEntityLabels')

In [24]:
# Drop the descriptive columns and the other field's entity columns from each
# exploded frame before saving.
news_title_exploded = news_title_exploded.drop(
    ['Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'AbstractEntities', 'AbstractEntityLabels'],
    axis='columns',
)
news_abstract_exploded = news_abstract_exploded.drop(
    ['Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'TitleEntities', 'TitleEntityLabels'],
    axis='columns',
)

# Save the three flattened tables as CSV files (the DataFrame index is written
# as the first, unnamed column).
news_title_exploded.to_csv(path_or_buf='data/news_title_labels.csv')
news_abstract_exploded.to_csv(path_or_buf='data/news_abstract_labels.csv')
behaviors.to_csv(path_or_buf='data/behaviors_flatten.csv')

In [25]:
# Article-level table: `news` without the raw entity JSON and the derived label lists
news_dim = news.drop(
    ['TitleEntities', 'AbstractEntities', 'AbstractEntityLabels', 'TitleEntityLabels'],
    axis='columns',
)
news_dim.to_csv(path_or_buf='data/news_dim.csv')

In [26]:
# Preview the first five rows of the title-label table
news_title_exploded.head(n=5)

Unnamed: 0,NewsID,TitleEntities,TitleEntityLabels
0,N55528,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...","Prince Philip, Duke of Edinburgh"
0,N55528,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...","Charles, Prince of Wales"
0,N55528,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",Elizabeth II
1,N19639,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...",Adipose tissue
2,N61837,[],


In [27]:
# Preview the first five rows of the abstract-label table
news_abstract_exploded.head(n=5)


Unnamed: 0,NewsID,AbstractEntities,AbstractEntityLabels
0,N55528,[],
1,N19639,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...",Adipose tissue
2,N61837,"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId...",Ukraine
3,N53526,"[{""Label"": ""National Basketball Association"", ...",National Basketball Association
4,N38324,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...",Skin tag


In [30]:
# Print the distinct values of the Date column, in order of first appearance
print(behaviors['Date'].unique())

[datetime.date(2019, 11, 11) datetime.date(2019, 11, 12)
 datetime.date(2019, 11, 14) datetime.date(2019, 11, 13)
 datetime.date(2019, 11, 10) datetime.date(2019, 11, 9)]


In [29]:
# Preview the first five rows of the article-level table
news_dim.head(n=5)

Unnamed: 0,NewsID,Category,SubCategory,Title,Abstract,URL
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html
