# Load and transform MIND dataset for cold-start scenario

Based on `004-create-mind-cold-start-basic-train-test-dataset.ipynb`.

## Install deps, download and unzip original dataset

In [1]:
# ! apt update && apt install unzip

# ! mkdir -p /tmp/mind/train
# ! mkdir -p /tmp/mind/test

# ! wget https://mind201910small.blob.core.windows.net/release/MINDsmall_train.zip -P /tmp/mind/
# ! wget https://mind201910small.blob.core.windows.net/release/MINDsmall_dev.zip -P /tmp/mind/

# ! unzip /tmp/mind/MINDsmall_train.zip -d /tmp/mind/train/
# ! unzip /tmp/mind/MINDsmall_dev.zip -d /tmp/mind/test/

# ! rm /tmp/mind/MINDsmall_train.zip
# ! rm /tmp/mind/MINDsmall_dev.zip

## Imports and file/path definitions

In [2]:
import pandas as pd
import numpy as np
import os

In [3]:
# News category that the transformed datasets are restricted to.
COLD_START_CATEGORY = "news"

# Directories that hold the original `behaviors.tsv` / `news.tsv` files.
ORIGINAL_TRAIN_INPUT_DIR = "/tmp/mind/train/"
ORIGINAL_TEST_INPUT_DIR = "/tmp/mind/test/"

# Directory that receives every file written by this notebook.
OUTPUT_DIR = "./data/mind_cold_start_datasets_basic/"

# Output files, declared as (train, test) pairs.
COLD_START_BEHAVIORS_TRAIN, COLD_START_BEHAVIORS_TEST = (
    os.path.join(OUTPUT_DIR, filename)
    for filename in ("behaviors_train.tsv", "behaviors_test.tsv")
)
NEWS_CATALOGUE_TRAIN, NEWS_CATALOGUE_TEST = (
    os.path.join(OUTPUT_DIR, filename)
    for filename in ("news_catalogue_train.tsv", "news_catalogue_test.tsv")
)
AUXILIARY_DATA_CATALOGUE_TRAIN, AUXILIARY_DATA_CATALOGUE_TEST = (
    os.path.join(OUTPUT_DIR, filename)
    for filename in ("auxiliary_data_catalogue_train.tsv", "auxiliary_data_catalogue_test.tsv")
)

In [None]:
# Create the output directory (and any missing parents) from Python rather than
# through a shell escape: this works on every platform, copes with paths that
# contain spaces, and is a no-op when the directory already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

## Load and inspect original data

In [4]:
# Load the original train behaviors; column names are supplied explicitly.
behaviors_train_path = os.path.join(ORIGINAL_TRAIN_INPUT_DIR, "behaviors.tsv")
behaviors_columns = ["slateid", "userid", "time", "history", "impressions"]
behaviors_train = pd.read_csv(behaviors_train_path, sep="\t", names=behaviors_columns)

behaviors_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156965 entries, 0 to 156964
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   slateid      156965 non-null  int64 
 1   userid       156965 non-null  object
 2   time         156965 non-null  object
 3   history      153727 non-null  object
 4   impressions  156965 non-null  object
dtypes: int64(1), object(4)
memory usage: 6.0+ MB


In [5]:
# Load the original test behaviors; column names are supplied explicitly.
behaviors_test_path = os.path.join(ORIGINAL_TEST_INPUT_DIR, "behaviors.tsv")
behaviors_columns = ["slateid", "userid", "time", "history", "impressions"]
behaviors_test = pd.read_csv(behaviors_test_path, sep="\t", names=behaviors_columns)

behaviors_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73152 entries, 0 to 73151
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   slateid      73152 non-null  int64 
 1   userid       73152 non-null  object
 2   time         73152 non-null  object
 3   history      70938 non-null  object
 4   impressions  73152 non-null  object
dtypes: int64(1), object(4)
memory usage: 2.8+ MB


In [6]:
# Load the original train news catalogue; column names are supplied explicitly.
news_train_path = os.path.join(ORIGINAL_TRAIN_INPUT_DIR, "news.tsv")
news_columns = [
    "newsid",
    "category",
    "subcategory",
    "title",
    "abstract",
    "url",
    "title_entities",
    "abstract_entities",
]
news_train = pd.read_csv(news_train_path, sep="\t", names=news_columns)

news_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51282 entries, 0 to 51281
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   newsid             51282 non-null  object
 1   category           51282 non-null  object
 2   subcategory        51282 non-null  object
 3   title              51282 non-null  object
 4   abstract           48616 non-null  object
 5   url                51282 non-null  object
 6   title_entities     51279 non-null  object
 7   abstract_entities  51278 non-null  object
dtypes: object(8)
memory usage: 3.1+ MB


In [7]:
# Load the original test news catalogue; column names are supplied explicitly.
news_test_path = os.path.join(ORIGINAL_TEST_INPUT_DIR, "news.tsv")
news_columns = [
    "newsid",
    "category",
    "subcategory",
    "title",
    "abstract",
    "url",
    "title_entities",
    "abstract_entities",
]
news_test = pd.read_csv(news_test_path, sep="\t", names=news_columns)

news_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42416 entries, 0 to 42415
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   newsid             42416 non-null  object
 1   category           42416 non-null  object
 2   subcategory        42416 non-null  object
 3   title              42416 non-null  object
 4   abstract           40395 non-null  object
 5   url                42416 non-null  object
 6   title_entities     42414 non-null  object
 7   abstract_entities  42414 non-null  object
dtypes: object(8)
memory usage: 2.6+ MB


In [8]:
behaviors_train.head()

Unnamed: 0,slateid,userid,time,history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


In [9]:
news_train.head()

Unnamed: 0,newsid,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [10]:
news_train.category.unique()

array(['lifestyle', 'health', 'news', 'sports', 'weather',
       'entertainment', 'autos', 'travel', 'foodanddrink', 'tv',
       'finance', 'movies', 'video', 'music', 'kids', 'middleeast',
       'northamerica'], dtype=object)

## Transform datasets to cold-start

In [11]:
def _explode_slate_tokens(behaviors, column):
    """Return one row per whitespace-separated token of ``behaviors[column]``.

    The result has the columns ``slateid`` and ``column`` plus a fresh
    ``RangeIndex``, and keeps the tokens in the order they appear in the input.
    Missing or empty cells contribute no rows.
    """
    return (behaviors[["slateid", column]]
            .assign(**{column: lambda x: x[column].fillna("").str.split()})
            .explode(column)
            .dropna(subset=[column])  # an empty token list explodes to a single NaN row
            .reset_index(drop=True))


def _join_tokens_per_slate(tokens, renames):
    """Collapse token rows back into one space-separated string per slate.

    ``renames`` maps every token column to aggregate onto the name it should
    carry in the result. Row order inside each slate is preserved.
    """
    source_columns = list(renames)
    return (tokens
            .groupby("slateid", as_index=False)
            .agg({col: list for col in source_columns})
            .assign(**{col: (lambda x, col=col: x[col].str.join(" ")) for col in source_columns})
            .rename(columns=renames))


def transform_behaviors_to_coldstart(behaviors, news, cold_start_category=None):
    """Restrict behaviors to one news category and attach category side-data.

    Parameters
    ----------
    behaviors : pandas.DataFrame
        Needs the columns ``slateid``, ``history`` and ``impressions``. Both
        ``history`` and ``impressions`` are space-separated token strings (or
        missing); for impressions the text before the first ``-`` of a token is
        used as the news id.
    news : pandas.DataFrame
        Needs the columns ``newsid``, ``category`` and ``subcategory``.
    cold_start_category : str, optional
        Category to keep. Defaults to the notebook-level ``COLD_START_CATEGORY``.

    Returns
    -------
    pandas.DataFrame
        ``behaviors`` with ``history``/``impressions`` renamed to
        ``history_all``/``impressions_all`` and four added columns:
        ``history_news`` and ``impressions_news`` (tokens whose news id belongs
        to the selected category; the column names stay ``*_news`` whatever
        category is selected) and ``history_all_categories`` /
        ``history_all_subcategories`` (category and subcategory of every
        history item found in ``news``).

    Notes
    -----
    All joins are inner joins, so a slate is dropped from the result when it
    has no history item in the selected category, no impression in the selected
    category, or no history item present in ``news`` at all.
    """
    if cold_start_category is None:
        cold_start_category = COLD_START_CATEGORY

    cold_start_ids = set(news.loc[news["category"] == cold_start_category, "newsid"])

    history_tokens = _explode_slate_tokens(behaviors, "history")
    impression_tokens = _explode_slate_tokens(behaviors, "impressions")

    # filter histories to the selected category; `isin` keeps the row order, so
    # no merge (and no order-restoring trick) is needed for a membership test
    behaviors_histories_news_only = _join_tokens_per_slate(
        history_tokens[history_tokens["history"].isin(cold_start_ids)],
        {"history": "history_news"},
    )

    # filter impressions to the selected category (match on the id part of the token)
    impression_ids = impression_tokens["impressions"].str.split("-").str[0]
    behaviors_impressions_news_only = _join_tokens_per_slate(
        impression_tokens[impression_ids.isin(cold_start_ids)],
        {"impressions": "impressions_news"},
    )

    # generate categories and subcategories side-data from user histories;
    # de-duplicate the lookup so a repeated `newsid` cannot multiply history tokens
    news_lookup = news[["newsid", "category", "subcategory"]].drop_duplicates(subset="newsid")
    history_with_categories = (history_tokens
                               .rename_axis("token_order").reset_index()  # remember original ordering
                               .merge(news_lookup, left_on="history", right_on="newsid", how="inner", sort=False)
                               .sort_values("token_order"))  # restore original ordering
    behaviors_categories = _join_tokens_per_slate(
        history_with_categories,
        {"category": "history_all_categories", "subcategory": "history_all_subcategories"},
    )

    # join all data together
    return (behaviors
            .rename(columns={"history": "history_all", "impressions": "impressions_all"})
            .merge(behaviors_histories_news_only, on="slateid", how="inner")
            .merge(behaviors_impressions_news_only, on="slateid", how="inner")
            .merge(behaviors_categories, on="slateid", how="inner"))

### Transform train dataset

In [12]:
behaviors_train_ex = transform_behaviors_to_coldstart(behaviors_train, news_train)

In [13]:
behaviors_train_ex.head(5)

Unnamed: 0,slateid,userid,time,history_all,impressions_all,history_news,impressions_news,history_all_categories,history_all_subcategories
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,N45794 N19347 N31801,N35729-0,tv sports tv news sports lifestyle movies news...,tvnews baseball_mlb tvnews newscrime football_...
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,N31739 N6072 N63045 N43353 N8129 N1569 N17686 ...,N39317-0 N20495-0 N42977-0,news news news finance travel news news news n...,newscrime newsus newscrime markets travelnews ...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,N7563 N24233,N23877-0 N49712-0 N64174-0 N46821-0 N48017-0 N...,lifestyle lifestyle news sports tv weather spo...,lifestylebuzz lifestylehomeandgarden newsus fo...
3,6,U19739,11/11/2019 6:52:13 PM,N39074 N14343 N32607 N32320 N22007 N442 N19001...,N21119-1 N53696-0 N33619-1 N25722-0 N2869-0,N14343 N32607 N32320 N22007 N24294 N8275 N6695...,N21119-1 N53696-0 N33619-1 N25722-0 N2869-0,sports news news news news sports lifestyle ne...,more_sports newsworld newsus newsgoodnews news...
4,7,U8355,11/11/2019 12:22:09 PM,N8419 N15771 N1431 N5888 N18663 N24123 N22130 ...,N51346-0 N33848-0 N15132-0 N10688-0 N6342-0 N6...,N8419 N1431 N18663 N32095 N55310 N31931 N23887...,N15132-0 N10688-0 N7809-0 N64397-0 N15830-0 N9...,news autos news sports news sports music movie...,newscrime autosenthusiasts newscrime baseball_...


In [14]:
# Write the train behaviors; the category-filtered columns take over the plain
# `history` / `impressions` names in the exported file.
train_export_columns = [
    "slateid",
    "userid",
    "time",
    "history_news",
    "impressions_news",
    "history_all_categories",
    "history_all_subcategories",
    "history_all",
]
(
    behaviors_train_ex[train_export_columns]
    .rename(columns={"history_news": "history", "impressions_news": "impressions"})
    .to_csv(COLD_START_BEHAVIORS_TRAIN, sep="\t", index=False)
)

### Transform test dataset

In [15]:
behaviors_test_ex = transform_behaviors_to_coldstart(behaviors_test, news_test)

In [16]:
behaviors_test_ex.head(5)

Unnamed: 0,slateid,userid,time,history_all,impressions_all,history_news,impressions_news,history_all_categories,history_all_subcategories
0,1,U80234,11/15/2019 12:37:50 PM,N55189 N46039 N51741 N53234 N11276 N264 N40716...,N28682-0 N48740-0 N31958-1 N34130-0 N6916-0 N5...,N46039 N53234 N6616 N63573 N38895,N50775-0 N24802-0 N33176-0,tv news tv news finance autos tv movies entert...,tvnews newsus tv-celebrity newsus finance-comp...
1,2,U60458,11/15/2019 7:11:50 AM,N58715 N32109 N51180 N33438 N54827 N28488 N611...,N20036-0 N23513-1 N32536-0 N46976-0 N35216-0 N...,N58715 N33438 N54827 N34775,N36779-0,news travel finance news news finance music ne...,newsus travelnews finance-companies newsscienc...
2,3,U44190,11/15/2019 9:55:12 AM,N56253 N1150 N55189 N16233 N61704 N51706 N5303...,N36779-0 N62365-0 N58098-0 N5472-0 N13408-0 N5...,N1150 N16233 N53033,N36779-0 N55036-0 N50775-0 N16680-0,sports news tv news lifestyle sports news ente...,football_nfl newscrime tvnews newsus shop-book...
3,4,U87380,11/15/2019 3:12:46 PM,N63554 N49153 N28678 N23232 N43369 N58518 N444...,N6950-0 N60215-0 N6074-0 N11930-0 N6916-0 N248...,N49153 N58518 N7649 N45794 N53033 N29361 N28247,N24802-0 N45057-0 N21941-0 N29091-0 N52492-0 N...,travel news sports sports travel news tv news ...,traveltripideas newsus baseball_mlb football_n...
4,6,U69606,11/15/2019 1:24:44 PM,N879 N19591 N63054 N53033 N54088 N34140 N14952...,N29862-0 N48740-0 N11390-0 N5472-0 N53572-0 N2...,N53033 N54088,N11390-0 N24802-1 N29091-0,sports finance sports news news finance foodan...,basketball_nba finance-companies football_nfl ...


In [17]:
# Write the test behaviors; the category-filtered columns take over the plain
# `history` / `impressions` names in the exported file.
test_export_columns = [
    "slateid",
    "userid",
    "time",
    "history_news",
    "impressions_news",
    "history_all_categories",
    "history_all_subcategories",
    "history_all",
]
(
    behaviors_test_ex[test_export_columns]
    .rename(columns={"history_news": "history", "impressions_news": "impressions"})
    .to_csv(COLD_START_BEHAVIORS_TEST, sep="\t", index=False)
)

## Split `news` data into news-only (main catalogue) and non-news (auxiliary catalogue)

In [18]:
# Main catalogue (train): articles whose category equals the cold-start category.
news_train_main = news_train[news_train["category"] == COLD_START_CATEGORY]
news_train_main.to_csv(NEWS_CATALOGUE_TRAIN, sep="\t", index=False)

In [19]:
# Auxiliary catalogue (train): articles from every other category.
news_train_auxiliary = news_train[news_train["category"] != COLD_START_CATEGORY]
news_train_auxiliary.to_csv(AUXILIARY_DATA_CATALOGUE_TRAIN, sep="\t", index=False)

In [20]:
# Main catalogue (test): articles whose category equals the cold-start category.
news_test_main = news_test[news_test["category"] == COLD_START_CATEGORY]
news_test_main.to_csv(NEWS_CATALOGUE_TEST, sep="\t", index=False)

In [21]:
# Auxiliary catalogue (test): articles from every other category.
news_test_auxiliary = news_test[news_test["category"] != COLD_START_CATEGORY]
news_test_auxiliary.to_csv(AUXILIARY_DATA_CATALOGUE_TEST, sep="\t", index=False)

## Exploratory cold-start dataset analysis

## Pre-process data and train baseline recommender(s)

### Show that baseline performs poorly for cold-start users

## Train SUM recommender

## Evaluate and compare recommenders