In [9]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm

In [5]:
MIND_train_path = Path('/mount/arbeitsdaten66/projekte/multiview/hardy/datasets/mind/MINDlarge_train')
columns_names = [
                "nid",
                "category",
                "subcategory",
                "title",
                "abstract",
                "url",
                "title_entities",
                "abstract_entities",
            ]
df_news = pd.read_table(
            filepath_or_buffer=MIND_train_path / "news.tsv",
            header=None,
            names=columns_names,
            usecols=range(len(columns_names)),
        )

In [6]:
df_news

Unnamed: 0,nid,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N88753,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N45436,news,newsscienceandtechnology,Walmart Slashes Prices on Last-Generation iPads,Apple's new iPad releases bring big deals on l...,https://assets.msn.com/labs/mind/AABmf2I.html,"[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ...","[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ..."
2,N23144,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
3,N86255,health,medical,Dispose of unwanted prescription drugs during ...,,https://assets.msn.com/labs/mind/AAISxPN.html,"[{""Label"": ""Drug Enforcement Administration"", ...",[]
4,N93187,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
...,...,...,...,...,...,...,...,...
101522,N115249,sports,more_sports,St. Dominic soccer player tries to kick cancer...,"Sometimes, what happens on the sidelines can b...",https://assets.msn.com/labs/mind/BBWzQnK.html,[],[]
101523,N64337,finance,finance-real-estate,Mansion Monday: Contemporary Des Moines home i...,Among the perks of this unique Des Moines home...,https://assets.msn.com/labs/mind/BBWzQq8.html,"[{""Label"": ""Des Moines, Iowa"", ""Type"": ""G"", ""W...","[{""Label"": ""Des Moines, Iowa"", ""Type"": ""G"", ""W..."
101524,N100102,sports,soccer_epl,How the Sounders won MLS Cup,"Mark, Jeremiah and Casey were so excited they ...",https://assets.msn.com/labs/mind/BBWzQuK.html,"[{""Label"": ""MLS Cup"", ""Type"": ""U"", ""WikidataId...",[]
101525,N74617,autos,autossports,Best Sports Car Deals for October,,https://assets.msn.com/labs/mind/BBy5rVe.html,"[{""Label"": ""Peugeot RCZ"", ""Type"": ""V"", ""Wikida...",[]


In [7]:
def load_and_fix_df(path):
    df_behaviors = pd.read_csv(MIND_train_path / "behaviors.tsv", header=None, sep='\t')
    df_behaviors.columns = ['impression_id', 'user_id', 'timestamp', 'history', 'impressions']
    df_behaviors['timestamp'] = pd.to_datetime(df_behaviors['timestamp'])
    return df_behaviors
df_behaviors = load_and_fix_df(MIND_train_path)

In [21]:
orig_df_behaviors = df_behaviors.copy()

In [36]:
df_behaviors['date'] = df_behaviors['timestamp'].dt.date
train_groups_by_day = df_behaviors.groupby('date')
article_groups_by_day = {}
df_behaviors['history'] = df_behaviors['history'].fillna('')
for id, group in tqdm(train_groups_by_day):
    impression_rows = group['impressions'].apply(lambda x: [i.split('-')[0] for i in x.split()]).to_list()
    reading_rows = group['history'].apply(lambda x: [i for i in x.split() if type(x) == str]).to_list()
    unique_articles = set()
    for impressions in impression_rows:
        unique_articles.update(set(impressions))
    for reading in reading_rows:
        unique_articles.update(set(reading))
    article_groups_by_day[id] = ' '.join(list(unique_articles))
article_groups_df = pd.DataFrame(
    [(date, articles) for date, articles in article_groups_by_day.items()],
    columns=['Date', 'Articles']
)


100%|██████████| 6/6 [00:41<00:00,  6.86s/it]


In [None]:
new_train_path = Path('/home/users1/hardy/hardy/datasets/mind_resplit/MINDlarge_train')
new_dev_path = Path('/home/users1/hardy/hardy/datasets/mind_resplit/MINDlarge_dev')

In [27]:
dates_sorted = sorted(df_behaviors['date'].unique())
train_dates = dates_sorted[:5]
dev_dates = dates_sorted[-1:]

train_df_behaviors = orig_df_behaviors[df_behaviors['date'].isin(train_dates)]
dev_df_behaviors = orig_df_behaviors[df_behaviors['date'].isin(dev_dates)]

In [29]:
print(len(df_behaviors), len(train_df_behaviors), len(dev_df_behaviors))

2232748 1801231 431517


In [31]:
train_df_behaviors.to_csv(new_train_path / "behaviors.tsv", sep='\t', header=False, index=False)
dev_df_behaviors.to_csv(new_dev_path / "behaviors.tsv", sep='\t', header=False, index=False)

Date                                               2019-11-14
Articles    N3726 N13499 N112710 N80463 N33500 N86153 N822...
Name: 5, dtype: object

In [40]:
dev_article_ids = set(article_groups_df.iloc[-1]['Articles'].split())
dev_set = df_news[df_news['nid'].isin(dev_article_ids)]
train_set = df_news[~df_news['nid'].isin(dev_article_ids)]

In [41]:
print(len(df_news), "total articles"
      "\n", len(dev_set), "dev articles",
      "\n", len(train_set), "train articles")

101527 total articles
 76167 dev articles 
 25360 train articles


In [42]:
train_set.to_csv(new_train_path / "news.tsv", sep='\t', header=False, index=False)
dev_set.to_csv(new_dev_path / "news.tsv", sep='\t', header=False, index=False)