In [1]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm

In [2]:
MIND_train_path = Path('/mount/arbeitsdaten66/projekte/multiview/hardy/datasets/mind/MINDlarge_train')
columns_names = [
                "nid",
                "category",
                "subcategory",
                "title",
                "abstract",
                "url",
                "title_entities",
                "abstract_entities",
            ]
df_news = pd.read_table(
            filepath_or_buffer=MIND_train_path / "news.tsv",
            header=None,
            names=columns_names,
            usecols=range(len(columns_names)),
        )

In [6]:
df_news

Unnamed: 0,nid,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N88753,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N45436,news,newsscienceandtechnology,Walmart Slashes Prices on Last-Generation iPads,Apple's new iPad releases bring big deals on l...,https://assets.msn.com/labs/mind/AABmf2I.html,"[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ...","[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ..."
2,N23144,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
3,N86255,health,medical,Dispose of unwanted prescription drugs during ...,,https://assets.msn.com/labs/mind/AAISxPN.html,"[{""Label"": ""Drug Enforcement Administration"", ...",[]
4,N93187,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
...,...,...,...,...,...,...,...,...
101522,N115249,sports,more_sports,St. Dominic soccer player tries to kick cancer...,"Sometimes, what happens on the sidelines can b...",https://assets.msn.com/labs/mind/BBWzQnK.html,[],[]
101523,N64337,finance,finance-real-estate,Mansion Monday: Contemporary Des Moines home i...,Among the perks of this unique Des Moines home...,https://assets.msn.com/labs/mind/BBWzQq8.html,"[{""Label"": ""Des Moines, Iowa"", ""Type"": ""G"", ""W...","[{""Label"": ""Des Moines, Iowa"", ""Type"": ""G"", ""W..."
101524,N100102,sports,soccer_epl,How the Sounders won MLS Cup,"Mark, Jeremiah and Casey were so excited they ...",https://assets.msn.com/labs/mind/BBWzQuK.html,"[{""Label"": ""MLS Cup"", ""Type"": ""U"", ""WikidataId...",[]
101525,N74617,autos,autossports,Best Sports Car Deals for October,,https://assets.msn.com/labs/mind/BBy5rVe.html,"[{""Label"": ""Peugeot RCZ"", ""Type"": ""V"", ""Wikida...",[]


In [3]:
def load_and_fix_df(path):
    df_behaviors = pd.read_csv(MIND_train_path / "behaviors.tsv", header=None, sep='\t')
    df_behaviors.columns = ['impression_id', 'user_id', 'timestamp', 'history', 'impressions']
    df_behaviors['timestamp'] = pd.to_datetime(df_behaviors['timestamp'])
    return df_behaviors
df_behaviors = load_and_fix_df(MIND_train_path)

In [4]:
orig_df_behaviors = df_behaviors.copy()

In [5]:
df_behaviors['date'] = df_behaviors['timestamp'].dt.date
train_groups_by_day = df_behaviors.groupby('date')
article_groups_by_day = {}
df_behaviors['history'] = df_behaviors['history'].fillna('')
for id, group in tqdm(train_groups_by_day):
    impression_rows = group['impressions'].apply(lambda x: [i.split('-')[0] for i in x.split()]).to_list()
    reading_rows = group['history'].apply(lambda x: [i for i in x.split() if type(x) == str]).to_list()
    unique_articles = set()
    for impressions in impression_rows:
        unique_articles.update(set(impressions))
    for reading in reading_rows:
        unique_articles.update(set(reading))
    article_groups_by_day[id] = ' '.join(list(unique_articles))
article_groups_df = pd.DataFrame(
    [(date, articles) for date, articles in article_groups_by_day.items()],
    columns=['Date', 'Articles']
)


100%|██████████| 6/6 [00:40<00:00,  6.75s/it]


In [7]:
new_train_path = Path('/home/users1/hardy/hardy/datasets/mind_resplit/MINDlarge_train')
new_dev_path = Path('/home/users1/hardy/hardy/datasets/mind_resplit/MINDlarge_dev')

In [8]:
orig_df_behaviors

Unnamed: 0,impression_id,user_id,timestamp,history,impressions
0,1,U87243,2019-11-10 11:30:54,N8668 N39081 N65259 N79529 N73408 N43615 N2937...,N78206-0 N26368-0 N7578-0 N58592-0 N19858-0 N5...
1,2,U598644,2019-11-12 13:45:29,N56056 N8726 N70353 N67998 N83823 N111108 N107...,N47996-0 N82719-0 N117066-0 N8491-0 N123784-0 ...
2,3,U532401,2019-11-13 11:23:03,N128643 N87446 N122948 N9375 N82348 N129412 N5...,N103852-0 N53474-0 N127836-0 N47925-1
3,4,U593596,2019-11-12 12:24:09,N31043 N39592 N4104 N8223 N114581 N92747 N1207...,N38902-0 N76434-0 N71593-0 N100073-0 N108736-0...
4,5,U239687,2019-11-14 20:03:01,N65250 N122359 N71723 N53796 N41663 N41484 N11...,N76209-0 N48841-0 N67937-0 N62235-0 N6307-0 N3...
...,...,...,...,...,...
2232743,2232744,U316192,2019-11-13 18:50:02,N122359 N37069 N95876 N28787 N73408 N11266 N61321,N113723-0 N123683-1 N5287-0 N76677-0 N53474-0
2232744,2232745,U451238,2019-11-12 08:54:06,N12575 N93816 N71643 N87236 N87236,N18861-0 N20990-0 N43085-0 N7937-1
2232745,2232746,U151246,2019-11-13 12:42:51,N27587 N49668,N39887-1 N22811-0 N110709-1 N1923-0 N24001-1 N...
2232746,2232747,U330725,2019-11-12 13:22:57,N121944 N91510 N42280 N60061 N63032 N125223 N4...,N18947-0 N88808-1 N10012-0 N38902-0 N33078-0 N...


In [10]:
dates_sorted = sorted(df_behaviors['date'].unique())
train_dates = dates_sorted[:5]
dev_dates = dates_sorted[-1:]

train_df_behaviors = orig_df_behaviors[df_behaviors['date'].isin(train_dates)]
dev_df_behaviors = orig_df_behaviors[df_behaviors['date'].isin(dev_dates)]

In [11]:
print(len(df_behaviors), len(train_df_behaviors), len(dev_df_behaviors))

2232748 1801231 431517


In [12]:
train_df_behaviors.to_csv(new_train_path / "behaviors.tsv", sep='\t', header=False, index=False)
dev_df_behaviors.to_csv(new_dev_path / "behaviors.tsv", sep='\t', header=False, index=False)

In [70]:
article_groups_df[:-1]['Articles']

0    N3726 N13499 N112710 N86153 N8221 N1478 N12091...
1    N3726 N5769 N13499 N16489 N112710 N86153 N1478...
2    N3726 N5769 N13499 N17600 N112710 N80463 N3350...
3    N3726 N5769 N13499 N112710 N80463 N33500 N8615...
4    N3726 N13499 N112710 N80463 N33500 N86153 N822...
Name: Articles, dtype: object

In [11]:
dev_article_ids = set(article_groups_df.iloc[-1]['Articles'].split())
train_article_ids = set(' '.join(article_groups_df[:-1]['Articles'].tolist()).split())
dev_set = df_news[df_news['nid'].isin(dev_article_ids)]
train_set = df_news[df_news['nid'].isin(train_article_ids)]

In [13]:
dev_set

Unnamed: 0,nid,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N88753,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N45436,news,newsscienceandtechnology,Walmart Slashes Prices on Last-Generation iPads,Apple's new iPad releases bring big deals on l...,https://assets.msn.com/labs/mind/AABmf2I.html,"[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ...","[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ..."
2,N23144,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
3,N86255,health,medical,Dispose of unwanted prescription drugs during ...,,https://assets.msn.com/labs/mind/AAISxPN.html,"[{""Label"": ""Drug Enforcement Administration"", ...",[]
4,N93187,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
...,...,...,...,...,...,...,...,...
101318,N124635,movies,movienews,Marvel's Kevin Feige Breaks Silence on Scorses...,In his first public comments about the debate ...,https://assets.msn.com/labs/mind/BBWywC9.html,"[{""Label"": ""Martin Scorsese"", ""Type"": ""P"", ""Wi...","[{""Label"": ""Marvel Cinematic Universe"", ""Type""..."
101360,N107637,sports,football_nfl,Russell Wilson vs. 49ers' No. 1 defense: Monda...,What Russell Wilson does to a defense can't be...,https://assets.msn.com/labs/mind/BBWyx9f.html,"[{""Label"": ""Russell Wilson"", ""Type"": ""P"", ""Wik...","[{""Label"": ""Russell Wilson"", ""Type"": ""P"", ""Wik..."
101426,N99177,lifestyle,lifestyleroyals,Why Kate & Meghan Were on Different Balconies ...,There's no scandal here. It's all about the or...,https://assets.msn.com/labs/mind/BBWyynu.html,"[{""Label"": ""Meghan, Duchess of Sussex"", ""Type""...",[]
101446,N57903,news,newsgoodnews,Tennessee judge holds lawyer's baby as he swea...,Tennessee Court of Appeals Judge Richard Dinki...,https://assets.msn.com/labs/mind/BBWyzI8.html,"[{""Label"": ""Tennessee"", ""Type"": ""G"", ""Wikidata...","[{""Label"": ""Tennessee Court of Appeals"", ""Type..."


In [16]:
print(len(df_news), "total articles"
      "\n", len(dev_set), "dev articles",
      "\n", len(train_set), "train articles")

101527 total articles
 76167 dev articles 
 97165 train articles


In [14]:
train_set.to_csv(new_train_path / "news.tsv", sep='\t', header=False, index=False)
dev_set.to_csv(new_dev_path / "news.tsv", sep='\t', header=False, index=False)

In [19]:
def load_and_split(path, save_path, article_ids):
    df = pd.read_csv(path, sep='\t', header=None)
    df = df[df[0].isin(article_ids)]
    df.to_csv(save_path, sep='\t', header=False, index=False)

train_path = Path('/home/users1/hardy/hardy/datasets/mind_resplit/MINDlarge_MFC_train/news.tsv')
dev_path = Path('/home/users1/hardy/hardy/datasets/mind_resplit/MINDlarge_MFC_dev/news.tsv')
load_and_split(train_path, dev_path, dev_article_ids)
load_and_split(train_path, train_path, train_article_ids)

train_path = Path('/home/users1/hardy/hardy/datasets/mind_resplit/MINDlarge_sentiment_train/news.tsv')
dev_path = Path('/home/users1/hardy/hardy/datasets/mind_resplit/MINDlarge_sentiment_dev/news.tsv')
load_and_split(train_path, dev_path, dev_article_ids)
load_and_split(train_path, train_path, train_article_ids)



In [20]:
train_path = Path('/home/users1/hardy/hardy/datasets/mind_resplit/MINDlarge_political_train/news.tsv')
dev_path = Path('/home/users1/hardy/hardy/datasets/mind_resplit/MINDlarge_political_dev/news.tsv')
load_and_split(train_path, dev_path, dev_article_ids)
load_and_split(train_path, train_path, train_article_ids)