In [1]:
from pathlib import Path
import pandas as pd
from tqdm.notebook import trange, tqdm

In [7]:
train_news_data = None
dev_news_data = None
MIND_dev_path = Path('/mount/arbeitsdaten66/projekte/multiview/hardy/datasets/mind/MINDlarge_dev')
MIND_train_path = Path('/mount/arbeitsdaten66/projekte/multiview/hardy/datasets/mind/MINDlarge_train')
MIND_test_path = Path('/mount/arbeitsdaten66/projekte/multiview/hardy/datasets/mind/MINDlarge_test')

In [None]:
def append_history(df_behaviors):
    """
    MIND dataset has a bug where rows with the same user_id has similar history. This function will fix
    it by appending the user history with the clicked articles of previous impression for each user. 
    """
    df_behaviors['timestamp'] = pd.to_datetime(df_behaviors['timestamp'])
    df_behaviors['history'] = df_behaviors['history'].apply(lambda x: x.split() if type(x) == str else [])
    cal_history = {}
    for user_id, group in tqdm(df_behaviors.sort_values(by=['user_id', 'timestamp']).groupby('user_id')):
        cum_history = []
        for i, (index, row) in enumerate(group.iterrows()):
            if i != 0:
                row['history'].extend(cum_history)
                impression = [i.split('-')[0] for i in row['impressions'].split() if i.endswith('-1')]
                cum_history.extend(impression)
            else:
                cum_history = [i.split('-')[0] for i in row['impressions'].split() if i.endswith('-1')]
            cal_history[index] = row['history']
    history_series = pd.Series(cal_history)
    df_behaviors['history'] = history_series
    df_behaviors['history'] = df_behaviors['history'].apply(lambda x: ' '.join(x))
    last_user_rows = df_behaviors.sort_values(by='timestamp').groupby('user_id').tail(1)
    return last_user_rows

def load_and_fix_df(path):
    df_behaviors = pd.read_csv(path / "behaviors.tsv", header=None, sep='\t')
    df_behaviors.columns = ['impression_id', 'user_id', 'timestamp', 'history', 'impressions']
    df_behaviors['timestamp'] = pd.to_datetime(df_behaviors['timestamp'])
    df_behaviors = append_history(df_behaviors)
    return df_behaviors



In [23]:
df_train = load_and_fix_df(MIND_train_path)

  0%|          | 0/711222 [00:00<?, ?it/s]

In [24]:
df_dev = load_and_fix_df(MIND_dev_path)

  0%|          | 0/255990 [00:00<?, ?it/s]

In [25]:
df_test = load_and_fix_df(MIND_test_path)

  0%|          | 0/702005 [00:00<?, ?it/s]

In [26]:
print("Train set:")
print("Earliest timestamp:", df_train['timestamp'].min())
print("Latest timestamp:", df_train['timestamp'].max())

print("\nDev set:")
print("Earliest timestamp:", df_dev['timestamp'].min())
print("Latest timestamp:", df_dev['timestamp'].max())

print("\nTest set:")
print("Earliest timestamp:", df_test['timestamp'].min())
print("Latest timestamp:", df_test['timestamp'].max())

Train set:
Earliest timestamp: 2019-11-09 00:00:00
Latest timestamp: 2019-11-14 23:59:59

Dev set:
Earliest timestamp: 2019-11-15 00:00:00
Latest timestamp: 2019-11-15 23:59:43

Test set:
Earliest timestamp: 2019-11-16 00:00:05
Latest timestamp: 2019-11-22 23:59:58


In [52]:
def split_and_save(df, path):
    # Split df into groups by date (each group is a single day)
    df['date'] = df['timestamp'].dt.date
    train_groups_by_day = df.groupby('date')
    article_groups_by_day = {}
    for id, group in tqdm(train_groups_by_day):
        impression_rows = group['impressions'].apply(lambda x: [i.split('-')[0] for i in x.split()]).to_list()
        unique_articles = set()
        for impressions in impression_rows:
            unique_articles.update(set(impressions))
        article_groups_by_day[id] = ' '.join(list(unique_articles))
    article_groups_df = pd.DataFrame(
        [(date, articles) for date, articles in article_groups_by_day.items()],
        columns=['Date', 'Articles']
    )
    article_groups_df.to_csv(path / 'grouped_behaviors.tsv', sep='\t', header=False)

In [53]:
split_and_save(df_train, MIND_train_path)
split_and_save(df_test, MIND_test_path)
split_and_save(df_dev, MIND_dev_path)

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,Date,Articles
0,2019-11-09,N2102 N79529 N87803 N81940 N72466 N7864 N12325...
1,2019-11-10,N2102 N125329 N117066 N87803 N81940 N81425 N12...
2,2019-11-11,N78324 N117066 N87803 N81940 N90696 N39692 N88...
3,2019-11-12,N110939 N11975 N125329 N117066 N79529 N81425 N...
4,2019-11-13,N69396 N119317 N117066 N63935 N81940 N124008 N...
5,2019-11-14,N69396 N125329 N72308 N117066 N119317 N81940 N...
