In [None]:
import pandas as pd
import re
from pathlib import Path
import torch
from typing import List
from tqdm.notebook import trange, tqdm
from modules.res_vqvae import RVQVAE

In [2]:
train_news_data = None
dev_news_data = None
MIND_dev_path = Path('/mount/arbeitsdaten66/projekte/multiview/hardy/datasets/mind/MINDsmall_dev')
MIND_train_path = Path('/mount/arbeitsdaten66/projekte/multiview/hardy/datasets/mind/MINDlarge_train')

In [8]:
def word_tokenize(sentence: str) -> List[str]:
        """Splits a sentence into word list using regex.

        Args:
            sentence:
                Input sentence

        Returns:
            List of words.
        """
        pat = re.compile(r"[\w]+|[.,!?;|]")
        if isinstance(sentence, str):
            return pat.findall(sentence.lower())
        else:
            return []
    
def load_news_data(path: Path, split: str):
    columns_names = [
            "nid",
            "category",
            "subcategory",
            "title",
            "abstract",
            "url",
            "title_entities",
            "abstract_entities",
        ]
    news = pd.read_table(
            filepath_or_buffer=path / "news.tsv",
            header=None,
            names=columns_names,
            usecols=range(len(columns_names)),
        )
    news = news.drop(columns=["url"])
    news["abstract"] = news["abstract"].fillna("")
    news["title_entities"] = news["title_entities"].fillna("[]")
    news["abstract_entities"] = news["abstract_entities"].fillna("[]")
    # news = news.set_index("nid", drop=True)
    if split == "train":
        news_category = news["category"].drop_duplicates().reset_index(drop=True)
        categ2index = {v: k + 1 for k, v in news_category.to_dict().items()}
        df = pd.DataFrame(categ2index.items(), columns=["word", "index"])
        df.to_csv(path.parent / 'categ2index.tsv', index=False, sep="\t")
        news["category_class"] = news["category"].apply(
            lambda category: categ2index.get(category, 0)
        )
        train_news_data = news
    elif split == "dev":
        fpath = path.parent / "categ2index.tsv"
        categ2index = pd.read_table(fpath, sep="\t").set_index("word")["index"].to_dict()
        news["category_class"] = news["category"].apply(
            lambda category: categ2index.get(category, 0)
        )
        dev_news_data = news

In [39]:
df_behaviors = pd.read_csv(MIND_dev_path / "behaviors.tsv", header=None, sep='\t')
df_behaviors.columns = ['impression_id', 'user_id', 'timestamp', 'history', 'impressions']
df_behaviors['timestamp'] = pd.to_datetime(df_behaviors['timestamp'])

In [None]:
df_behaviors['history-1'] = df_behaviors['history'].apply(lambda x: [i+'-1' for i in x.split()] if type(x) == str else [])
cal_history = {}
for user_id, group in tqdm(df_behaviors.sort_values(by=['user_id', 'timestamp']).groupby('user_id')):
    cum_history = []
    for i, (index, row) in enumerate(group.iterrows()):
        if i != 0:
            row['history-1'].extend(cum_history)
            cum_history.extend(row['impressions'].split())
        else:
            cum_history = row['impressions'].split()
        cal_history[index] = row['history-1']
history_series = pd.Series(cal_history)
df_behaviors['history-1'] = history_series
sorted_df_behaviors = df_behaviors.sort_values(by=['user_id', 'timestamp'])

  0%|          | 0/50000 [00:00<?, ?it/s]

In [None]:
df_behaviors[['impression_id', 'user_id', 'history-1']].to_csv('extracted_data.tsv', sep='\t', index=False)

Unnamed: 0,impression_id,user_id,timestamp,history,impressions,history-1
0,1,U80234,2019-11-15 12:37:50,N55189 N46039 N51741 N53234 N11276 N264 N40716...,N28682-0 N48740-0 N31958-1 N34130-0 N6916-0 N5...,"[N55189-1, N46039-1, N51741-1, N53234-1, N1127..."
1,2,U60458,2019-11-15 07:11:50,N58715 N32109 N51180 N33438 N54827 N28488 N611...,N20036-0 N23513-1 N32536-0 N46976-0 N35216-0 N...,"[N58715-1, N32109-1, N51180-1, N33438-1, N5482..."
2,3,U44190,2019-11-15 09:55:12,N56253 N1150 N55189 N16233 N61704 N51706 N5303...,N36779-0 N62365-0 N58098-0 N5472-0 N13408-0 N5...,"[N56253-1, N1150-1, N55189-1, N16233-1, N61704..."
3,4,U87380,2019-11-15 15:12:46,N63554 N49153 N28678 N23232 N43369 N58518 N444...,N6950-0 N60215-0 N6074-0 N11930-0 N6916-0 N248...,"[N63554-1, N49153-1, N28678-1, N23232-1, N4336..."
4,5,U9444,2019-11-15 08:25:46,N51692 N18285 N26015 N22679 N55556,N5940-1 N23513-0 N49285-0 N23355-0 N19990-0 N3...,"[N51692-1, N18285-1, N26015-1, N22679-1, N5555..."
...,...,...,...,...,...,...
73147,73148,U77536,2019-11-15 20:40:16,N28691 N8845 N58434 N37120 N22185 N60033 N4702...,N496-0 N35159-0 N59856-0 N13270-0 N47213-0 N26...,"[N28691-1, N8845-1, N58434-1, N37120-1, N22185..."
73148,73149,U56193,2019-11-15 13:11:26,N4705 N58782 N53531 N46492 N26026 N28088 N3109...,N49285-0 N31958-0 N55237-0 N42844-0 N29862-0 N...,"[N4705-1, N58782-1, N53531-1, N46492-1, N26026..."
73149,73150,U16799,2019-11-15 15:37:06,N40826 N42078 N15670 N15295 N64536 N46845 N52294,N7043-0 N512-0 N60215-1 N45057-0 N496-0 N37055...,"[N40826-1, N42078-1, N15670-1, N15295-1, N6453..."
73150,73151,U8786,2019-11-15 08:29:26,N3046 N356 N20483 N46107 N44598 N18693 N8254 N...,N23692-0 N19990-0 N20187-0 N5940-0 N13408-0 N3...,"[N3046-1, N356-1, N20483-1, N46107-1, N44598-1..."


In [2]:
def convert_history_to_indices(model, df_histories, aspect_dict, batch_size, device):
    for i, row in tqdm(df_histories.iterrows(), total=len(df_histories)):
        all_article_vectors = []
        all_is_clicked = []
        for article in row['history-1'].split():
            article_id, is_clicked = article.split('-')
            article_id = article_id[1:]
            article_vector = aspect_dict[article_id]
            all_article_vectors.append(article_vector)
            all_is_clicked.append(int(is_clicked))
        indices_with_clicked = []
        for i in range(0, len(all_article_vectors), batch_size):
            batch = all_article_vectors[i:i + batch_size]
            batch_clicked = all_is_clicked[i:i + batch_size]
            batch_tensor = torch.tensor(batch, dtype=torch.float32, device=device)
            with torch.no_grad():
                _, _, batch_indices = model.rvq_layer(model.encoder(batch_tensor))
                batch_indices = batch_indices[-1].squeeze(1).tolist()  # Get the last layer indices and convert to list
            # Append indices with corresponding clicked information
            indices_with_clicked.extend(list(zip(batch_indices, batch_clicked)))
        df_histories.loc[i, 'history_indices'] = str(indices_with_clicked)
    return df_histories

In [3]:
def extract_raw_history(file_path):
    df_behaviors = pd.read_csv(Path(file_path) / "behaviors.tsv", header=None, sep='\t')
    df_behaviors.columns = ['impression_id', 'user_id', 'timestamp', 'history', 'impressions']
    df_behaviors['timestamp'] = pd.to_datetime(df_behaviors['timestamp'])
    df_behaviors['history-1'] = df_behaviors['history'].apply(lambda x: [i+'-1' for i in x.split()] if type(x) == str else [])
    cal_history = {}
    for user_id, group in tqdm(df_behaviors.sort_values(by=['user_id', 'timestamp']).groupby('user_id')):
        cum_history = []
        for i, (index, row) in enumerate(group.iterrows()):
            if i != 0:
                row['history-1'].extend(cum_history)
                cum_history.extend(row['impressions'].split())
            else:
                cum_history = row['impressions'].split()
            cal_history[index] = row['history-1']
    history_series = pd.Series(cal_history)
    df_behaviors['history-1'] = history_series
    df_behaviors['history-1'] = df_behaviors['history-1'].apply(lambda x: ' '.join(x))
    return df_behaviors[['impression_id', 'user_id', 'history-1']]

def load_dictionary(file_path):
    indices_dict = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            key = parts[0]
            indices = [float(x)for x in parts[1:]]
            indices_dict[key] = indices
    return indices_dict


In [4]:
df_histories = extract_raw_history('/home/users1/hardy/hardy/datasets/mind/MINDsmall_dev')
aspect_dict = load_dictionary('/home/users1/hardy/hardy/project/vae/outputs/mind/dev_small_mind_category_aspect_vectors.txt')

  0%|          | 0/50000 [00:00<?, ?it/s]

In [46]:
device = 'cuda'
model = RVQVAE.load_from_checkpoint('/home/users1/hardy/hardy/project/vae/checkpoints/rvqvae_aspect_ema_reset_single-epoch=35-val_loss=0.1306.ckpt', map_location=device)
model.eval()
batch_size = 5

for idx, row in tqdm(df_histories.iterrows(), total=len(df_histories)):
    all_article_vectors = []
    all_is_clicked = []
    for article in row['history-1'].split():
        article_id, is_clicked = article.split('-')
        article_id = article_id[1:]
        article_vector = aspect_dict[article_id]
        all_article_vectors.append(article_vector)
        all_is_clicked.append(int(is_clicked))
    indices_with_clicked = []
    for i in range(0, len(all_article_vectors), batch_size):
        batch = all_article_vectors[i:i + batch_size]
        batch_clicked = all_is_clicked[i:i + batch_size]
        batch_tensor = torch.tensor(batch, dtype=torch.float32, device=device)
        with torch.no_grad():
            _, _, batch_indices = model.rvq_layer(model.encoder(batch_tensor))
            batch_indices = batch_indices[-1].squeeze(1).tolist()  # Get the last layer indices and convert to list
        # Append indices with corresponding clicked information
        indices_with_clicked.extend([f'{bi}-{bc}' for bi, bc in zip(batch_indices, batch_clicked)])
    df_histories.loc[idx, 'history_indices'] = ' '.join(indices_with_clicked)

  0%|          | 0/73152 [00:00<?, ?it/s]

In [53]:
(df_histories.apply(lambda x: len(x['history-1'].split()), axis=1) == df_histories.apply(lambda x: len(x['history_indices'].split()), axis=1)).sum() == len(df_histories)

np.True_

In [49]:
df_histories

Unnamed: 0,impression_id,user_id,history-1,history_indices
0,1,U80234,N55189-1 N46039-1 N51741-1 N53234-1 N11276-1 N...,158-1 123-1 191-1 54-1 154-1 54-1 158-1 0-1 31...
1,2,U60458,N58715-1 N32109-1 N51180-1 N33438-1 N54827-1 N...,2-1 13-1 3-1 5-1 2-1 13-1 0-1 3-1 2-1 7-1 1-1 ...
2,3,U44190,N56253-1 N1150-1 N55189-1 N16233-1 N61704-1 N5...,1-1 3-1 9-1 0-1 9-1 7-1 2-1 9-1 7-1
3,4,U87380,N63554-1 N49153-1 N28678-1 N23232-1 N43369-1 N...,2-1 4-1 7-1 7-1 2-1 2-1 3-1 11-1 7-1 11-1 2-1 ...
4,5,U9444,N51692-1 N18285-1 N26015-1 N22679-1 N55556-1,4-1 5-1 4-1 5-1 3-1
...,...,...,...,...
73147,73148,U77536,N28691-1 N8845-1 N58434-1 N37120-1 N22185-1 N6...,0-1 0-1 0-1 0-1 15-1 15-1 31-1 15-1 15-1 4-1 3...
73148,73149,U56193,N4705-1 N58782-1 N53531-1 N46492-1 N26026-1 N2...,0-1 0-1 10-1 0-1 0-1 11-1 1-1 8-1 3-1
73149,73150,U16799,N40826-1 N42078-1 N15670-1 N15295-1 N64536-1 N...,21-1 3-1 7-1 2-1 0-1 2-1 5-1
73150,73151,U8786,N3046-1 N356-1 N20483-1 N46107-1 N44598-1 N186...,2-1 2-1 2-1 1-1 2-1 13-1 1-1 5-1 1-1 5-1


In [29]:
df_histories['history_indices'].apply(lambda x: len(x.split())) == df_histories['history-1'].apply(lambda x: len(x)) 

0        False
1        False
2        False
3        False
4        False
         ...  
73147    False
73148    False
73149    False
73150    False
73151    False
Length: 73152, dtype: bool

In [54]:
pd.read_csv('/home/users1/hardy/hardy/project/vae/outputs/test/train_histories_indices.csv')

Unnamed: 0,impression_id,user_id,history-1,history_indices
0,1,U87243,N8668-1 N39081-1 N65259-1 N79529-1 N73408-1 N4...,77-1 72-1 185-1 235-1 185-1 9-1 86-1 79-1 101-...
1,2,U598644,N56056-1 N8726-1 N70353-1 N67998-1 N83823-1 N1...,40-1 66-1 129-1 188-1 63-1 12-1 56-1 12-1 123-...
2,3,U532401,N128643-1 N87446-1 N122948-1 N9375-1 N82348-1 ...,48-1 48-1 87-1 53-1 31-1 164-1 187-1 156-1 11-...
3,4,U593596,N31043-1 N39592-1 N4104-1 N8223-1 N114581-1 N9...,109-1 59-1 14-1 59-1 2-1 53-1 1-1 87-1 53-1 31...
4,5,U239687,N65250-1 N122359-1 N71723-1 N53796-1 N41663-1 ...,119-1 89-1 14-1 124-1 162-1 28-1 219-1 157-1 3...
