In [11]:
from pathlib import Path
from pymilvus import MilvusClient
import os
import numpy as np
import pandas as pd
import torch
import tqdm
import lightning as L
from lightning.pytorch.callbacks import ModelCheckpoint
import torch.nn.functional as F
from data_modules.mind_recsys_data import MINDRecSysDataModule
from data_modules.mind_component import load_news_data, load_history_data
from modules.llama_decoder import LlamaDecoderForNextArticle
from modules.res_vqvae import RVQVAE
from modules.lstur import LSTUR
from data_modules.indices_data import SeqVQVAEDataModule
os.environ['CUDA_VISIBLE_DEVICES']  = '0,1,2,3,5'

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

codebook_size = 775
codebook_sizes = [414, 69, 106, 69, 117]


In [None]:
seqvqvae = LlamaDecoderForNextArticle.load_from_checkpoint(
        '/home/users1/hardy/hardy/project/vae/checkpoints/seqvqvae_all_sts-epoch=14-val_loss=2.1423.ckpt',
        codebook_size=codebook_size+1,
        hidden_size=768,
        intermediate_size=2048,
        num_hidden_layers=10,
        num_attention_heads=12,
        max_position_embeddings=4090)
seqvqvae.eval()
seqvqvae.to(device)


In [4]:
vqvae_checkpoint_paths = [
    '/mount/arbeitsdaten66/projekte/multiview/hardy/project/vae/checkpoints/rvqvae_std_sts-epoch=12-val_loss=0.72483.ckpt',
    '/mount/arbeitsdaten66/projekte/multiview/hardy/project/vae/checkpoints/rvqvae_cat_sts-epoch=12-val_loss=0.44667.ckpt',
    '/mount/arbeitsdaten66/projekte/multiview/hardy/project/vae/checkpoints/rvqvae_frame_sts-epoch=18-val_loss=0.17045.ckpt',
    '/mount/arbeitsdaten66/projekte/multiview/hardy/project/vae/checkpoints/rvqvae_political_sts-epoch=07-val_loss=0.27821.ckpt',
    '/mount/arbeitsdaten66/projekte/multiview/hardy/project/vae/checkpoints/rvqvae_sentiment_sts-epoch=10-val_loss=0.40279.ckpt'


]
codebook_sizes = [414, 69, 106, 69, 117]
codebook_dims = [512, 512, 512, 128, 512]
hidden_sizes = [128, 256, 1024, 128, 256]
vqvae_models = []
for vqvae_path, codebook_size, codebook_dim, hidden_size in zip(vqvae_checkpoint_paths, codebook_sizes, codebook_dims, hidden_sizes):
        rvqvae = RVQVAE.load_from_checkpoint(vqvae_path, 
                codebook_dim=codebook_dim, 
                codebook_sizes=[codebook_size],
                num_quantizers=1,
                encoder_hidden_size=hidden_size,
                decoder_hidden_size=hidden_size,
                input_size=1024)
        rvqvae.eval()
        vqvae_models.append(rvqvae)

In [4]:
def load_aspect_vectors(path: Path):
    """
    Load aspect vectors from a given path.
    """
    data = {}
    with open(path, 'r') as f:
        for line in f:
            parts = line.strip().split()
            nid = int(parts[0])
            vector = [float(x) for x in parts[1:]]
            data[nid] = np.array(vector, dtype=np.float32)
    return data

std_dev_data_path='/home/users1/hardy/hardy/datasets/mind_resplit/MINDlarge_dev'
mfc_data_path='/home/users1/hardy/hardy/datasets/mind_resplit/MINDlarge_MFC_dev'
sentiment_data_path = '/home/users1/hardy/hardy/datasets/mind_resplit/MINDlarge_sentiment_dev'
political_data_path = '/home/users1/hardy/hardy/datasets/mind_resplit/MINDlarge_political_dev'
news_std = load_news_data(Path(std_dev_data_path), 'dev')
news_mfc = load_news_data(Path(mfc_data_path), 'dev')
news_sentiment = load_news_data(Path(sentiment_data_path), 'dev')
news_political = load_news_data(Path(political_data_path), 'dev')
std_aspect_vector = load_aspect_vectors(Path('/home/users1/hardy/hardy/project/vae/outputs/mind/dev_mind_std_sts_aspect_vectors.txt'))
mfc_aspect_vector = load_aspect_vectors(Path('/home/users1/hardy/hardy/project/vae/outputs/mind/dev_mind_frame_aspect_vectors.txt'))
sentiment_aspect_vector = load_aspect_vectors(Path('/home/users1/hardy/hardy/project/vae/outputs/mind/dev_mind_sentiment_aspect_vectors.txt'))
political_aspect_vector = load_aspect_vectors(Path('/home/users1/hardy/hardy/project/vae/outputs/mind/dev_mind_political_aspect_vectors.txt'))

In [5]:
behavior = load_history_data(Path(std_dev_data_path), 'dev', news_std, fix_history=False)


In [None]:
def load_code_dict(code_dict_path):
    code_dict = {}
    with open(code_dict_path, 'r') as f:
        for line in f:
            parts = line.strip().split()
            nid = parts[0]
            code = int(parts[1])
            code_dict[nid] = code
    return code_dict

# Example usage:
std_code_dict = load_code_dict('/home/users1/hardy/hardy/project/vae/outputs/mind/dev_mind_std_sts_code_dict.txt')
cat_code_dict = load_code_dict('/home/users1/hardy/hardy/project/vae/outputs/mind/dev_mind_category_code_dict.txt')
mfc_code_dict = load_code_dict('/home/users1/hardy/hardy/project/vae/outputs/mind/dev_mind_frame_code_dict.txt')
sentiment_code_dict = load_code_dict('/home/users1/hardy/hardy/project/vae/outputs/mind/dev_mind_sentiment_code_dict.txt')
political_code_dict = load_code_dict('/home/users1/hardy/hardy/project/vae/outputs/mind/dev_mind_political_code_dict.txt')

In [6]:
def indices_map(codebook_sizes):
    """
    Maps indices to their respective codebook sizes.
    """
    indices_map = {}
    k = 0
    for i in range(len(codebook_sizes)):
         for j in range(codebook_sizes[i]):
              indices_map[(i, j)] = k
              k += 1
    return indices_map

In [7]:
indices_map_dict = indices_map(codebook_sizes)

In [None]:
def encode_behavior(behavior, code_dict, indices_map_dict, i):
    return_behavior = behavior.copy()
    for idx, row in tqdm.tqdm(return_behavior.iterrows(), total=len(return_behavior)):
        history_indices = []
        for article_id in row['history'].split():
            article_id = article_id[1:]
            index = indices_map_dict[(i, code_dict[article_id])]
            history_indices.append(str(index))
        return_behavior.loc[idx, 'history_indices'] = ' '.join(history_indices)
    return return_behavior

std_behavior = encode_behavior(behavior, std_code_dict, indices_map_dict, 0)
cat_behavior = encode_behavior(behavior, cat_code_dict, indices_map_dict, 1)
mfc_behavior = encode_behavior(behavior, mfc_code_dict, indices_map_dict, 2)
political_behavior = encode_behavior(behavior, political_code_dict, indices_map_dict, 3)
sentiment_behavior = encode_behavior(behavior, sentiment_code_dict, indices_map_dict, 4)

In [None]:
dfs_list = [std_behavior, cat_behavior, mfc_behavior, political_behavior, sentiment_behavior]
# Assumes all DataFrames have the same order and columns: ['impression_id', 'user_id', 'history-1', 'history_indices']
total_len = len(dfs_list[0])
combined_indices = {}
for i in tqdm.tqdm(range(total_len)):
    combined_indices[dfs_list[0]['impression_id'][i]] = []
    all_history_indices = [dfs_list[j]['history_indices'][i].split() for j in range(len(dfs_list))]
    flat_zipped_indices = [idx for idc in zip(*all_history_indices) for idx in idc]
    combined_indices[dfs_list[0]['impression_id'][i]] = flat_zipped_indices
result_df = pd.DataFrame(combined_indices.items(), columns=['impression_id', 'history_indices'])
# Take the longest history_indices for each impression_id
result_df['history_indices'] = result_df['history_indices'].apply(lambda x: ' '.join(map(str, x)))
result_df.to_csv('combined_history_indices.csv', index=False)

In [5]:
result_df = pd.read_csv('/home/users1/hardy/hardy/project/vae/outputs/mind/combined_dev_histories_indices.csv')

In [32]:
result_df

NameError: name 'result_df' is not defined

In [None]:
seqvqvae_data_module = SeqVQVAEDataModule(
    test_df = result_df,
    batch_size=4,
    max_len=10000,
    overlap=0,
    begin_token = sum(codebook_sizes)
)
seqvqvae_data_module.setup('test')

In [None]:
dataloader = seqvqvae_data_module.test_dataloader()

In [None]:
seqvqvae.set_predict_params(codebook_sizes=codebook_sizes, beam_size=10, n_tokens=5)

In [None]:
shapes = results[0].shape
shapes

In [None]:
results = []
i = 5
for batch in tqdm.tqdm(dataloader):
    batch = [x.cuda() for x in batch]
    outputs = seqvqvae.predict_step(batch, 0)
    results.append(outputs[0].cpu().numpy())
    i = i - 1
    if i == 0:
        break

In [None]:
len(results)

In [None]:
output_lines = []
split_results = [split for result in results for split in np.split(result, result.shape[0], axis=0)]
output_lines.append(' '.join(map(str, split_results[0].shape)))
for result in split_results:
    result = np.reshape(result, -1)
    output_lines.append(' '.join(map(str, result)))

In [None]:
with open('output_lines.txt', 'w') as f:
    for line in output_lines:
        f.write(line + '\n')

In [6]:
def load_seq_result(path: Path):
    """
    Load aspect vectors from a given path.
    """
    shapes = []
    data = []
    with open(path, 'r') as f:
        first = True
        for line in f:
            if first:
                shapes = line.strip().split()
                first = False
            else:
                parts = line.strip().split()
                vector = np.array([int(x) for x in parts])
                vector = vector.astype(np.int32).reshape(*map(int, shapes))
                data.append(vector)
    return data


In [None]:
output_lines_2 = load_seq_result(Path('output_lines.txt'))

In [7]:
test_output = load_seq_result(Path('/home/users1/hardy/hardy/project/vae/outputs/mind/seq_prediction_beam_25.txt'))

In [13]:
len(test_output[0][0])

25

In [8]:
def indices_map(codebook_sizes):
    """
    Maps indices to their respective codebook sizes.
    """
    indices_map = {}
    k = 0
    for i in range(len(codebook_sizes)):
         for j in range(codebook_sizes[i]):
              indices_map[k] = (i, j)
              k += 1
    return indices_map

indices_map_dict = indices_map(codebook_sizes)

In [15]:
from tqdm import tqdm
rows = []
for prediction in tqdm(test_output[:4]):
    candidates = []    
    for cand in prediction.squeeze(0):
        concat_vectors = []
        for i in range(cand.shape[0]):
            j, cand_idx = indices_map_dict[cand[i]]
            assert i == j, f"Index mismatch: {i} != {j} for cand_idx {cand_idx}"
            aspect_vector = vqvae_models[i].decode_from_index(torch.tensor(cand_idx, device=device, requires_grad=False)).detach().cpu().numpy()
            concat_vectors.append(aspect_vector)
        concat_vectors = np.concatenate(concat_vectors, axis=-1)
        candidates.append(concat_vectors.squeeze(0))
    rows.append(candidates)

100%|██████████| 4/4 [00:01<00:00,  2.54it/s]


In [16]:
len(rows[0])

25

In [17]:
decoded_output = []
for cand in test_output[1].squeeze(0):
    concat_vectors = []
    for i in range(cand.shape[0]):
        j, cand_idx = indices_map_dict[cand[i]]
        assert i == j, f"Index mismatch: {i} != {j} for cand_idx {cand_idx}"
        aspect_vector = vqvae_models[i].decode_from_index(torch.tensor(cand_idx, device=device, requires_grad=False)).detach().cpu().numpy()
        concat_vectors.append(aspect_vector)
    concat_vectors = np.concatenate(concat_vectors, axis=-1)
    decoded_output.append(concat_vectors.squeeze(0))

In [None]:
import matplotlib.pyplot as plt

flat_values = decoded_output[0].flatten()
plt.hist(flat_values, bins=50)
plt.title("Distribution of decoded_output[0] values")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.show()

In [12]:
client = MilvusClient("/home/users1/hardy/hardy/project/vae/outputs/mind/aspect_data_new2.db")


In [13]:
collections = client.list_collections()
print(collections)

['dev_mind_2019_11_14', 'test_mind_2019_11_15']


In [15]:
# num_entities = client.num_entities("dev_mind_2019_11_14")
# Get collection statistics to find number of entities
stats = client.get_collection_stats("dev_mind_2019_11_14")
num_entities = stats['row_count']
print(f"Number of instances in dev_mind_2019_11_14: {num_entities}")

Number of instances in dev_mind_2019_11_14: 8705


In [10]:
import random

rows = []
for prediction in tqdm(test_output[:100]):
    candidates = []    
    for cand in prediction.squeeze(0):
        # concat_vectors = []
        
        # for i in range(cand.shape[0]):
        #     j, cand_idx = indices_map_dict[cand[i]]
        #     assert i == j, f"Index mismatch: {i} != {j} for cand_idx {cand_idx}"
        #     aspect_vector = vqvae_models[i].decode_from_index(torch.tensor(cand_idx, device=device, requires_grad=False)).detach().cpu().numpy()
        #     # if random.random() < 0.25:  # Only flip vectors for non-standard aspects
        #     #     aspect_vector = -aspect_vector
        #     concat_vectors.append(aspect_vector)
        # concat_vectors_concat = np.concatenate(concat_vectors, axis=-1)
        # candidates.append(concat_vectors_concat.squeeze(0))
        concat_vectors = []
        for i in range(cand.shape[0]):
            j, cand_idx = indices_map_dict[cand[i]]
            assert i == j, f"Index mismatch: {i} != {j} for cand_idx {cand_idx}"
            aspect_vector = vqvae_models[i].decode_from_index(torch.tensor(cand_idx, device=device, requires_grad=False)).detach().cpu().numpy()
            if random.random() < 0.5:  # Only flip vectors for non-standard aspects
                aspect_vector = -aspect_vector
            concat_vectors.append(aspect_vector)
        concat_vectors_concat = np.concatenate(concat_vectors, axis=-1)
        candidates.append(concat_vectors_concat.squeeze(0))
    results = client.search(
        collection_name="dev_mind_2019_11_14",
        search_params={"metric_type": "IP"},
        anns_field="vector",
        data=candidates,
        limit=2,
        output_fields=["nid", "vector"]
    )
    outputs = [None for _ in range(len(candidates))]
    for i in range(len(candidates)):
        outputs[i] = [res['nid'] for res in results[i]]
    rows.append(outputs)
rows = [[candidate for candidates in row for candidate in candidates] for row in rows]

TypeError: 'module' object is not callable. Did you mean: 'tqdm.tqdm(...)'?

In [25]:
def calculate_pairwise_dissimilarity(features):
    features = np.stack(features)
    norms = np.linalg.norm(features, axis=1, keepdims=True)
    normalized = features / norms
    similarity_matrix = np.dot(normalized, normalized.T)
    return 1 - similarity_matrix

def average_pairwise_dissimilarity(features):
    if len(features) < 2:
        return 0.0 # A list with 0 or 1 item has no diversity

    dissimilarity_matrix = calculate_pairwise_dissimilarity(features)
    
    # We only need the upper triangle (excluding diagonal)
    # The sum of unique pairs is the sum of all elements divided by 2 (because it's symmetric)
    # and subtracting the diagonal (which is 0 for dissimilarity to itself).
    # Easier: sum the upper triangle
    upper_triangle_sum = np.sum(np.triu(dissimilarity_matrix, k=1))
    
    num_pairs = len(features) * (len(features) - 1) / 2
    
    if num_pairs == 0:
        return 0.0
        
    return upper_triangle_sum / num_pairs



In [93]:
score = 0.0
for row in rows:
    features = []
    for candidate in set(row):
        feature = std_aspect_vector[candidate]
        features.append(feature)
    dissimilarity = average_pairwise_dissimilarity(features)
    score += dissimilarity
score /= len(rows)
print(score)

0.50587916


0.46346933


In [34]:
new_rows

[(109797, 5449.9345703125),
 (123026, 4928.3740234375),
 (1830, 5077.53662109375),
 (122267, 5467.5712890625),
 (120508, 4876.751953125),
 (74495, 5215.9599609375),
 (107474, 4397.984375),
 (100997, 5275.6533203125),
 (54250, 5170.484375),
 (22424, 5207.8349609375),
 (42245, 5681.4169921875),
 (113349, 5496.6591796875),
 (92498, 5111.955078125),
 (15308, 5485.8359375),
 (69387, 4908.2509765625),
 (92498, 4784.46826171875),
 (3356, 5391.423828125),
 (41808, 4445.25146484375),
 (100261, 4554.03759765625),
 (64831, 4965.2529296875),
 (88148, 5367.2412109375),
 (75353, 5771.7099609375),
 (35161, 5821.1923828125),
 (12216, 5428.8681640625),
 (104744, 5591.14697265625),
 (110550, 4746.9208984375),
 (65504, 4202.3134765625),
 (128936, 5747.4501953125),
 (67930, 5769.76708984375),
 (43248, 5306.103515625),
 (8403, 5184.4833984375),
 (65624, 5716.9462890625),
 (112151, 4759.73388671875),
 (81716, 5559.14306640625),
 (85261, 5654.15625),
 (88148, 5298.0439453125),
 (79760, 4389.09521484375),
 (5

In [None]:
import pickle

with open('rows.pkl', 'wb') as f:
    pickle.dump(rows, f)

array([ 16428.        ,   5499.87304688,  19543.        ,   5478.94287109,
         8077.        ,   5478.125     ,  24442.        ,   5443.41992188,
       107695.        ,   5430.83642578,  57566.        ,   5428.89941406,
       100766.        ,   5415.3984375 , 110819.        ,   5401.37353516,
        75153.        ,   5373.47949219,  98472.        ,   5368.33203125,
        76463.        ,   5364.49023438,  10839.        ,   5337.35791016,
       119357.        ,   5322.09472656,  27481.        ,   5315.38427734,
       117610.        ,   5312.18066406,  96944.        ,   5310.56835938,
        47150.        ,   5283.27490234,  69563.        ,   5272.05322266,
         3061.        ,   5262.02636719,  91792.        ,   5259.10351562,
       128570.        ,   5242.12988281, 113149.        ,   5235.94140625,
        40654.        ,   5223.17285156,  90709.        ,   5222.94238281,
       117567.        ,   5204.01074219,  29940.        ,   4939.74902344,
       123706.        ,  