In [24]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F


# Write a 2 layer dense neural network in PyTorch
class QueryTransformer(torch.nn.Module):
    """Two-layer dense network that maps an input vector to a unit-length output vector.

    Each linear layer is followed by a GELU activation, and the result is
    L2-normalized along the feature (last) dimension.

    NOTE(review): GELU on the output layer bounds every component from below
    (GELU's minimum is roughly -0.17) before normalization. If the outputs are
    meant to match embeddings with strongly negative components, the final
    activation may need to be removed — confirm this is intended.

    Args:
        input_dim: Size of the last dimension of the input.
        hidden_dim: Width of the hidden layer.
        output_dim: Size of the last dimension of the output.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.dense_1 = torch.nn.Linear(input_dim, hidden_dim)
        self.dense_2 = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Transform `x` of shape (..., input_dim) into unit-norm vectors of shape (..., output_dim)."""
        x = F.gelu(self.dense_1(x))
        x = F.gelu(self.dense_2(x))
        # Normalize over the last axis (the one the linear layers act on) so that
        # unbatched vectors and inputs with extra leading dimensions are handled
        # the same way as a (batch, dim) matrix.
        return F.normalize(x, p=2, dim=-1)

In [25]:
# 1024-d in and out, with a 512-d hidden bottleneck.
query_transformer = QueryTransformer(
    input_dim=1024,
    hidden_dim=512,
    output_dim=1024,
)

In [26]:
# Smoke test: push a random batch of two 1024-d vectors through the model
# and confirm the output shape.
x = torch.randn((2, 1024))
y = query_transformer(x)
print(y.shape)

torch.Size([2, 1024])


In [27]:
# Each output row should have unit L2 norm.
torch.linalg.norm(y, ord=2, dim=1)

tensor([1.0000, 1.0000], grad_fn=<LinalgVectorNormBackward0>)

In [28]:
from citeline.embedders import Embedder

# Use Apple's Metal (MPS) backend when it is available and fall back to the CPU
# otherwise, so the notebook also runs on machines without MPS support.
# Assumes Embedder.create accepts "cpu" as a device string — TODO confirm.
embedder_device = "mps" if torch.backends.mps.is_available() else "cpu"
embedder = Embedder.create(model_name="Qwen/Qwen3-Embedding-0.6B", device=embedder_device, normalize=True)

# Embed two sample strings as queries and inspect the resulting shape.
documents = ["Hi there", "How are you?"]
vectors_np = embedder(documents, for_queries=True)

vectors = torch.tensor(vectors_np)
vectors.shape

torch.Size([2, 1024])

In [29]:
# QueryExpander has to be in scope before this cell runs; without the import a
# fresh "Restart & Run All" fails here with a NameError.
from citeline.query_expander import QueryExpander

df = pd.read_json("../data/dataset/nontrivial_10.jsonl", lines=True)
reference_data = pd.read_json("../data/preprocessed/reviews.jsonl", lines=True)
expander = QueryExpander("add_prev_3", reference_data=reference_data)

In [None]:
from sklearn.model_selection import train_test_split
from citeline.query_expander import QueryExpander
from citeline.database.milvusdb import MilvusDB
from pathlib import Path

# Database handle passed to create_dataset() below, which uses it to look up
# target chunk vectors by DOI.
db = MilvusDB()

def create_dataset(
    path_to_data: str,
    embedder: Embedder,
    expansions: list[str],
    expansion_data_path: str,
    db: MilvusDB,
    collection_name: str,
    output_path: str,
    test_size: float = 0.2,
    random_state=42,
) -> None:
    """Split the query dataset and match every expanded query to its closest target chunk.

    Work in progress: for the training split, the index of the best-matching
    target vector is computed for each (expanded query, cited DOI) combination,
    but the resulting (query embedding, target embedding) pairs are not yet
    written to the output buffers.

    Args:
        path_to_data: JSON-lines file of examples; each row must provide the
            `sent_no_cit` and `citation_dois` fields.
        embedder: Callable that embeds a list of strings and exposes `dim`.
        expansions: Names of the query expansions applied to every query.
        expansion_data_path: JSON-lines file given to each QueryExpander as
            reference data.
        db: Database used to look up target chunk vectors by DOI.
        collection_name: Collection queried for the target vectors.
        output_path: Directory that receives `train.npy` and `test.npy`;
            created if it does not exist.
        test_size: Fraction of rows held out, forwarded to train_test_split.
        random_state: Seed for the split, forwarded to train_test_split.
    """
    # One expander per requested expansion, all sharing the same reference data
    reference_data = pd.read_json(expansion_data_path, lines=True)
    expanders = [QueryExpander(expansion, reference_data=reference_data) for expansion in expansions]

    # Load data and split it
    df = pd.read_json(path_to_data, lines=True)
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)

    # Each buffer is sized by the row count of its own split
    train_shape = (len(train_df), embedder.dim)
    test_shape = (len(test_df), embedder.dim)

    output_dir = Path(output_path)
    output_dir.mkdir(parents=True, exist_ok=True)
    # NOTE(review): np.memmap writes raw bytes without a .npy header, so these
    # files must be read back with np.memmap (same dtype and shape), not np.load.
    train_mm = np.memmap(output_dir / "train.npy", dtype=np.float32, mode="w+", shape=train_shape)
    test_mm = np.memmap(output_dir / "test.npy", dtype=np.float32, mode="w+", shape=test_shape)

    # TODO: process rows in batches
    # itertuples() yields one namedtuple per row, so fields are read by attribute
    for row in train_df.itertuples(index=False):
        # expand the query w/no previous sentence, 1, 2 (and 3?) previous sentences
        query = row.sent_no_cit
        dois = row.citation_dois
        # Embedded as queries, matching how the embedder is called elsewhere in
        # this notebook.
        query_vectors = np.asarray(embedder([expander(query) for expander in expanders], for_queries=True))

        # Get the target chunks' vectors from the database
        for doi in dois:
            target_entities = db.select_by_doi(doi=doi, collection_name=collection_name)
            target_vectors = np.array(target_entities["vector"].tolist())
            if target_vectors.size == 0:
                # No chunks stored for this DOI; argmax over nothing would raise
                continue

            # Row i holds the similarity of expanded query i to every target chunk
            similarities = query_vectors @ target_vectors.T
            best_targets = similarities.argmax(axis=1)

            # TODO: for each query embedding, pick target_vectors[best_targets[i]];
            # (query embedding, target embedding) becomes the (input, label) pair.
            # TODO: write the pairs out to the dataset buffers (and do the same
            # for test_df).

    # Make sure the (currently zero-filled) buffers reach disk
    train_mm.flush()
    test_mm.flush()

# Build the dataset from the nontrivial_10 examples, expanding every query with
# both the "identity" and "add_prev_3" expansions.
create_dataset(
    path_to_data="../data/dataset/nontrivial_10.jsonl",
    embedder=embedder,
    expansions=["identity", "add_prev_3"],
    expansion_data_path="../data/preprocessed/reviews.jsonl",
    db=db,
    collection_name="qwen06_chunks",
    output_path=".",
)

1024