In [None]:
import pandas as pd
from pathlib import Path
import bm25s
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from scipy.stats import rankdata
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def compute_bm25s_scores(query: str, df: pd.DataFrame):
    """
    Rank every row of ``df`` against ``query`` with BM25 (Lucene variant).

    Each document is the string "<title>: <abstract>" built from the
    ``title`` and ``abstract`` columns.

    Parameters
    ----------
    query : str
        Free-text search query.
    df : pd.DataFrame
        Must contain ``title`` and ``abstract`` columns.

    Returns
    -------
    np.ndarray
        Dense ranks (1 = best match), one per row, in the same order as
        the rows of ``df``. An empty ``df`` yields an empty array.
    """
    corpus = (df["title"].astype(str) + ": " + df["abstract"].astype(str)).tolist()
    if not corpus:
        # Nothing to index; mirror tfidf_vectorizer's empty-input behaviour.
        return np.array([], dtype=int)

    corpus_tokens = bm25s.tokenize(corpus, stopwords="english")

    retriever = bm25s.BM25(method="lucene")
    retriever.index(corpus_tokens)

    # Tokenize query
    query_tokens = bm25s.tokenize(query, stopwords="english")

    # retrieve() returns the hits ordered best-first, NOT in corpus order.
    # Without a corpus argument the first element holds document indices,
    # so use them to put each score back at its document's row position.
    doc_ids, hit_scores = retriever.retrieve(query_tokens, k=len(corpus))
    scores = np.zeros(len(corpus), dtype=float)
    scores[np.asarray(doc_ids).reshape(-1)] = np.asarray(hit_scores, dtype=float).reshape(-1)

    # Normalize scores to 0-1 range (all zeros when nothing matched)
    top_score = scores.max()
    if top_score > 0:
        scores_normalized = scores / top_score
    else:
        scores_normalized = np.zeros(len(corpus))
    return rankdata(-scores_normalized, "dense")

########################################################

def tfidf_vectorizer(query: str, papers: list[str]):
    """
    Rank ``papers`` by TF-IDF cosine similarity to ``query``.

    The query and the papers are vectorised together so they share one
    vocabulary. Returns dense ranks (1 = most similar), one per paper,
    or an empty array when ``papers`` is empty.
    """
    if not papers:
        return np.array([])

    # Row 0 of the fitted matrix is the query; the remaining rows are the papers.
    matrix = TfidfVectorizer(stop_words='english').fit_transform([query, *papers])
    query_row = matrix[0:1]
    paper_rows = matrix[1:]
    sims = cosine_similarity(query_row, paper_rows).flatten()

    # Scale into 0-1; divide by 1 instead when the best similarity is 0.
    peak = sims.max()
    divisor = peak if peak else 1
    return rankdata(-(sims / divisor), "dense")


#####################################################

def compute_lsa_scores(query: str, df: pd.DataFrame, n_components: int = 100):
    """
    Rank every row of ``df`` against ``query`` in an LSA topic space.

    Each document is "<title>: <abstract>". A TF-IDF matrix is built,
    reduced with Truncated SVD, and the query is projected into the same
    space before cosine similarity is computed.

    NOTE: the TF-IDF vocabulary and the SVD model are re-fitted on every
    call. Cache the fitted objects if many queries are scored against the
    same corpus.

    Parameters
    ----------
    query : str
        Free-text search query.
    df : pd.DataFrame
        Must contain ``title`` and ``abstract`` columns.
    n_components : int
        Requested number of SVD components. It is reduced automatically
        when the TF-IDF vocabulary is too small to support it.

    Returns
    -------
    np.ndarray
        Dense ranks (1 = most similar), aligned with the rows of ``df``.
        An empty ``df`` yields an empty array.
    """
    # 1. Create the corpus
    corpus = (df["title"].astype(str) + ": " + df["abstract"].astype(str)).tolist()
    if not corpus:
        # Nothing to fit; mirror tfidf_vectorizer's empty-input behaviour.
        return np.array([], dtype=int)

    # --- LSA Indexing ---
    # 2. TF-IDF matrix (the input to LSA); very common and very rare terms
    #    are filtered out via max_df / min_df.
    vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8, min_df=5)
    tfidf_matrix = vectorizer.fit_transform(corpus)

    # 3. Truncated SVD (this *is* LSA). TruncatedSVD rejects n_components
    #    larger than the number of features, so clamp the request to what
    #    the vocabulary can support (strictly below n_features, at least 1).
    n_features = tfidf_matrix.shape[1]
    effective_components = max(1, min(n_components, n_features - 1))
    svd_model = TruncatedSVD(n_components=effective_components, random_state=42)

    # 4. Documents projected into topic space
    lsa_matrix = svd_model.fit_transform(tfidf_matrix)

    # --- LSA Querying ---
    # 5. Project the query through the same vectorizer and SVD model
    query_tfidf = vectorizer.transform([query])
    query_lsa = svd_model.transform(query_tfidf)

    # 6. Cosine similarity between the query and every document
    scores = cosine_similarity(query_lsa, lsa_matrix)

    # 7. Flatten and rank (negated so the highest similarity gets rank 1)
    scores = scores.flatten()
    return rankdata(-scores, "dense")


#################################################################

def compute_bm25_centroid_ranks(df: pd.DataFrame, centroid_k: int = 5):
    """
    Rank all papers by cosine similarity to the centroid of the
    ``centroid_k`` best BM25-ranked papers that have an embedding.

    Requires ``df`` to already contain a ``bm25_rank`` column (1 = best)
    and an ``embedding`` column whose cells are dicts holding the vector
    under the ``'specter_v2'`` key.

    Side effect: writes a ``specter_vector`` column onto ``df``.

    Rows without a usable embedding (cell is not a dict, or the key is
    absent) are given the worst rank. If the available vectors cannot be
    stacked into one 2D array (none available, or differing lengths), a
    message is printed and every row receives the rank ``len(df)``.

    Returns
    -------
    np.ndarray
        Dense ranks (1 = closest to the centroid), aligned with ``df``.
    """

    # --- Step 1: Clean the Data ---

    # 1a. Extract vectors. Cells that are not dicts (None / NaN from papers
    #     with no embedding field) become None instead of raising.
    df['specter_vector'] = df['embedding'].apply(
        lambda x: x.get('specter_v2') if isinstance(x, dict) else None
    )

    # 1b. Keep only rows with a vector, preserving the original index so
    #     scores can be mapped back onto the full frame later.
    df_clean = df.dropna(subset=['specter_vector']).copy()

    # 1c. Stack the vectors into a 2D array
    try:
        embeddings_clean_all = np.stack(df_clean['specter_vector'].values)
    except ValueError as e:
        print(f"Error: Could not stack embeddings. Are they all the same length? {e}")
        # Return a useless rank for all rows if it fails
        return np.full(len(df), len(df), dtype=int)

    # --- Step 2: Create Pseudo-Query (Centroid) ---

    # 2a. Best 'k' papers by bm25_rank (ascending: rank 1 is the best)
    top_k_papers = df_clean.sort_values(by="bm25_rank", ascending=True).head(centroid_k)

    # 2b. Positions of those papers inside df_clean / embeddings_clean_all
    top_k_ilocs = df_clean.index.get_indexer(top_k_papers.index)

    # 2c. Their vectors
    top_k_vectors = embeddings_clean_all[top_k_ilocs]

    # 2d. The centroid acts as the query vector
    centroid_query_vector = np.mean(top_k_vectors, axis=0).reshape(1, -1)

    # --- Step 3: Re-rank ---

    # 3a. Similarity of the centroid to every paper that has an embedding
    scores = cosine_similarity(centroid_query_vector, embeddings_clean_all)
    scores = scores.flatten()

    # --- Step 4: Map Ranks Back to Original DF ---

    # 4a. Label each score with its original index
    ranked_scores_series = pd.Series(scores, index=df_clean.index)

    # 4b. Expand to the full index; rows without an embedding become NaN
    full_scores_series = ranked_scores_series.reindex(df.index)

    # 4c. Unscored rows get -inf, which sorts below any real similarity
    full_scores_series = full_scores_series.fillna(-np.inf)

    # 4d. Rank the full-length array; the -inf rows share the worst rank
    return rankdata(-full_scores_series.values, "dense")

#################################################################

def compute_rrf_relevance(df, k: float = 60):
    """
    Reciprocal Rank Fusion of the four relevance rankings.

    For every row, sums ``1 / (rank + k)`` over the ``bm25_rank``,
    ``tfid_rank``, ``lsa_rank`` and ``pseudo_specter_rank`` columns.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the four rank columns listed above (1 = best).
    k : float
        Smoothing constant added to each rank before taking the
        reciprocal. Larger values flatten the difference between top and
        lower ranks. Defaults to 60.

    Returns
    -------
    pd.Series
        Fused score per row; higher means more relevant.
    """
    return (
        1 / (df['bm25_rank'] + k) +
        1 / (df['tfid_rank'] + k) +
        1 / (df['lsa_rank'] + k) +
        1 / (df['pseudo_specter_rank'] + k)
    )


##################################################################

def compute_authority_scores(df: pd.DataFrame):
    """
    Rank papers by citation-based authority.

    The raw authority of a paper is
    ``log10(max(influentialCitationCount, 0.3 * citationCount) + 1)``.
    A missing column or a missing / non-numeric value counts as 0, so a
    single paper without citation data cannot turn the whole ranking
    into NaN.

    Side effect: writes the raw value to ``df['authority_raw']``.

    Returns
    -------
    np.ndarray
        Dense ranks (1 = highest authority), aligned with ``df``.
    """
    def _count(column: str) -> np.ndarray:
        # Absent column -> all zeros; absent or unparsable cell -> 0.
        if column not in df.columns:
            return np.zeros(len(df), dtype=float)
        return pd.to_numeric(df[column], errors="coerce").fillna(0).to_numpy(dtype=float)

    influential = _count('influentialCitationCount')
    discounted_total = _count('citationCount') * 0.3

    authority_raw = np.log10(np.maximum(influential, discounted_total) + 1)
    df['authority_raw'] = authority_raw

    # Rank: higher authority = lower rank number
    return rankdata(-authority_raw, method='dense')


################################################################

def compute_recency_scores(df: pd.DataFrame, current_year: int = 2025, lambda_decay: float = 0.1):
    """
    Rank papers by recency using exponential decay on publication year.

    The raw score is ``exp(-lambda_decay * (current_year - year))``; rows
    with a missing year score 0. The raw score is written to
    ``df['recency_raw']`` as a side effect.

    ``lambda_decay`` sets how fast older papers fade. The score falls to
    1/e after ``1 / lambda_decay`` years and halves every
    ``ln(2) / lambda_decay`` years:
    - 0.1 -> 1/e after 10 years, half-life of about 6.9 years
    - 0.3 -> 1/e after about 3.3 years, half-life of about 2.3 years

    Returns dense ranks (1 = newest/best), aligned with ``df``.
    """
    def _decayed(year):
        # Missing publication year: treat as the least recent.
        if pd.notna(year):
            return np.exp(-lambda_decay * (current_year - year))
        return 0

    df['recency_raw'] = df['year'].apply(_decayed)

    # Negate so the largest recency score receives rank 1.
    raw_scores = df['recency_raw'].values
    return rankdata(-raw_scores, method='dense')

#######################################################################

def normalize_scores(df: pd.DataFrame, score_columns: list):
    """
    Min-max scale each listed column into a new ``<column>_norm`` column.

    The frame is modified in place and also returned. A column whose
    values do not span a positive range (max == min) gets the neutral
    constant 0.5 instead of a division by zero.
    """
    for name in score_columns:
        values = df[name]
        low = values.min()
        span = values.max() - low
        target = f'{name}_norm'

        if not span > 0:
            # Every value is the same - no spread to scale by.
            df[target] = 0.5
            continue

        df[target] = (values - low) / span

    return df


In [None]:
def compute_final_ranking(df: pd.DataFrame, 
                          w_relevance: float = 0.60,
                          w_authority: float = 0.35,
                          w_recency: float = 0.05):
    """
    Blend relevance, authority and recency into one ``final_score``.

    Expects ``df`` to already hold the rank columns consumed by
    ``compute_rrf_relevance`` plus the columns read by the authority and
    recency helpers. Intermediate columns are written onto ``df`` itself.

    Returns the frame sorted by ``final_score``, best first.
    """
    # Fused relevance: score first (higher is better), then a dense rank
    # of the negated score so the best paper receives rank 1.
    df['rrf_relevance_SCORE'] = compute_rrf_relevance(df)
    fused_scores = df['rrf_relevance_SCORE'].values
    df['rrf_relevance_rank'] = rankdata(-fused_scores, method='dense')

    # Remaining signals, also expressed as ranks (lower = better).
    df['authority_rank'] = compute_authority_scores(df)
    df['recency_rank'] = compute_recency_scores(df)

    # Rank column -> name of the "higher is better" score derived from it.
    rank_to_score = {
        'rrf_relevance_rank': 'relevance_score',
        'authority_rank': 'authority_score',
        'recency_rank': 'recency_score',
    }

    # Min-max normalise the ranks, then flip them so 1.0 is the best.
    df = normalize_scores(df, list(rank_to_score))
    for rank_col, score_col in rank_to_score.items():
        df[score_col] = 1.0 - df[f'{rank_col}_norm']

    # Weighted sum of the three signals.
    relevance_part = w_relevance * df['relevance_score']
    authority_part = w_authority * df['authority_score']
    recency_part = w_recency * df['recency_score']
    df['final_score'] = relevance_part + authority_part + recency_part

    return df.sort_values(by='final_score', ascending=False)

In [None]:
# Free-text search query; the feature cell passes it to the TF-IDF, BM25 and LSA rankers.
query =  "adverse mental health outcomes social media use adolescents"

In [None]:
# Load every saved search result (*.json) from the sibling "searches"
# directory into one frame. glob() yields paths in arbitrary, filesystem-
# dependent order, so sort them to keep the row order reproducible.
file_path = sorted(Path.cwd().parents[0].joinpath("searches").glob("*.json"))
df = pd.concat([pd.read_json(i) for i in file_path]).reset_index(drop = True)
    # .drop(columns = ["paperId", "openAccessPdf", "authors", "venue"])
df.sample(5)

In [None]:
# Rank columns, in evaluation order: assign() resolves the callables one
# after another, so later entries can read the columns created by earlier ones.
rank_features = {
    "tfid_rank": lambda frame: tfidf_vectorizer(query, frame["title"].tolist()),
    "bm25_rank": lambda frame: compute_bm25s_scores(query, frame),
    "lsa_rank": lambda frame: compute_lsa_scores(query, frame),
    "pseudo_specter_rank": lambda frame: compute_bm25_centroid_ranks(frame),
    "rrf_score": lambda frame: compute_rrf_relevance(frame),
    "authority_rank": lambda frame: compute_authority_scores(frame),
    "recency_rank": lambda frame: compute_recency_scores(frame, 2025),
}
df = df.assign(**rank_features)
df

In [None]:
# Display the papers ordered by the weighted final score (best first), using
# the default weights. Note: compute_final_ranking assigns its intermediate
# columns onto the frame it is given.
compute_final_ranking(df)
    # .openAccessPdf.to_list()