In [1]:
import pandas as pd

# Load the saved search results: one row per query ("record") with its retrieved hits ("results").
search_results_path = "search_results.parquet"
df = pd.read_parquet(search_results_path)
df.head()

Unnamed: 0,record,results
0,"{'citation_dois': ['10.1086/311563'], 'expande...","[{'citation_count': 6, 'doi': '10.1051/0004-63..."
1,"{'citation_dois': ['10.1086/368156'], 'expande...","[{'citation_count': 102, 'doi': '10.1086/36815..."
2,"{'citation_dois': ['10.1086/317785'], 'expande...","[{'citation_count': 125, 'doi': '10.1086/15541..."
3,{'citation_dois': ['10.48550/arXiv.astro-ph/97...,"[{'citation_count': 136, 'doi': '10.1086/13328..."
4,"{'citation_dois': ['10.1086/176071', '10.1086/...","[{'citation_count': 566, 'doi': '10.1086/18230..."


In [4]:
# Inspect one query record: its container type and the fields it carries.
# Both values are printed explicitly; a bare expression that is not the last
# line of a cell is evaluated and then silently discarded.
first_record = df.record.iloc[0]
print(type(first_record))
print(first_record.keys())

dict_keys(['citation_dois', 'expanded_query', 'pubdate', 'resolved_bibcodes', 'sent_cit_masked', 'sent_idx', 'sent_no_cit', 'sent_original', 'source_doi'])


In [2]:
# Raw hits stored for the first query, exactly as they come out of the parquet file.
df["results"].iloc[0]

array([{'citation_count': 6, 'doi': '10.1051/0004-6361/202450018', 'metric': 0.7013970017433167, 'pubdate': 20240801, 'text': 'Observation of potential under-luminosity in some companions, suggesting possible companion binarity or atmospheric phenomena.'},
       {'citation_count': 220, 'doi': '10.1086/340358', 'metric': 0.6690324544906616, 'pubdate': 20020601, 'text': 'A faint, extended, and kinematically anomalous H I component is detected, showing up as extended wings of emission toward the systemic velocity in the H I line profiles.'},
       {'citation_count': 115, 'doi': '10.1086/185577', 'metric': 0.6650761365890503, 'pubdate': 19891101, 'text': "Nonnucleated dE's brighter than M_BT_ ~ -14.2 are distributed like spirals and irregulars."},
       {'citation_count': 134, 'doi': '10.1086/427275', 'metric': 0.6511155962944031, 'pubdate': 20050301, 'text': 'Two small ultraviolet enhancements were observed, differing in the amount of nonthermal broadening present.'},
       {'citation

In [5]:
# Convert only the first row's hits to a plain Python list; there is no need to
# materialise the whole column with `df.results.tolist()` just to read element 0.
first_hits = df.results.iloc[0].tolist()
# Print the type explicitly: only the last expression of a cell is displayed.
print(type(first_hits))
first_hits

[{'citation_count': 6,
  'doi': '10.1051/0004-6361/202450018',
  'metric': 0.7013970017433167,
  'pubdate': 20240801,
  'text': 'Observation of potential under-luminosity in some companions, suggesting possible companion binarity or atmospheric phenomena.'},
 {'citation_count': 220,
  'doi': '10.1086/340358',
  'metric': 0.6690324544906616,
  'pubdate': 20020601,
  'text': 'A faint, extended, and kinematically anomalous H I component is detected, showing up as extended wings of emission toward the systemic velocity in the H I line profiles.'},
 {'citation_count': 115,
  'doi': '10.1086/185577',
  'metric': 0.6650761365890503,
  'pubdate': 19891101,
  'text': "Nonnucleated dE's brighter than M_BT_ ~ -14.2 are distributed like spirals and irregulars."},
 {'citation_count': 134,
  'doi': '10.1086/427275',
  'metric': 0.6511155962944031,
  'pubdate': 20050301,
  'text': 'Two small ultraviolet enhancements were observed, differing in the amount of nonthermal broadening present.'},
 {'citati

In [6]:
# Expand the first query's hits (a sequence of dicts) into a table, one column per field.
first_query_hits = df["results"].iloc[0].tolist()
results_df = pd.DataFrame(first_query_hits)
results_df.head()

Unnamed: 0,citation_count,doi,metric,pubdate,text
0,6,10.1051/0004-6361/202450018,0.701397,20240801,Observation of potential under-luminosity in s...
1,220,10.1086/340358,0.669032,20020601,"A faint, extended, and kinematically anomalous..."
2,115,10.1086/185577,0.665076,19891101,Nonnucleated dE's brighter than M_BT_ ~ -14.2 ...
3,134,10.1086/427275,0.651116,20050301,Two small ultraviolet enhancements were observ...
4,13,10.1086/316553,0.647442,20000401,S0 galaxies with pronounced disk characteristi...


In [7]:
# Field names available on a single hit.
first_hit = df["results"].iloc[0][0]
first_hit.keys()

dict_keys(['citation_count', 'doi', 'metric', 'pubdate', 'text'])

In [None]:
from metrics import get_metric

class RankFuser:
    """
    Rerank search results by a weighted sum of scores from several metrics.

    Each metric is looked up by name with ``get_metric`` and is called as
    ``metric(query, results)``. The per-metric scores are multiplied by the
    weight configured for that metric, summed, and the results are sorted by
    that sum in descending order.
    """

    def __init__(self, config: dict[str, float]):
        """
        Initializes the RankFuser with a configuration dictionary that maps scoring function names to their weights.

        Args:
            config (dict[str, float]): A dictionary where keys are scoring function names and values are their respective weights.
        """
        self.config = config
        # Metrics and weights are kept as parallel lists, in the dict's insertion order.
        self.metrics = [get_metric(name) for name in config]
        self.weights = list(config.values())

    def rerank(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Rerank the results of every query in ``df``.

        Expects a dataframe with 2 columns, "record" and "results".
        - "record" is a dict representing a query.
        - "results" is a sequence of dicts (a list or a NumPy object array, as
          produced by reading the parquet file) containing the results to be
          reranked. A cell that already holds a DataFrame is accepted as well.

        Args:
            df (pd.DataFrame): One row per query.

        Returns:
            pd.DataFrame: One row per input row, again with "record", "results"
            columns. Each "record" cell is the query as a ``pd.Series`` and each
            "results" cell is the reranked results as a ``pd.DataFrame`` with an
            added "weighted_score" column.
        """
        queries = []
        reranked = []

        for record, results in zip(df["record"], df["results"]):
            query = pd.Series(record)
            queries.append(query)
            reranked.append(self._rerank_single(query, self._results_frame(results)))

        # Build the output once instead of growing it row by row:
        # DataFrame.append no longer exists in pandas 2.x and was quadratic anyway.
        return pd.DataFrame(
            {
                "record": self._object_column(queries),
                "results": self._object_column(reranked),
            }
        )

    @staticmethod
    def _results_frame(results) -> pd.DataFrame:
        """Turn one "results" cell into a DataFrame with one column per result field."""
        if isinstance(results, pd.DataFrame):
            return results.copy()
        # A NumPy object array of dicts passed straight to pd.DataFrame becomes a
        # single column of dicts; a plain list of dicts is expanded into columns.
        return pd.DataFrame(list(results))

    @staticmethod
    def _object_column(items: list):
        """Pack arbitrary Python objects into a 1-D object array, one object per cell."""
        import numpy as np  # local import: this notebook only imports pandas at the top

        column = np.empty(len(items), dtype=object)
        # Assign element by element so Series/DataFrame values are stored as-is
        # rather than being unpacked into extra dimensions.
        for position, item in enumerate(items):
            column[position] = item
        return column

    def _rerank_single(self, query: pd.Series, results: pd.DataFrame) -> pd.DataFrame:
        """
        Reranks the results DataFrame based on the weighted sum of scores from the configured metrics.

        The input ``results`` frame is left unmodified; the weighted score is
        added to a copy.

        Args:
            query (pd.Series): The query for which results are being reranked.
            results (pd.DataFrame): The DataFrame containing results to be reranked.

        Returns:
            pd.DataFrame: A new DataFrame with a "weighted_score" column, sorted
            by that column in descending order and with a fresh 0..n-1 index.
        """
        # Calculate scores for each metric
        scores = [metric(query, results) for metric in self.metrics]

        # Compute the weighted sum of scores (0 when no metrics are configured)
        weighted_scores = sum(weight * score for weight, score in zip(self.weights, scores))

        # assign() returns a new frame, so the caller's DataFrame is not mutated
        scored = results.assign(weighted_score=weighted_scores)

        # Sort by the weighted score in descending order; a stable sort keeps the
        # ordering deterministic when scores tie
        return scored.sort_values(
            "weighted_score", ascending=False, kind="stable"
        ).reset_index(drop=True)