In [1]:
import pandas as pd

# Load the saved search results. Each row pairs a query ("record" column) with
# the candidates retrieved for it ("results" column).
df = pd.read_parquet("experiments/results/test/search_results.parquet")
# Preview the first rows to confirm the two-column layout.
df.head()

Unnamed: 0,record,results
0,"{'citation_dois': ['10.1086/311563'], 'expande...","[{'citation_count': 220, 'doi': '10.1086/34035..."
1,"{'citation_dois': ['10.1086/368156'], 'expande...","[{'citation_count': 102, 'doi': '10.1086/36815..."
2,"{'citation_dois': ['10.1086/317785'], 'expande...","[{'citation_count': 125, 'doi': '10.1086/15541..."
3,{'citation_dois': ['10.48550/arXiv.astro-ph/97...,"[{'citation_count': 136, 'doi': '10.1086/13328..."
4,"{'citation_dois': ['10.1086/176071', '10.1086/...","[{'citation_count': 566, 'doi': '10.1086/18230..."


In [None]:
from metrics import Metric


class RankFuser:
    """
    A class that produces a weighted sum of scores from multiple scoring functions,
    then uses that combined score to rerank a set of results.

    Every configured metric is called as ``metric(query, results)``; its output is
    multiplied by the metric's weight and the products are summed into a
    ``weighted_score`` column, by which the results are sorted (highest first).
    """

    def __init__(self, config: dict[str, float]):
        """
        Initializes the RankFuser with a configuration dictionary that maps scoring function names to their weights.

        Args:
            config (dict[str, float]): A dictionary where keys are scoring function names
                (resolved through ``Metric.get_metric``) and values are their respective weights.
        """
        # Snapshot the mapping: metrics and weights are derived exactly once here, so a
        # later mutation of the caller's dict must not leave self.config out of sync.
        self.config = dict(config)
        self.metrics = [Metric.get_metric(name) for name in self.config]
        self.weights = list(self.config.values())

    def rerank(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Expects a dataframe with 2 columns, "record" and "results".
        - "record" is a dict (or pd.Series) representing a query.
        - "results" holds the results to be reranked: a numpy array or list of dicts,
          or an already-built pd.DataFrame (for example the output of an earlier
          ``rerank`` call).

        Returns:
            pd.DataFrame: The reranked results DataFrame, again with "record", "results" columns.
            Each "record" cell is a pd.Series and each "results" cell is a pd.DataFrame
            sorted by "weighted_score". An empty input yields an empty frame that still
            carries both columns.
        """
        rows = []

        for row in df.itertuples(index=False):
            query = pd.Series(row.record)
            results = self._as_results_frame(row.results)
            # Debug aid: shows which result fields are available to the metrics.
            print(f"Got columns: {results.columns}")
            reranked_results = self._rerank_single(query, results)
            rows.append({"record": query, "results": reranked_results})

        # Name the columns explicitly so an empty input still has the documented schema.
        return pd.DataFrame(rows, columns=["record", "results"])

    @staticmethod
    def _as_results_frame(raw) -> pd.DataFrame:
        """
        Converts one "results" cell into a DataFrame with one row per result.

        Args:
            raw: A pd.DataFrame (returned unchanged), an object exposing ``tolist()``
                such as a numpy array of dicts, or any other iterable of dicts.

        Returns:
            pd.DataFrame: The results as a DataFrame.
        """
        if isinstance(raw, pd.DataFrame):
            return raw
        records = raw.tolist() if hasattr(raw, "tolist") else list(raw)
        return pd.DataFrame(records)

    def _rerank_single(self, query: pd.Series, results: pd.DataFrame) -> pd.DataFrame:
        """
        Reranks the results DataFrame based on the weighted sum of scores from the configured metrics.

        The input frame is left untouched; the returned frame is a copy with an added
        "weighted_score" column. Results with equal scores keep their original relative order.

        Args:
            query (pd.Series): The query for which results are being reranked.
            results (pd.DataFrame): The DataFrame containing results to be reranked.

        Returns:
            pd.DataFrame: The reranked results DataFrame, indexed 0..n-1.
        """

        # Calculate scores for each metric
        scores = [metric(query, results) for metric in self.metrics]

        # TODO: also expose each metric's individual score as its own column.

        # Compute the weighted sum of scores
        weighted_scores = sum(weight * score for weight, score in zip(self.weights, scores))

        # assign() works on a copy, so the caller's frame never gains a column as a side
        # effect. A stable sort makes ties fall back to the incoming order.
        return (
            results.assign(weighted_score=weighted_scores)
            .sort_values("weighted_score", ascending=False, kind="stable")
            .reset_index(drop=True)
        )


# Build a fuser that ranks by the "recency" metric alone, with weight 1.0.
rf = RankFuser(
    config={
        "recency": 1.0,
    }
)
# Sanity check: the resolved metric objects and their matching weights.
print(rf.metrics)
print(rf.weights)

[<metrics.Recency object at 0x12d286f10>]
[1.0]


In [3]:
# Unpack the first row by hand: its candidates become a DataFrame and its
# query record becomes a Series.
single_results_df = pd.DataFrame(df.results.iloc[0].tolist())
single_record = pd.Series(df.record.iloc[0])
# single_results_df.head()

# Check what attribute access on a results column returns.
type(single_results_df.pubdate)

pandas.core.series.Series

In [4]:
# Work on a three-row sample so the printed output stays short.
test_df = df.head(3)
for row in test_df.itertuples(index=False):
    # Only the query record is inspected here, so the candidates are not unpacked.
    record = pd.Series(row.record)
    print(record)
    print("============")

citation_dois                                         [10.1086/311563]
expanded_query                                                    None
pubdate                                                       20080601
resolved_bibcodes                                [1998ApJ...504L..23S]
sent_cit_masked      ([REF]), do show unusual, faint features in th...
sent_idx                                                           159
sent_no_cit          (), do show unusual, faint features in their s...
sent_original        (Shang et al. 1998), do show unusual, faint fe...
source_doi                                   10.1007/s00159-008-0010-0
dtype: object
citation_dois                                         [10.1086/368156]
expanded_query                                                    None
pubdate                                                       20040901
resolved_bibcodes                                [2003ApJ...583L..83N]
sent_cit_masked      [REF] studied the dust emission at 450 μm 

In [13]:
# Rerank the three-row sample with the recency-only fuser and preview the result.
reranked = rf.rerank(test_df)
reranked.head()

Got columns: Index(['citation_count', 'doi', 'metric', 'pubdate', 'text'], dtype='object')
Got columns: Index(['citation_count', 'doi', 'metric', 'pubdate', 'text'], dtype='object')
Got columns: Index(['citation_count', 'doi', 'metric', 'pubdate', 'text'], dtype='object')


Unnamed: 0,record,results
0,citation_dois ...,citation_count doi...
1,citation_dois ...,citation_count ...
2,citation_dois ...,citation_count ...


In [16]:
# Print the reranked result frames (one DataFrame per query) as plain text.
# NOTE(review): this dumps every frame in full; consider showing a single
# frame's .head() instead to keep the output readable.
res = reranked.results
print(res.tolist())

[    citation_count                         doi    metric   pubdate  \
0              314              10.1086/533425  0.630813  20080501   
1              342              10.1086/523853  0.632481  20071201   
2              349  10.1051/0004-6361:20064883  0.645525  20060801   
3              805              10.1086/430104  0.644438  20050601   
4              134              10.1086/427275  0.651116  20050301   
5               90              10.1086/344435  0.640089  20021201   
6              291              10.1086/342146  0.632593  20020901   
7              220              10.1086/340358  0.669032  20020601   
8              194              10.1086/319033  0.639901  20010201   
9               13              10.1086/316553  0.647442  20000401   
10             332              10.1086/117934  0.637092  19960501   
11             167              10.1086/171257  0.634246  19920501   
12             197              10.1086/116144  0.633320  19920401   
13             447 

In [12]:
# Eyeball the ordering: print each query's pubdate and text, followed by the
# pubdates of its reranked results in their new order.
for row in reranked.itertuples():
    record = pd.Series(row.record)
    results = pd.DataFrame(row.results)
    print(f"Query: {record.pubdate},{record.sent_no_cit}")
    print(f"Results: {results.pubdate.tolist()}")


Query: 20080601,(), do show unusual, faint features in their surroundings.
Results: [20080501, 20071201, 20060801, 20050601, 20050301, 20021201, 20020901, 20020601, 20010201, 20000401, 19960501, 19920501, 19920401, 19911001, 19891101, 19871101, 19870301, 19841201, 19791001, 19790601]
Query: 20040901, studied the dust emission at 450 μm in the galactic plane, and found the magnetic field to be well aligned parallel with the galactic plane (their Fig. 1), within 30 arcmin in longitude and 10 arcmin in latitude of the GC.
Results: [20031101, 20031101, 20030201, 20030201, 20030201, 20021001, 20021001, 20021001, 20000701, 20000501, 20000401, 20000201, 20000101, 20000101, 19991101, 19971201, 19960301, 19960301, 19920201, 19870301]
Query: 20190801,The momentum per unit mass of stars formed delivered by protostellar outflows to their surroundings is of order the escape speed from a protostellar surface (e.g.,  ).
Results: [20190401, 20190301, 20160301, 20150201, 20150201, 20140301, 20120901, 2