# Go Within Python

This notebook checks how to run the Go multi-cluster experiment from Python, parse the JSON it prints, and collect the full results for each query.

In [1]:
import subprocess
import asyncio

In [2]:
test_query = "example search text"

# Fixed part of the argument vector; none of these tokens contain spaces, so
# splitting on whitespace yields exactly one list element per argument.
cmd = "go run . -search_config ../configs/test_search.json multi-cluster-experiment".split()
# The query is appended separately so it stays a single argument even though
# it contains spaces.
cmd.append(test_query)

In [3]:
# Run the Go program from the current directory, capturing both output streams
# as bytes. check=True raises CalledProcessError on a non-zero exit status.
process = subprocess.run(
    cmd,
    cwd="./",
    check=True,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)

In [12]:
# Keep only the final line of the captured stdout, decoded as UTF-8 text.
out = str(process.stdout, "utf-8").splitlines()[-1]

In [13]:
import json

# Decode the last line of `out` as JSON (pop() on the temporary list returns
# its final element).
results = json.loads(out.splitlines().pop())

In [14]:
# Print every entry of "all_results" on its own line to inspect its fields.
for cluster_entry in results["all_results"]:
    print(cluster_entry)

{'cluster_rank': 1, 'cluster_index': 24, 'results': [{'score': 21765, 'url': 'D802805'}, {'score': 20752, 'url': 'D3347722'}, {'score': 20325, 'url': 'D3087368'}, {'score': 19755, 'url': 'D3416613'}, {'score': 19468, 'url': 'D3118914'}], 'perf_up': 32.38997268676758, 'perf_down': 1.2493066787719727}
{'cluster_rank': 2, 'cluster_index': 12, 'results': [{'score': 24428, 'url': 'D709754'}, {'score': 21887, 'url': 'D1592000'}, {'score': 21586, 'url': 'D2222332'}, {'score': 21580, 'url': 'D360538'}, {'score': 19759, 'url': 'D1805557'}], 'perf_up': 32.389970779418945, 'perf_down': 1.249307632446289}
{'cluster_rank': 3, 'cluster_index': 15, 'results': [{'score': 35049, 'url': 'D3339009'}, {'score': 11616, 'url': 'D2338213'}, {'score': 11616, 'url': 'D803785'}, {'score': 11616, 'url': 'D3360795'}, {'score': 11616, 'url': 'D1191864'}], 'perf_up': 32.38997173309326, 'perf_down': 1.2493114471435547}
{'cluster_rank': 4, 'cluster_index': 5, 'results': [{'score': 19290, 'url': 'D1265153'}, {'score':

In [15]:
query_id = "01"

# Flatten the per-cluster entries into one record keyed by cluster rank:
# "cluster_<rank>_res" holds that cluster's result list and
# "cluster_<rank>_total_comm" the sum of its perf_up and perf_down values.
res_dict = {"query_id": query_id}
for entry in results["all_results"]:
    rank = entry["cluster_rank"]
    res_dict.update(
        {
            f"cluster_{rank}_res": entry["results"],
            f"cluster_{rank}_total_comm": entry["perf_up"] + entry["perf_down"],
        }
    )

In [16]:
# Display the flattened record built in the previous cell.
res_dict

{'query_id': '01',
 'cluster_1_res': [{'score': 21765, 'url': 'D802805'},
  {'score': 20752, 'url': 'D3347722'},
  {'score': 20325, 'url': 'D3087368'},
  {'score': 19755, 'url': 'D3416613'},
  {'score': 19468, 'url': 'D3118914'}],
 'cluster_1_total_comm': 33.63927936553955,
 'cluster_2_res': [{'score': 24428, 'url': 'D709754'},
  {'score': 21887, 'url': 'D1592000'},
  {'score': 21586, 'url': 'D2222332'},
  {'score': 21580, 'url': 'D360538'},
  {'score': 19759, 'url': 'D1805557'}],
 'cluster_2_total_comm': 33.639278411865234,
 'cluster_3_res': [{'score': 35049, 'url': 'D3339009'},
  {'score': 11616, 'url': 'D2338213'},
  {'score': 11616, 'url': 'D803785'},
  {'score': 11616, 'url': 'D3360795'},
  {'score': 11616, 'url': 'D1191864'}],
 'cluster_3_total_comm': 33.639283180236816,
 'cluster_4_res': [{'score': 19290, 'url': 'D1265153'},
  {'score': 17556, 'url': 'D3345046'},
  {'score': 17245, 'url': 'D760065'},
  {'score': 17119, 'url': 'D2316368'},
  {'score': 16933, 'url': 'D2129423'}],


In [95]:
def parse_go_output(output, query_id):
    """Parse the output from the Go program and return a structured dictionary.

    Only the last non-blank line of ``output`` is decoded as JSON; any lines
    before it, and any blank lines after it, are ignored.

    Args:
        output: Text captured from the Go program. Its last non-blank line
            must be a JSON object with an ``"all_results"`` list whose entries
            carry ``"cluster_rank"``, ``"results"``, ``"perf_up"`` and
            ``"perf_down"``.
        query_id: Identifier stored under the ``"query_id"`` key of the result.

    Returns:
        A dict with ``"query_id"`` plus, for every entry of ``"all_results"``,
        ``"cluster_<rank>_res"`` (the entry's ``"results"`` value) and
        ``"cluster_<rank>_total_comm"`` (``perf_up + perf_down``).

    Raises:
        ValueError: If ``output`` contains no non-blank line, or if that line
            is not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
        KeyError: If an expected key is missing from the decoded JSON.
    """
    # Walk backwards so trailing blank lines do not hide the JSON payload.
    payload_line = next(
        (line for line in reversed(output.splitlines()) if line.strip()),
        None,
    )
    if payload_line is None:
        raise ValueError("Go program output is empty; no JSON line to parse")

    results = json.loads(payload_line)
    structured_results = {"query_id": query_id}
    for cluster_res in results["all_results"]:
        rank = cluster_res["cluster_rank"]
        structured_results[f"cluster_{rank}_res"] = cluster_res["results"]
        structured_results[f"cluster_{rank}_total_comm"] = (
            cluster_res["perf_up"] + cluster_res["perf_down"]
        )
    return structured_results

In [96]:
# Check the function on the captured output line; the keys should match the
# record built by hand in the earlier cell.
parse_go_output(out, "01")

{'query_id': '01',
 'cluster_1_res': [{'score': 21765, 'url': 'D802805'},
  {'score': 20752, 'url': 'D3347722'},
  {'score': 20325, 'url': 'D3087368'},
  {'score': 19755, 'url': 'D3416613'},
  {'score': 19468, 'url': 'D3118914'}],
 'cluster_1_total_comm': 33.639320373535156,
 'cluster_2_res': [{'score': 24428, 'url': 'D709754'},
  {'score': 21887, 'url': 'D1592000'},
  {'score': 21586, 'url': 'D2222332'},
  {'score': 21580, 'url': 'D360538'},
  {'score': 19759, 'url': 'D1805557'}],
 'cluster_2_total_comm': 33.639235496520996,
 'cluster_3_res': [{'score': 35049, 'url': 'D3339009'},
  {'score': 11616, 'url': 'D2338213'},
  {'score': 11616, 'url': 'D803785'},
  {'score': 11616, 'url': 'D3360795'},
  {'score': 11616, 'url': 'D1191864'}],
 'cluster_3_total_comm': 33.63926410675049,
 'cluster_4_res': [{'score': 19290, 'url': 'D1265153'},
  {'score': 17556, 'url': 'D3345046'},
  {'score': 17245, 'url': 'D760065'},
  {'score': 17119, 'url': 'D2316368'},
  {'score': 16933, 'url': 'D2129423'}],


In [120]:
import pandas as pd

cluster_seach_num = 4

# Empty frame with the expected layout: the query id, then one "res" column
# per cluster rank (1-based), then one "total_comm" column per cluster rank.
test_df = pd.DataFrame(
    columns=[
        "query_id",
        *(
            f"cluster_{rank}_{suffix}"
            for suffix in ("res", "total_comm")
            for rank in range(1, cluster_seach_num + 1)
        ),
    ]
)

In [121]:
# Building a frame from the parsed dict broadcasts the scalar values
# (query_id, total_comm) across the result lists, giving one row per result.
new_rows = pd.DataFrame(parse_go_output(out, "01"))
# NOTE: re-running this cell appends the same rows again.
test_df = pd.concat([test_df, new_rows])
test_df

Unnamed: 0,query_id,cluster_1_res,cluster_2_res,cluster_3_res,cluster_4_res,cluster_1_total_comm,cluster_2_total_comm,cluster_3_total_comm,cluster_4_total_comm
0,1,"{'score': 21765, 'url': 'D802805'}","{'score': 24428, 'url': 'D709754'}","{'score': 35049, 'url': 'D3339009'}","{'score': 19290, 'url': 'D1265153'}",33.63932,33.639235,33.639264,33.639318
1,1,"{'score': 20752, 'url': 'D3347722'}","{'score': 21887, 'url': 'D1592000'}","{'score': 11616, 'url': 'D2338213'}","{'score': 17556, 'url': 'D3345046'}",33.63932,33.639235,33.639264,33.639318
2,1,"{'score': 20325, 'url': 'D3087368'}","{'score': 21586, 'url': 'D2222332'}","{'score': 11616, 'url': 'D803785'}","{'score': 17245, 'url': 'D760065'}",33.63932,33.639235,33.639264,33.639318
3,1,"{'score': 19755, 'url': 'D3416613'}","{'score': 21580, 'url': 'D360538'}","{'score': 11616, 'url': 'D3360795'}","{'score': 17119, 'url': 'D2316368'}",33.63932,33.639235,33.639264,33.639318
4,1,"{'score': 19468, 'url': 'D3118914'}","{'score': 19759, 'url': 'D1805557'}","{'score': 11616, 'url': 'D1191864'}","{'score': 16933, 'url': 'D2129423'}",33.63932,33.639235,33.639264,33.639318


In [122]:
# Append the same parsed output under a second query id to check that rows
# from several queries stack up. Index labels are not reset by concat, so they
# repeat for each appended block.
new_rows = pd.DataFrame(parse_go_output(out, "02"))
test_df = pd.concat([test_df, new_rows])
test_df

Unnamed: 0,query_id,cluster_1_res,cluster_2_res,cluster_3_res,cluster_4_res,cluster_1_total_comm,cluster_2_total_comm,cluster_3_total_comm,cluster_4_total_comm
0,1,"{'score': 21765, 'url': 'D802805'}","{'score': 24428, 'url': 'D709754'}","{'score': 35049, 'url': 'D3339009'}","{'score': 19290, 'url': 'D1265153'}",33.63932,33.639235,33.639264,33.639318
1,1,"{'score': 20752, 'url': 'D3347722'}","{'score': 21887, 'url': 'D1592000'}","{'score': 11616, 'url': 'D2338213'}","{'score': 17556, 'url': 'D3345046'}",33.63932,33.639235,33.639264,33.639318
2,1,"{'score': 20325, 'url': 'D3087368'}","{'score': 21586, 'url': 'D2222332'}","{'score': 11616, 'url': 'D803785'}","{'score': 17245, 'url': 'D760065'}",33.63932,33.639235,33.639264,33.639318
3,1,"{'score': 19755, 'url': 'D3416613'}","{'score': 21580, 'url': 'D360538'}","{'score': 11616, 'url': 'D3360795'}","{'score': 17119, 'url': 'D2316368'}",33.63932,33.639235,33.639264,33.639318
4,1,"{'score': 19468, 'url': 'D3118914'}","{'score': 19759, 'url': 'D1805557'}","{'score': 11616, 'url': 'D1191864'}","{'score': 16933, 'url': 'D2129423'}",33.63932,33.639235,33.639264,33.639318
0,2,"{'score': 21765, 'url': 'D802805'}","{'score': 24428, 'url': 'D709754'}","{'score': 35049, 'url': 'D3339009'}","{'score': 19290, 'url': 'D1265153'}",33.63932,33.639235,33.639264,33.639318
1,2,"{'score': 20752, 'url': 'D3347722'}","{'score': 21887, 'url': 'D1592000'}","{'score': 11616, 'url': 'D2338213'}","{'score': 17556, 'url': 'D3345046'}",33.63932,33.639235,33.639264,33.639318
2,2,"{'score': 20325, 'url': 'D3087368'}","{'score': 21586, 'url': 'D2222332'}","{'score': 11616, 'url': 'D803785'}","{'score': 17245, 'url': 'D760065'}",33.63932,33.639235,33.639264,33.639318
3,2,"{'score': 19755, 'url': 'D3416613'}","{'score': 21580, 'url': 'D360538'}","{'score': 11616, 'url': 'D3360795'}","{'score': 17119, 'url': 'D2316368'}",33.63932,33.639235,33.639264,33.639318
4,2,"{'score': 19468, 'url': 'D3118914'}","{'score': 19759, 'url': 'D1805557'}","{'score': 11616, 'url': 'D1191864'}","{'score': 16933, 'url': 'D2129423'}",33.63932,33.639235,33.639264,33.639318


In [123]:
import ir_datasets

# Load the "msmarco-passage/dev/small" dataset handle from ir_datasets.
dataset = ir_datasets.load("msmarco-passage/dev/small")

In [126]:
# Peek at the first query record (fields: query_id, text). On this run the
# first access downloaded the ~1 GB source archive, as the log below shows.
next(iter(dataset.queries_iter()))

[INFO] Please confirm you agree to the MSMARCO data usage agreement found at <http://www.msmarco.org/dataset.aspx>
[INFO] If you have a local copy of https://msmarco.z22.web.core.windows.net/msmarcoranking/collectionandqueries.tar.gz, you can symlink it here to avoid downloading it again: /Users/edable-heath/.ir_datasets/downloads/31644046b18952c1386cd4564ba2ae69
[INFO] [starting] https://msmarco.z22.web.core.windows.net/msmarcoranking/collectionandqueries.tar.gz
[INFO] [finished] https://msmarco.z22.web.core.windows.net/msmarcoranking/collectionandqueries.tar.gz: [12:12] [1.06GB] [1.44MB/s]
                                                                                                                

GenericQuery(query_id='1048585', text="what is paula deen's brother")

In [129]:
# Look up the relevance judgements recorded for the first query id.
# Presumably the result maps document id -> relevance label; confirm against
# the ir_datasets documentation.
dataset.qrels_dict().get("1048585")

{'7187158': 1}

In [130]:
# Materialise every query into a list so records can be indexed by position.
qlist = list(dataset.queries_iter())

In [134]:
# Position 1 of a query record is its text.
qlist[0][1]

"what is paula deen's brother"

In [135]:
# Position 0 of a query record is its query_id.
qlist[0][0]

'1048585'