In [1]:
%load_ext autoreload
%autoreload 2
from dotenv import load_dotenv
load_dotenv()

True

## Prereqs

Set your OpenAI key (recommended) or TogetherAI key in llm_forecasting/cofig/keys.py, both of which provide free credits on sign up for users. 

## Import packages and load data

In [2]:
# Standard library imports
import pickle

# Third-party library imports
import pandas as pd

# Local application/library specific imports
from config.constants import PROMPT_DICT
from utils.data_utils import get_formatted_data
from utils.visualize_utils import visualize_all, visualize_all_ensemble
import ranking
import summarize
import ensemble

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
with open("sample_questions.pickle", "rb") as file:
    sample_qs = pickle.load(file)

In [4]:
sample_qs = sample_qs[:2]

In [5]:
sample_qs[0]

{'active_state': 'RESOLVED',
 'url': 'https://www.metaculus.com/api2/questions/17316/',
 'page_url': '/questions/17316/ukraine-retakes-polohy-by-1-oct-2023/',
 'id': 17316,
 'author': 133711,
 'author_name': 'Stilico',
 'author_id': 133711,
 'title_short': 'Ukraine retakes Polohy by 1 Oct. 2023?',
 'group_label': '',
 'resolution': 0.0,
 'resolved_option': None,
 'created_time': '2023-06-01T21:50:32.810195Z',
 'effected_close_time': '2023-09-30T20:59:00Z',
 'possibilities': {'type': 'binary'},
 'scoring': {},
 'type': 'forecast',
 'user_perms': {'PREDICT': True,
  'RESOLVE': False,
  'COMMENT_READ': True,
  'COMMENT_POST': True,
  'COMMENT_EDIT': True,
  'COMMENT_MOD': False,
  'EDIT_DRAFT_CONTENT': False,
  'EDIT_PENDING_CONTENT': False,
  'EDIT_UPCOMING_CONTENT': False,
  'EDIT_LIVE_CONTENT': False,
  'EDIT_CATEGORIES': False,
  'CHANGE_DRAFT_STATUS': False,
  'CHANGE_PENDING_STATUS': False,
  'CHANGE_ACTIVE_STATUS': False,
  'CROSSPOST': False,
  'VIEW_COMMUNITY_PRED': True,
  'VIEW

In [6]:
with open("sample_questions.pickle", "wb") as file:
    pickle.dump(sample_qs, file)

In [7]:
formatted_data, raw_data = get_formatted_data(
    "",
    retrieval_index=1,
    num_retrievals=5,
    questions_after="2022",
    return_raw_question_data=True,
    data=sample_qs,
)

In [8]:
# For this demo, we'll evaluate the first question.

question = formatted_data["question_list"][0]
background_info = formatted_data["background_list"][0]
resolution_criteria = formatted_data["resolution_criteria_list"][0]
answer = formatted_data["answer_list"][0]
question_dates = formatted_data["question_dates_list"][0]
retrieval_dates = formatted_data["retrieval_dates_list"][0]
urls_in_background = formatted_data["urls_in_background_list"][0]

In [9]:
print("Question:", question)
print("Background:", background_info)

Question: Will Ukraine retake Polohy by the 1st of October, 2023?
Background: As of 1. June the Ukrainian spring counteroffensive has not yet happened. In preparation for this counter offensive, the Ukrainian army allegedly has formed several new battalions, several of them with western tanks and AFVs. Accordingly, the Russian side has reacted to this threat by building defensive fortifications in territories it controls. The counteroffensive itself seems to be quite imminent or might have already even have begun.
As the city of Polohy is about 20 km from the frontline and in the direction towards which it is "most obvious" the counteroffensive would take place towards it is apparent to ask - will the Ukrainian army manage to retake Polohy with this summer offensive?


## Retrieval

In [10]:
RETRIEVAL_CONFIG = {
    "NUM_SEARCH_QUERY_KEYWORDS": 3,
    "MAX_WORDS_NEWSCATCHER": 5,
    "MAX_WORDS_GNEWS": 8,
    "SEARCH_QUERY_MODEL_NAME": "gpt-4-1106-preview",
    "SEARCH_QUERY_TEMPERATURE": 0.0,
    "SEARCH_QUERY_PROMPT_TEMPLATES": [
        PROMPT_DICT["search_query"]["0"],
        PROMPT_DICT["search_query"]["1"],
    ],
    "NUM_ARTICLES_PER_QUERY": 5,
    "SUMMARIZATION_MODEL_NAME": "gpt-3.5-turbo-1106",
    "SUMMARIZATION_TEMPERATURE": 0.2,
    "SUMMARIZATION_PROMPT_TEMPLATE": PROMPT_DICT["summarization"]["9"],
    "NUM_SUMMARIES_THRESHOLD": 10,
    "PRE_FILTER_WITH_EMBEDDING": True,
    "PRE_FILTER_WITH_EMBEDDING_THRESHOLD": 0.32,
    "RANKING_MODEL_NAME": "gpt-3.5-turbo-1106",
    "RANKING_TEMPERATURE": 0.0,
    "RANKING_PROMPT_TEMPLATE": PROMPT_DICT["ranking"]["0"],
    "RANKING_RELEVANCE_THRESHOLD": 4,
    "RANKING_COSINE_SIMILARITY_THRESHOLD": 0.5,
    "SORT_BY": "date",
    "RANKING_METHOD": "llm-rating",
    "RANKING_METHOD_LLM": "title_250_tokens",
    "NUM_SUMMARIES_THRESHOLD": 20,
    "EXTRACT_BACKGROUND_URLS": True,
}

In [11]:
(
    ranked_articles,
    all_articles,
    search_queries_list_gnews,
    search_queries_list_nc,
) = await ranking.retrieve_summarize_and_rank_articles(
    question,
    background_info,
    resolution_criteria,
    retrieval_dates,
    urls=urls_in_background,
    config=RETRIEVAL_CONFIG,
    return_intermediates=True,
)

INFO:ranking:Finding 3 search query keywords via LLM...
INFO:openai._base_client:Retrying request to /chat/completions in 0.769620 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 0.984550 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 0.830843 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 0.921422 seconds
INFO:ranking:Search queries for NC: ['Ukrainian military capabilities 2023', 'Russian defenses Polohy', 'Will Ukraine retake Polohy by the 1st of October, 2023?', 'international military aid Ukraine', 'Russian military fortifications Ukraine', 'Ukraine counteroffensive progress', 'Ukraine counteroffensive status']
INFO:ranking:Search queries for GNews: ['Military analyst predictions Ukraine Polohy', 'Russian defenses Polohy status 2023', 'Will Ukraine retake Polohy by the 1st of October, 2023?', 'Russian military fortifications Ukraine 2023', 'Ukrainian counteroffensive progress June 2023', 'Western milit

In [12]:
all_summaries = summarize.concat_summaries(
    ranked_articles[: RETRIEVAL_CONFIG["NUM_SUMMARIES_THRESHOLD"]]
)

In [13]:
print(all_summaries[:3000], "...")

---
ARTICLES
[1] The Russian fortifications Ukraine needs to break through in its counter-offensive (published on 2023-06-11)
Summary: Ukraine's upcoming counter-offensive faces a significant challenge in breaking through Russia's extensive defensive fortifications. New satellite images reveal minefields, anti-tank ditches, and trenches spanning over 600 miles of front lines. Russian forces have constructed a three-line system of fortifications, including deep trenches, dragons' teeth, and wire track entanglements. The extensive defensive works have impressed Ukrainian troops, indicating that the counter-offensive may not be as straightforward as anticipated. With the counter-offensive imminent, the likelihood of Ukraine retaking Polohy by October 1, 2023, remains uncertain given the formidable Russian fortifications and the potential difficulty in breaking through them.

[2] Ukraine is counter-attacking in multiple directions, with mixed results (published on 2023-06-11)
Summary: Ukra

## Reasoning

In [14]:
REASONING_CONFIG = {
    "BASE_REASONING_MODEL_NAMES": ["gpt-4-1106-preview", "gpt-4-1106-preview"],
    "BASE_REASONING_TEMPERATURE": 1.0,
    "BASE_REASONING_PROMPT_TEMPLATES": [
        [
            PROMPT_DICT["binary"]["scratch_pad"]["1"],
            PROMPT_DICT["binary"]["scratch_pad"]["2"],
        ],
        [
            PROMPT_DICT["binary"]["scratch_pad"]["new_3"],
            PROMPT_DICT["binary"]["scratch_pad"]["new_6"],
        ],
    ],
    "ALIGNMENT_MODEL_NAME": "gpt-3.5-turbo-1106",
    "ALIGNMENT_TEMPERATURE": 0,
    "ALIGNMENT_PROMPT": PROMPT_DICT["alignment"]["0"],
    "AGGREGATION_METHOD": "meta",
    "AGGREGATION_PROMPT_TEMPLATE": PROMPT_DICT["meta_reasoning"]["0"],
    "AGGREGATION_TEMPERATURE": 0.2,
    "AGGREGATION_MODEL_NAME": "gpt-4",
    "AGGREGATION_WEIGTHTS": None,
}

In [15]:
today_to_close_date = [retrieval_dates[1], question_dates[1]]
ensemble_dict = await ensemble.meta_reason(
    question=question,
    background_info=background_info,
    resolution_criteria=resolution_criteria,
    today_to_close_date_range=today_to_close_date,
    retrieved_info=all_summaries,
    reasoning_prompt_templates=REASONING_CONFIG["BASE_REASONING_PROMPT_TEMPLATES"],
    base_model_names=REASONING_CONFIG["BASE_REASONING_MODEL_NAMES"],
    base_temperature=REASONING_CONFIG["BASE_REASONING_TEMPERATURE"],
    aggregation_method=REASONING_CONFIG["AGGREGATION_METHOD"],
    answer_type="probability",
    weights=REASONING_CONFIG["AGGREGATION_WEIGTHTS"],
    meta_model_name=REASONING_CONFIG["AGGREGATION_MODEL_NAME"],
    meta_prompt_template=REASONING_CONFIG["AGGREGATION_PROMPT_TEMPLATE"],
    meta_temperature=REASONING_CONFIG["AGGREGATION_TEMPERATURE"],
)

INFO:model_eval:Finished 2 base reasonings generated by gpt-4-1106-preview
INFO:model_eval:Finished 2 base reasonings generated by gpt-4-1106-preview


## Analysis

In [18]:
ensemble_dict

{'base_reasonings': [['Additional Information:\nAs of my knowledge cutoff date in March 2023, the situation on the ground can change rapidly, and various unforeseen events could affect the outcome of military operations, including political decisions, international support, and the strategic situation on the frontline.\n\nReasons why the answer might be no:\n1. The extensive Russian defensive fortifications described in the articles suggest that breaking through these defenses could be challenging for Ukrainian forces.\n2. There is an acknowledgment from U.S. and European officials that pushing all Russian forces out of Ukraine is unlikely, indicating the high level of difficulty the counteroffensive faces.\n3. The Ukrainian offensive thus far has shown mixed results, which suggests that progress may be slower than needed to achieve the control of Polohy by the 1st of October.\n\nStrength of reasons for no:\n1. The strong and elaborate defensive positions built by the Russians are typi

In [16]:
# Compute brier score (base_predictions is a list of lists of
# probabilities)
base_brier_scores = []
# For each sublist (corresponding to a base model name)
for base_predictions in ensemble_dict["base_predictions"]:
    base_brier_scores.append(
        [(base_prediction - answer) ** 2 for base_prediction in base_predictions]
    )
    
print(base_brier_scores)

[[0.12249999999999998, 0.25], [0.2025, 0.16000000000000003]]


In [19]:
# Compute brier score of ensembled prediction

print(ensemble_dict["meta_prediction"])

0.4


In [20]:
# Visualization (draw the HTML)
base_html = visualize_all(
    question_data=raw_data[0],
    retrieval_dates=retrieval_dates,
    search_queries_gnews=search_queries_list_gnews,
    search_queries_nc=search_queries_list_nc,
    all_articles=all_articles,
    ranked_articles=ranked_articles,
    all_summaries=all_summaries,
    model_names=REASONING_CONFIG["BASE_REASONING_MODEL_NAMES"],
    base_reasoning_prompt_templates=REASONING_CONFIG[
        "BASE_REASONING_PROMPT_TEMPLATES"
    ],
    base_reasoning_full_prompts=ensemble_dict["base_reasoning_full_prompts"],
    base_reasonings=ensemble_dict["base_reasonings"],
    base_predictions=ensemble_dict["base_predictions"],
    base_brier_scores=base_brier_scores,
)
meta_html = visualize_all_ensemble(
    question_data=raw_data[0],
    ranked_articles=ranked_articles,
    all_articles=all_articles,
    search_queries_gnews=search_queries_list_gnews,
    search_queries_nc=search_queries_list_nc,
    retrieval_dates=retrieval_dates,
    meta_reasoning=ensemble_dict["meta_reasoning"],
    meta_full_prompt=ensemble_dict["meta_prompt"],
    meta_prediction=ensemble_dict["meta_prediction"],
)

In [21]:
base_file_path = "sample_q_base_output.html"
meta_file_path = "sample_q_meta_output.html"

with open(base_file_path, "w") as base_file, open(meta_file_path, "w") as meta_file:
    base_file.write(base_html)
    meta_file.write(meta_html)