# Community alignment of training questions

## Imports

In [1]:
%matplotlib inline
from pathlib import Path

import matplotlib.pyplot as plt

## Question sample

In [2]:
# Sampling configuration handed to gather_questions below.
# Presumably the tuple bounds the question ids to fetch -- confirm against gather_questions.
num_questions = 604, 50000
perennial = live = False

In [3]:
from gather_questions import gather_questions
# Fetch the question sample described by the configuration cell above.
# The call returns a pair: the question objects used throughout this notebook,
# and `df` (presumably a tabular view of the same questions -- TODO confirm).
questions, df = gather_questions(num_questions, perennial, live)

Got 844 questions


In [4]:
# Subset of the sample whose API payload declares the question type 'binary'.
questions_binary = [
    q for q in questions
    if q.api_json['question']['type'] == 'binary'
]

In [5]:
# Subset of the sample whose API payload declares the question type 'multiple_choice'.
questions_multiple_choice = [
    q for q in questions
    if q.api_json['question']['type'] == 'multiple_choice'
]

In [6]:
# Subset of the sample whose API payload declares the question type 'numeric'.
questions_numeric = [
    q for q in questions
    if q.api_json['question']['type'] == 'numeric'
]

In [7]:
# Sizes of the three per-type subsets, in the order binary, multiple choice, numeric.
list(map(len, (questions_binary, questions_multiple_choice, questions_numeric)))

[684, 24, 136]

## Community forecast

In [8]:
from community_forecast import *

In [9]:
# Community forecast for every sampled question, keyed by the question's API id.
id_to_forecast = {
    q.api_json['id']: community_forecast(q)
    for q in questions
}

In [43]:
# Reverse lookup from API id to the question object itself.
id_to_question = dict((q.api_json['id'], q) for q in questions)

In [10]:
# Spot-check: community forecast for the first binary question (displays a single number).
community_forecast(questions_binary[0])

0.3

In [11]:
# Spot-check: community forecast for the first multiple-choice question
# (displays a dict mapping each option label to a number).
community_forecast(questions_multiple_choice[0])

{'Repealed fully': 0.10645192307692308,
 'Reduced incentives': 0.4792617521367521,
 'Neither repealed nor reduced': 0.41428632478632477}

In [12]:
# Spot-check: community forecast for the first numeric question
# (displays a dict keyed by 10/20/40/60/80/90 -- presumably percentiles, confirm in community_forecast).
community_forecast(questions_numeric[0])

{10: 10526.717731180133,
 20: 16559.874704040678,
 40: 46037.69411359078,
 60: 676388.0718683149,
 80: 4783197.661904035,
 90: 4956904.492983556}

## 000 forecast

In [13]:
import glob

# Paths of the stored forecast write-ups, one markdown file per question.
# glob returns matches in arbitrary (filesystem-dependent) order, so sort the
# listing to keep fns[0] and everything derived from it stable across runs.
fns = sorted(glob.glob('forecast_community/*.md'))

In [14]:
# Inspect the concrete class of a numeric question; extract_only_forecast below dispatches on these classes.
type(questions_numeric[0])

forecasting_tools.data_models.questions.NumericQuestion

In [15]:
# Scratch value for interactive inspection of one path; raises IndexError if no forecast files exist.
# The comprehension in the next cell binds its own `fn`, so it does not read this one.
fn = fns[0]

In [16]:
# Question ids recovered from the forecast file names ('<dir>/<id>.md' -> <id>).
# Path.stem takes the final path component without its suffix, so this does not
# depend on the path separator or on how many directories precede the file.
ids = [int(Path(fn).stem) for fn in fns]

In [17]:
# Full text of every stored forecast, keyed by question id.
# Path.read_text opens, reads and closes each file, so no handles are left open
# (the bare open(...).read() it replaces never closed them).
forecasts = {
    forecast_id: Path(f'forecast_community/{forecast_id}.md').read_text()
    for forecast_id in ids
}

In [26]:
# Ids of all questions for which a community forecast was computed.
community_ids = list(id_to_forecast)

In [28]:
# Confirm the community ids are ints, i.e. the same key type as the ids parsed from file names.
type(community_ids[0])

int

In [29]:
# Ids of all questions that already have a stored forecast file.
forecast_ids = list(forecasts)

In [39]:
# Ids that have both a stored forecast and a community forecast.
# Sorted (like `missing` further down) so that iteration order is stable
# instead of depending on set internals.
done = sorted(set(forecast_ids).intersection(community_ids))

In [72]:
# Pick one question to inspect by hand.
# NOTE: this rebinds the notebook-level name `id`, shadowing the builtin;
# later cells read this global, so it is left as is.
id = 26327
question = id_to_question[id]

In [76]:
# Show the raw stored forecast text for the selected question.
forecasts[id]

'The measles outbreak in the United States in 2025 is unfolding against a backdrop of rising case counts, persistent clusters in under-vaccinated communities, and significant concerns about the sustainability of public health containment. As of late May, there are 1,046 confirmed cases distributed across 30 states, already marking this year as the second-highest annual total in a quarter-century and far surpassing previous years except for the 2019 peak. The majority of cases—over 90%—are associated with discrete outbreaks, with the largest centered in Texas, New Mexico, and Oklahoma, collectively accounting for approximately 82% of all cases so far. Most affected individuals are unvaccinated or have unknown vaccination status, leaving many communities vulnerable to ongoing transmission.\n\nThe current trajectory shows about 6.8 new cases per day between mid-April and late May, which, if it continues, would project an additional 1,300 to 1,500 cases by year-end, bringing the likely tot

In [74]:
from extract_forecast import *

def extract_only_forecast(id):
    """Parse the stored forecast text of one question into a prediction.

    The question object is looked up in ``id_to_question`` and its forecast
    text in ``forecasts`` (both notebook-level dicts keyed by question id).
    The extraction helper is chosen by the exact class of the question.

    Args:
        id: Question id present in both ``id_to_question`` and ``forecasts``.
            The name shadows the ``id`` builtin inside this function; it is
            kept so existing callers are unaffected.

    Returns:
        For a binary question, the extracted percentage divided by 100.0.
        For a multiple-choice question, the value returned by
        ``generate_multiple_choice_forecast``.
        For a numeric question, the value returned by
        ``extract_percentiles_from_response``.

    Raises:
        KeyError: if ``id`` is missing from ``id_to_question`` or ``forecasts``.
        TypeError: if the question is not one of the three handled classes.
    """
    # Imported here so the function does not rely on a later cell having
    # imported forecasting_tools before it is called.
    from forecasting_tools.data_models.questions import (
        BinaryQuestion,
        MultipleChoiceQuestion,
        NumericQuestion,
    )

    question = id_to_question[id]
    forecast = forecasts[id]
    # Exact-class dispatch, as before: subclasses of these types are
    # deliberately not matched.
    question_type = type(question)

    if question_type is BinaryQuestion:
        # The helper yields a percentage (0-100); convert to a probability.
        return extract_probability_from_response_as_percentage_not_decimal(forecast) / 100.0

    if question_type is MultipleChoiceQuestion:
        options = question.options
        option_probabilities = extract_option_probabilities_from_response(forecast, options)
        return generate_multiple_choice_forecast(options, option_probabilities)

    if question_type is NumericQuestion:
        return extract_percentiles_from_response(forecast)

    # Previously this path fell through to `return prediction` with the name
    # unbound, which surfaced as an opaque UnboundLocalError.
    raise TypeError(
        f"Unsupported question type for id {id}: {question_type.__name__}"
    )

In [75]:
# extract_only_forecast refers to forecasting_tools.data_models.questions.* at call time,
# so the package must be imported before the call below.
import forecasting_tools
# Parse the stored forecast of the question selected above (the global `id`).
extract_only_forecast(id)

extracting percentile 10 value 1300.0
extracting percentile 20 value 1500.0
extracting percentile 40 value 1700.0
extracting percentile 60 value 1950.0
extracting percentile 80 value 2400.0
extracting percentile 90 value 3000.0
extracting percentile 11 value 1320.0
extracting percentile 12 value 1340.0
extracting percentile 13 value 1360.0
extracting percentile 14 value 1380.0
extracting percentile 15 value 1400.0
extracting percentile 16 value 1420.0
extracting percentile 17 value 1440.0
extracting percentile 18 value 1460.0
extracting percentile 19 value 1480.0
extracting percentile 21 value 1510.0
extracting percentile 22 value 1520.0
extracting percentile 23 value 1530.0
extracting percentile 24 value 1540.0
extracting percentile 25 value 1550.0
extracting percentile 26 value 1560.0
extracting percentile 27 value 1570.0
extracting percentile 28 value 1580.0
extracting percentile 29 value 1590.0
extracting percentile 30 value 1600.0
extracting percentile 31 value 1610.0
extracting p

{10: 1300.0,
 20: 1500.0,
 40: 1700.0,
 60: 1950.0,
 80: 2400.0,
 90: 3000.0,
 11: 1320.0,
 12: 1340.0,
 13: 1360.0,
 14: 1380.0,
 15: 1400.0,
 16: 1420.0,
 17: 1440.0,
 18: 1460.0,
 19: 1480.0,
 21: 1510.0,
 22: 1520.0,
 23: 1530.0,
 24: 1540.0,
 25: 1550.0,
 26: 1560.0,
 27: 1570.0,
 28: 1580.0,
 29: 1590.0,
 30: 1600.0,
 31: 1610.0,
 32: 1620.0,
 33: 1630.0,
 34: 1640.0,
 35: 1650.0,
 36: 1660.0,
 37: 1670.0,
 38: 1680.0,
 39: 1690.0,
 41: 1712.5,
 42: 1725.0,
 43: 1737.5,
 44: 1750.0,
 45: 1762.5,
 46: 1775.0,
 47: 1787.5,
 48: 1800.0,
 49: 1812.5,
 50: 1825.0,
 51: 1837.5,
 52: 1850.0,
 53: 1862.5,
 54: 1875.0,
 55: 1887.5,
 56: 1900.0,
 57: 1912.5,
 58: 1925.0,
 59: 1937.5,
 61: 1967.5,
 62: 1985.0,
 63: 2002.5,
 64: 2020.0,
 65: 2037.5,
 66: 2055.0,
 67: 2075.0,
 68: 2100.0,
 69: 2125.0,
 70: 2150.0,
 71: 2175.0,
 72: 2200.0,
 73: 2225.0,
 74: 2250.0,
 75: 2275.0,
 76: 2300.0,
 77: 2325.0,
 78: 2350.0,
 79: 2375.0,
 81: 2460.0,
 82: 2520.0,
 83: 2580.0,
 84: 2640.0,
 85: 2700.0,

In [None]:
# Print every id that has both a stored and a community forecast, followed by its parsed prediction.
# NOTE: the loop variable rebinds the notebook-level `id`; after the loop it holds the last element of `done`.
for id in done:
    print(id)
    print(extract_only_forecast(id))

In [None]:
# Parsed prediction for every question that has both a stored forecast and a loaded question.
# extract_only_forecast takes only the question id (it reads the text from `forecasts` itself),
# and ids outside `done` are not in id_to_question, so iterate over `done` rather than all of `forecasts`.
predictions = {question_id: extract_only_forecast(question_id) for question_id in done}

## Forecast missing questions

In [82]:
# Ids with a community forecast but no stored forecast file yet, in ascending order.
missing = sorted(set(community_ids) - set(forecast_ids))

In [83]:
# How many questions still need a forecast.
len(missing)

769

In [84]:
# Smallest id that still needs a forecast (`missing` is sorted ascending).
missing[0]

604

In [81]:
from forecast import forecast

In [89]:
# Narrow the sample to the first missing id. This overwrites the configuration set near the top of the notebook.
# NOTE(review): the run below reports "Got 2 questions" for this range, so the upper
# bound looks inclusive -- confirm against gather_questions/forecast.
num_questions = (missing[0], missing[0]+1)
perennial = False
live = False

In [90]:
# Run the forecaster on the range configured in the previous cell.
# `live` is passed from the configuration variable rather than a hard-coded False,
# so changing the config cell actually takes effect here.
results = forecast(num_questions=num_questions, perennial=perennial, live=live)

Got 2 questions
Loaded existing index from forecast_index.faiss
Index contains 4656 vectors at initialization
=== Starting Forecast ===
###################################################################
Combined forecast for 605


100%|█████████████████████████████████████████████| 5/5 [01:21<00:00, 16.39s/it]

extracting percentile 10 value 2.1
extracting percentile 20 value 2.4
extracting percentile 40 value 2.7
extracting percentile 60 value 2.9
extracting percentile 80 value 3.3
extracting percentile 90 value 3.6
extracting percentile 10 value 2.1
extracting percentile 20 value 2.3
extracting percentile 40 value 2.6
extracting percentile 60 value 2.9
extracting percentile 80 value 3.3
extracting percentile 90 value 3.6
extracting percentile 10 value 1.9
extracting percentile 20 value 2.2
extracting percentile 40 value 2.6
extracting percentile 60 value 2.9
extracting percentile 80 value 3.3
extracting percentile 90 value 3.6
extracting percentile 10 value 2.0
extracting percentile 20 value 2.3
extracting percentile 40 value 2.6
extracting percentile 60 value 2.9
extracting percentile 80 value 3.3
extracting percentile 90 value 3.6





KeyError: 10