# Community alignment of training questions

In [1]:
%matplotlib inline

In [2]:
import pandas as pd
# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
from ollama_models import ollama_models
# Collection of available models; the next cell iterates it to pick the one to evaluate.
models = ollama_models()

In [3]:
dfs = {}
# Evaluate the first model reported by ollama_models(). Later cells read the
# notebook-level name `model`, so bind it explicitly and fail loudly when
# there is nothing to pick instead of leaving `model` undefined (or stale
# from an earlier run of the kernel).
model = next(iter(models), None)
if model is None:
    raise RuntimeError("ollama_models() returned no models; nothing to evaluate")

In [4]:
from tqdm import tqdm
# Register tqdm's pandas integration so DataFrames gain `progress_apply`,
# which the forecast-loading step of this notebook calls.
tqdm.pandas()

def predict(dfn, question, iterations = 3):
    """Return the stored forecast text for ``question`` from directory ``dfn``.

    Despite the name, no model is invoked here: the text is read from
    ``{dfn}/{question.id_of_question}.md``.

    Args:
        dfn: Directory containing one ``<id>.md`` forecast file per question.
        question: Object exposing ``id_of_question`` (e.g. a DataFrame row).
        iterations: Not used by this function; kept so existing callers that
            pass it continue to work.

    Returns:
        The full contents of the forecast file as a string.

    Raises:
        FileNotFoundError: If no forecast file exists for the question.
    """
    ffn = f'{dfn}/{question.id_of_question}.md'
    # Pin the encoding so the result does not depend on the platform locale.
    with open(ffn, 'r', encoding='utf-8') as f:
        return f.read()

def load_research(row):
    """Return the cached research notes for one question.

    Args:
        row: Mapping-like object with an ``'id'`` key (e.g. a DataFrame row).

    Returns:
        The contents of ``research/<id>.md``, resolved against the current
        working directory, as a string.

    Raises:
        FileNotFoundError: If no research file exists for that id.
    """
    # Pin the encoding so the result does not depend on the platform locale.
    with open(f"research/{row['id']}.md", 'r', encoding='utf-8') as f:
        return f.read()

def pull_asknews(row):
    """Return the cached AskNews text for one question.

    Args:
        row: Object exposing an ``id`` attribute (e.g. a DataFrame row).

    Returns:
        The contents of ``asknews/<id>.md``, resolved against the current
        working directory, as a string.

    Raises:
        FileNotFoundError: If no AskNews file exists for that id.
    """
    fn = f'asknews/{row.id}.md'
    # Pin the encoding so the result does not depend on the platform locale.
    with open(fn, 'r', encoding='utf-8') as f:
        return f.read()
        

In [5]:
# Show which model the selection loop left bound to `model`.
model

'cogito:latest'

In [6]:
from load_forecasted_open_questions import load_forecasted_open_questions

# Passed straight to the loader; presumably a (start, stop) window over the
# question list — confirm against load_forecasted_open_questions.
num_questions = (0, 100000)
questions = load_forecasted_open_questions(num_questions, model)

# Index the loaded questions by their API id for direct lookup.
id_to_question = dict((q.api_json['id'], q) for q in questions)

In [7]:
from community_forecast import community_forecast

# Community forecast for every loaded question, keyed by the question's API id.
id_to_forecast = dict(
    (q.api_json['id'], community_forecast(q)) for q in questions
)

# Directory that holds this model's saved forecast files.
pdir = f'forecast_{model}'

In [8]:
# Print each question's post id followed by its community forecast so the
# two can be checked side by side. The forecast is recomputed here rather
# than read back from id_to_forecast (which is keyed by api_json['id'], not
# by id_of_post).
# NOTE: `question` stays bound to the last question after the loop; the
# inspection cells further down rely on that leftover binding.
for question in questions:
    print(question.id_of_post)
    print(community_forecast(question) )

6614
{10: 115955.2089276313, 20: 141657.88880243435, 40: 187180.36846477943, 60: 262472.4606564991, 80: 430667.04775099986, 90: 664348.730950581}
1482
{10: 44.93519802287401, 20: 54.793577987651716, 40: 65.03480357731105, 60: 71.73592743048809, 80: 78.57050911889024, 90: 84.26566387421109}
5531
{10: 8.715861509994589, 20: 8.863004028870678, 40: 9.046492147224772, 60: 9.233005468887509, 80: 9.768171111310986, 90: 11.775300261474541}
6633
{10: 10.445029923122354, 20: 114.33706147626953, 40: 7081.371471749762, 60: 34833926.83852406, 80: 70217867797927.234}
1454
{10: 6.452161830468124, 20: 7.610512904872372, 40: 9.498454793676672, 60: 11.482962740722408, 80: 14.202002515912902, 90: 16.563886871189343}
3054
{10: 25.253290384546386, 20: 28.209770377289352, 40: 30.243969062366574, 60: 31.87702287744829, 80: 33.9354253908392, 90: 35.92523567194527}
7811
{10: 1.17187223791934, 20: 1.6421397204907926, 40: 4.0259153117901185, 60: 22.021004372988717, 80: 581.1532030145431, 90: 3627.8510176643263}


In [None]:
import load_secrets
# Project helper; presumably makes API credentials available to later
# calls — confirm in load_secrets.py.
load_secrets.load_secrets()

In [None]:
# Inspect the question left bound by the printing loop (the last one iterated).
question

In [None]:
# List the top-level keys of this question's api_json mapping.
question.api_json.keys()

In [None]:
from community_forecast_numeric import community_forecast_numeric

In [None]:
# Run the numeric helper on the same question to see what it returns.
community_forecast_numeric(question)

In [None]:
import numpy as np

In [None]:
    """Get denormalized community forecast from a NumericQuestion object"""
    # Aggregations sit under the nested "question" key of api_json rather than
    # at its top level; read the latest recency-weighted aggregate's values.
    forecast_values = question.api_json["question"]["aggregations"]["recency_weighted"]["latest"]["forecast_values"]
    # Bare name so the cell displays the raw values for inspection.
    forecast_values

In [None]:
    # Scaling bounds are likewise nested under "question"; the denormalisation
    # step uses them to map forecast values onto the question's range.
    range_min = question.api_json["question"]["scaling"]["range_min"]  # e.g. 0.0 (sample value; differs per question)
    range_max = question.api_json["question"]["scaling"]["range_max"]  # e.g. 8e12 (sample value; differs per question)
    

In [None]:
# Display both bounds together as a tuple.
range_min, range_max

In [None]:
    # Map every forecast value onto the question's scale in one vectorised
    # step: value -> range_min + value * (range_max - range_min).
    denorm = range_min + np.asarray(forecast_values) * (range_max - range_min)

In [None]:
%matplotlib inline
# Use pyplot (the supported interface, and the one the final cell imports)
# instead of the legacy pylab namespace.
import matplotlib.pyplot as plt
# x positions 0, 0.5, ..., 100 (201 points) against the denormalised values;
# the trailing semicolon hides the list-of-Line2D repr.
plt.plot(np.arange(0,100.5, 0.5), denorm);

In [None]:
    # Percentiles to report from the denormalised values.
    pctiles = [10,20,40,60,80,90]
    # Positions into `denorm`, which the plotting cell pairs with x values in
    # steps of 0.5.
    # NOTE(review): on that grid percentile p sits at index 2*p, so 2*p-1 reads
    # the point at p-0.5 — confirm whether the offset is intentional.
    idx = [2*x-1 for x in pctiles]
    sampled = denorm[idx]
    # `return` is only legal inside a function and makes the cell fail to
    # compile; end on the expression so the mapping is displayed instead.
    dict(zip(pctiles, sampled))

In [None]:
import glob
import os

# Each saved forecast for this model is stored as <pdir>/<question id>.md.
fns = glob.glob(f'{pdir}/*.md')


def _question_id_from_path(path):
    """Return the integer id encoded in a forecast file name such as ``123.md``."""
    # Work on the base name so the result does not depend on how many
    # directory components (or which separator) precede the file name.
    return int(os.path.basename(path).split('.')[0])


def _read_forecast(path):
    """Return the text of one forecast file, closing the handle afterwards."""
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()


ids = [_question_id_from_path(path) for path in fns]
# Read each file through the path glob actually returned, rather than
# rebuilding a name from the parsed id.
forecasts = {_question_id_from_path(path): _read_forecast(path) for path in fns}
community_ids = list(id_to_forecast.keys())
forecast_ids = list(forecasts.keys())
# Questions that have both a saved model forecast and a community-forecast entry.
done = list(set(forecast_ids).intersection(community_ids))

from extract_only_forecast import extract_only_forecast

# One extract_only_forecast() result per finished question, keyed by question
# id. The comprehension variable is deliberately not called `id`, so the
# builtin of that name is not rebound at notebook scope.
predictions = {
    qid: extract_only_forecast(id_to_question, forecasts, qid) for qid in done
}
# The question objects for the same ids, in the same order as `done`.
q_done = [id_to_question[qid] for qid in done]

import pandas as pd
from flatten_dict import flatten_dict

# One flat record per finished question; nested api_json keys are joined with "_".
qflat = [flatten_dict(finished.api_json, sep='_') for finished in q_done]
df = pd.DataFrame(qflat)

# Attach the community forecast, looked up by question id.
df['crowd'] = df.apply(lambda record: id_to_forecast[record.id], axis=1)
# Store the options as their repr() text.
df['question_options'] = df['question_options'].apply(repr)

# Keep only the columns used from here on, in this order.
df = df[[
    # identity and timing
    'id', 'open_time', 'scheduled_resolve_time', 'title',
    # question text
    'question_description', 'question_resolution_criteria', 'question_fine_print',
    # question shape
    'question_type', 'question_options', 'question_group_variable',
    'question_question_weight', 'question_unit',
    # numeric bounds and scaling
    'question_open_upper_bound', 'question_open_lower_bound',
    'question_scaling_range_max', 'question_scaling_range_min',
    'question_scaling_zero_point',
    # community forecast added above
    'crowd',
]]
from datetime import datetime

# Directory with this model's saved forecasts; predict() reads from it.
dfn = f'forecast_{model}'

# Add the run date plus the two cached text sources. assign() evaluates the
# keyword arguments in order, so each loader sees the columns added before it.
df = df.assign(
    today=datetime.now().strftime("%Y-%m-%d"),
    asknews=lambda frame: frame.apply(pull_asknews, axis=1),
    research=lambda frame: frame.apply(load_research, axis=1),
)

from RAGForecaster import RAGForecaster
# Project RAG helper, constructed with its defaults.
rag = RAGForecaster()

from EnhancedResearchPro import EnhancedResearchPro
# The research bot is built around the RAG object created above.
research_bot = EnhancedResearchPro(rag)
# Mirror `id` as `id_of_question`: predict() reads that attribute from each
# row (presumably process_dataframe expects it as well — confirm).
df['id_of_question'] = df['id']
# Run the research pass over every row. Presumably this is what fills
# research_bot.retrieval_cache, which is read just below — confirm in
# EnhancedResearchPro.
research_bot.process_dataframe(df, use_cutoff=False)
# Back-reference so the RAG object can reach the research bot.
rag.research_bot = research_bot
# Per row: collect the 'raw_text' of every cached retrieval stored under the
# row's title. Cache values are iterated as (metadata, other) pairs; titles
# with no cache entry yield an empty list.
df['learning'] = df.apply(
    lambda row: [
        m['raw_text'] 
        for m, _ in research_bot.retrieval_cache.get(row['title'], []) 
        if 'raw_text' in m  # Safety check for legacy entries
    ], 
    axis=1
)

from prompt_question import prompt_question
from extract_forecast import extract_forecast
from error import error

# Build the prompt text for every question.
df['prompt'] = df.apply(prompt_question, axis=1)

# Attach the stored forecast text for each row, with a tqdm progress bar.
df['forecast'] = df.progress_apply(lambda row: predict(dfn, row), axis=1)

# Result of extract_forecast() for each row.
df['prediction'] = df.apply(extract_forecast, axis=1)

# Keep only the rows that have a community forecast to compare against.
df = df[df['crowd'].apply(lambda crowd: crowd is not None)].copy()

# Per-row result of error(), then tag every row with the evaluated model.
df['error'] = df.apply(error, axis=1)
df['model'] = model

import matplotlib.pyplot as plt

# Distribution of the per-question error values computed above.
plt.hist(df.error.values, bins=20)
plt.xlabel('error')
plt.ylabel('number of questions');

# Persist the scored frame; the file name carries the model name.
fn = f'community_{model}.json'
df.to_json(fn, indent=4)
print('saved', fn)

# `return` is only legal inside a function and stops the whole cell from
# compiling; end on an expression so the result is displayed instead.
df.head()