# Estimate baseline, naive, and PPI-corrected inference parameters for each model (classic, BERT, and GPT zero-shot)

In [1]:
import pandas as pd
import numpy as np
import os
import sys
from sklearn.model_selection import train_test_split

# load custom local packages
sys.path.append("C:\\Users\\Adam\\Desktop\\code_projects\\GitHub\\va_nlp\\utils")

from dataset_utils import dataframe_decorator
from statistics_utils import *
from ppi_plusplus_multi import *

## load data

In [2]:
# Combined results table; its 'site' column is read later to enumerate sites.
df = pd.read_csv('../src/gpt_nlp/results_df.csv')

# Per-model prediction files for the Mexico site, read in a fixed order and
# unpacked into one dataframe per model.
mexico_knn, mexico_svm, mexico_nb, mexico_bert, mexico_gpt4zs = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/results/mexico_KNN.csv',
        '../data/results/mexico_SVM.csv',
        '../data/results/mexico_NB.csv',
        '../data/results/mexico_bert.csv',
        '../data/results/mexico_gpt4_zs.csv',
    )
)

# Break into 80/20 split for Naive and PPI estimation

In [3]:
def data_split(input_df, test_size=0.2):
    '''
    Split a predictions dataframe into labeled and unlabeled arrays.

    Rows whose 'Y_hat' equals 'unclassified' are dropped first. The remaining
    rows are split with train_test_split (random_state=42, so the split is
    reproducible); the train portion is returned as the labeled set and the
    test portion as the unlabeled set.

    Args:
        input_df (DataFrame): Must contain the columns 'Y', 'X' and 'Y_hat'.
        test_size: Forwarded to train_test_split; with the default 0.2 the
            unlabeled set receives 20% of the rows.

    Returns:
        X (ndarray): Covariates corresponding to the gold-standard labels.
        Y (ndarray): Gold-standard labels.
        Yhat (ndarray): Predictions corresponding to the gold-standard labels.
        X_unlabeled (ndarray): Covariates corresponding to the unlabeled data.
        Yhat_unlabeled (ndarray): Predictions corresponding to the unlabeled data.

    NOTE(review): comments elsewhere in this notebook describe the labeled
    set as the 20% portion and the unlabeled set as the 80% portion, which is
    the opposite of what this split returns -- confirm which is intended.
    '''
    # subset to drop unclassified
    subset = input_df[input_df['Y_hat'] != 'unclassified']

    # Honor the caller's test_size instead of a hard-coded 0.2.
    train_df, test_df = train_test_split(
        subset, test_size=test_size, random_state=42
    )

    X = train_df['X'].values
    Y = train_df['Y'].values
    # Predictions for the labeled rows must come from the same rows as X and Y;
    # taking them from test_df duplicated Yhat_unlabeled and misaligned lengths.
    Yhat = train_df['Y_hat'].values
    X_unlabeled = test_df['X'].values
    Yhat_unlabeled = test_df['Y_hat'].values

    return X, Y, Yhat, X_unlabeled, Yhat_unlabeled

# Site: geographic regions (mexico, ap, up, dar, bohol, pemba)
# Model: ai prediction model (classic, BERT, GPT4)
# Inference: type of inference (Baseline, Naive, PPI++)

# Loop through all permutations, compute PE and CI, save results 

In [23]:
# Build one result row per (site, model, inference) combination and save the
# table to CSV. All point estimates and bounds below are still placeholder
# constants; the actual estimation calls have not been wired in yet.
sites = df['site'].unique()
models = ['KNN', 'SVM', 'NB', 'bert', 'gpt4_zs']
inferences = ['baseline', 'naive', 'ppi_plus_plus']
results_list = []
column_names = [
    'site', 'model', 'inference',
    'baseline_pe', 'baseline_lb', 'baseline_ub',
    'naive_pe', 'naive_lb', 'naive_ub',
    'ppi_plus_plus_pe', 'ppi_plus_plus_lb', 'ppi_plus_plus_ub'
]

np.random.seed(42)
for site in sites:
    for model in models:
        # The input file and its split depend only on (site, model), so do
        # this work once per pair instead of once per inference type.
        load_df = pd.read_csv(f'../data/results/{site}_{model}.csv')

        # split data into labeled and unlabeled for naive and ppi++ inference
        X_lab, Y_lab, Yhat_lab, X_unlab, Yhat_unlab = data_split(load_df)

        # baseline predictions: Y_lab ~ X_lab (placeholder values)
        baseline_pe = 1
        baseline_lb = 2
        baseline_ub = 3

        # naive predictions: 20% Y_lab,X_lab and 80% Y_unlab,X_unlab (placeholder values)
        naive_pe = 4
        naive_lb = 5
        naive_ub = 6

        # ppi++ predictions: 20% Y_lab,X_lab and 80% Y_unlab,X_unlab (placeholder values)
        ppi_plus_plus_pe = 7
        ppi_plus_plus_lb = 8
        ppi_plus_plus_ub = 9

        for inference in inferences:
            # Every row carries all nine statistics; 'inference' only tags the row.
            results_list.append([
                site, model, inference,
                baseline_pe, baseline_lb, baseline_ub,
                naive_pe, naive_lb, naive_ub,
                ppi_plus_plus_pe, ppi_plus_plus_lb, ppi_plus_plus_ub
            ])

# Create DataFrame
results_df = pd.DataFrame(results_list, columns=column_names)

# write to results folder
results_df.to_csv('../data/results/estimation_results.csv', index=False)

In [24]:
# Display the assembled results table as this cell's output.
results_df

Unnamed: 0,site,model,inference,baseline_pe,baseline_lb,baseline_ub,naive_pe,naive_lb,naive_ub,ppi_plus_plus_pe,ppi_plus_plus_lb,ppi_plus_plus_ub
0,mexico,KNN,baseline,1,2,3,4,5,6,7,8,9
1,mexico,KNN,naive,1,2,3,4,5,6,7,8,9
2,mexico,KNN,ppi_plus_plus,1,2,3,4,5,6,7,8,9
3,mexico,SVM,baseline,1,2,3,4,5,6,7,8,9
4,mexico,SVM,naive,1,2,3,4,5,6,7,8,9
...,...,...,...,...,...,...,...,...,...,...,...,...
85,pemba,bert,naive,1,2,3,4,5,6,7,8,9
86,pemba,bert,ppi_plus_plus,1,2,3,4,5,6,7,8,9
87,pemba,gpt4_zs,baseline,1,2,3,4,5,6,7,8,9
88,pemba,gpt4_zs,naive,1,2,3,4,5,6,7,8,9


In [25]:
# Pick one site/model pair and show the point estimate (PE), lower bound (LB)
# and upper bound (UB) columns belonging to the chosen inference type.
site = 'mexico'
model = 'KNN'
inference = 'baseline'

# Identifier columns followed by the three statistics for `inference`.
shown_columns = [
    'site',
    'model',
    'inference',
    f'{inference}_pe',
    f'{inference}_lb',
    f'{inference}_ub',
]
# Rows matching both the site and the model (all inference tags are kept).
row_mask = (results_df['site'] == site) & (results_df['model'] == model)

results_df[row_mask][shown_columns]

Unnamed: 0,site,model,inference,baseline_pe,baseline_lb,baseline_ub
0,mexico,KNN,baseline,1,2,3
1,mexico,KNN,naive,1,2,3
2,mexico,KNN,ppi_plus_plus,1,2,3


# For each site/model permutation, plot the PE + CI for the three estimation procedures for each model

In [None]:
# Classical point estimate: lhat is fixed at 0.
# NOTE(review): the module is only star-imported above, so the
# `ppi_plusplus_multi.` qualifier may not resolve -- confirm before running.
ppi_plusplus_multi.ppi_multi_class_pointestimate(
    X=X_lab, Y=Y_lab, Yhat=Yhat_lab,
    X_unlabeled=X_unlab, Yhat_unlabeled=Yhat_unlab,
    lhat=0, coord=None, optimizer_options=None,
)

In [None]:
# lhat = 1 for Classical PPI Point Estimate
# The arrays are passed by keyword under the names returned by data_split in
# the results loop (X_lab, Y_lab, ...); the bare names X, Y, Yhat, X_unlabeled
# and Yhat_unlabeled are never bound at notebook scope.
# NOTE(review): the module is only star-imported above, so the
# `ppi_plusplus_multi.` qualifier may not resolve -- confirm before running.
ppi_plusplus_multi.ppi_multi_class_pointestimate(
    X = X_lab,
    Y = Y_lab,
    Yhat = Yhat_lab,
    X_unlabeled = X_unlab,
    Yhat_unlabeled = Yhat_unlab,
    lhat = 1,
    coord = None,
    optimizer_options=None
)

In [None]:
# lhat = None for PPI++ Point Estimate
# Passing lhat = 0 here would simply repeat the classical estimate; None is
# what this cell's heading calls for (presumably the function then tunes lhat
# from the data, as in ppi_py -- confirm against ppi_plusplus_multi).
# The arrays are passed by keyword under the names returned by data_split in
# the results loop; the bare names X, Y, ... are never bound at notebook scope.
# NOTE(review): the module is only star-imported above, so the
# `ppi_plusplus_multi.` qualifier may not resolve -- confirm before running.
ppi_plusplus_multi.ppi_multi_class_pointestimate(
    X = X_lab,
    Y = Y_lab,
    Yhat = Yhat_lab,
    X_unlabeled = X_unlab,
    Yhat_unlabeled = Yhat_unlab,
    lhat = None,
    coord = None,
    optimizer_options=None
)

In [None]:
# NOTE(review): `pizza_eater` is not defined anywhere in the visible notebook,
# so unless a star import provides it this cell raises NameError. It would also
# rebind `df`, which the results loop reads for its list of sites. Looks like
# leftover scratch code -- confirm and remove.
df = pizza_eater