# This is where I estimate baseline, naive, and ppi corrected inference parameters for each model classic, bert and gpt_zeroshot

In [29]:
import pandas as pd
import numpy as np
import os
import sys
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

# load custom local packages
sys.path.append("C:\\Users\\Adam\\Desktop\\code_projects\\GitHub\\va_nlp\\utils")

from dataset_utils import dataframe_decorator
from statistics_utils import *
from ppi_plusplus_multi import *

## load data

In [30]:
df = pd.read_csv('../src/gpt_nlp/results_df.csv')
mexico_knn = pd.read_csv('../data/results/mexico_KNN.csv')
mexico_svm = pd.read_csv('../data/results/mexico_SVM.csv')
mexico_nb = pd.read_csv('../data/results/mexico_NB.csv')
mexico_bert = pd.read_csv('../data/results/mexico_bert.csv')
mexico_gpt4zs = pd.read_csv('../data/results/mexico_gpt4_zs.csv')

# Break into 80/20 split for Naive and PPI estimation

In [31]:
def data_split(input_df, test_size=0.2):
    '''
    Takes input df which has three columns, [Y, X, Y_hat]
    Subsets df to exclude 'unclassified' from Y_hat 
    returns data split unlabeled/labeled with a default 80/20 train/test split:
        Y_sorted (ndarray): All gold-standard labels, sorted. 
        X_sorted (ndarray): All covariates corresponding to the gold-standard labels, sorted. 
        Y_lab (ndarray) : test_size number of gold standard labels.
        Yhat_lab (ndarray): test_size number of predictions corresponding to the gold-standard labels.
        X_lab (ndarray) : test_size number of covariates corresponding to the gold-standard labels.        
        Yhat_unlabeled (ndarray): (1-test_size) number of predictions corresponding to the gold-standard labels.
        X_unlabeled (ndarray): (1-test_size) number of covariates corresponding to the unlabeled data.
        
           
    '''
    
    # subset to drop unclassified
    subset = input_df[input_df['Y_hat']!='unclassified']

    # 80/20 split on entire dataframe into unabeled and labeled subsets
    unlab_df, lab_df = train_test_split(subset, test_size=0.2, random_state=42)
    
    # separate Y's and X's
    # full Y and X
    Y = input_df['Y'].to_numpy()
    X = input_df['X'].to_numpy()
    
    # labeled Y, X, Y_hat
    Y_lab = lab_df['Y'].to_numpy()
    X_lab = lab_df['X'].to_numpy()
    Yhat_lab = lab_df['Y_hat'].to_numpy()
    
    # unlabeled Y, X, Y_hat
    Y_unlab = unlab_df['Y'].to_numpy()
    X_unlab = unlab_df['X'].to_numpy()
    Yhat_unlab = unlab_df['Y_hat'].to_numpy()
    
    # combine 20/80 labeled unlabeled
    Y_combined = np.append(Y_lab, Y_unlab)
    X_combined = np.append(X_lab, X_unlab)
    
    # transpose X's
    X = X.reshape(-1,1)
    X_lab = X_lab.reshape(-1,1)
    X_unlab = X_unlab.reshape(-1,1)
    X_combined = X_combined.reshape(-1,1)
    
    # sort for MNLogit so that 0 is the left out reference category
    sort_idx = np.argsort(Y)
    Y_sorted = Y[sort_idx]
    X_sorted = X[sort_idx]
    Y_combined_sorted = Y_combined[sort_idx]
    X_combined_sorted = X_combined[sort_idx]
    
    return Y_sorted, X_sorted, Y_lab, Yhat_lab, X_lab, Yhat_unlab, X_unlab, Y_combined_sorted, X_combined_sorted

In [25]:
Y_sorted, X_sorted, Y_lab, Yhat_lab, X_lab, Yhat_unlab, X_unlab, Y_combined_sorted, X_combined_sorted = data_split(mexico_knn)

In [15]:
# Baseline Regression
mn_logit_baseline = sm.MNLogit(Y_sorted, X_sorted)
mn_logit_baseline_res = mn_logit_baseline.fit(method = "newton", full_output = True)
mn_logit_baseline_res.summary()

Optimization terminated successfully.
         Current function value: 0.961287
         Iterations 7


0,1,2,3
Dep. Variable:,y,No. Observations:,1306.0
Model:,MNLogit,Df Residuals:,1302.0
Method:,MLE,Df Model:,0.0
Date:,"Wed, 20 Mar 2024",Pseudo R-squ.:,0.05837
Time:,11:27:11,Log-Likelihood:,-1255.4
converged:,True,LL-Null:,-1333.3
Covariance Type:,nonrobust,LLR p-value:,

y=1,coef,std err,z,P>|z|,[0.025,0.975]
x1,0.0125,0.003,4.846,0.000,0.007,0.018
y=2,coef,std err,z,P>|z|,[0.025,0.975]
x1,0.0040,0.003,1.404,0.160,-0.002,0.010
y=3,coef,std err,z,P>|z|,[0.025,0.975]
x1,-0.0294,0.005,-5.709,0.000,-0.039,-0.019
y=4,coef,std err,z,P>|z|,[0.025,0.975]
x1,0.0429,0.002,19.309,0.000,0.039,0.047


In [17]:
# Naive Regression
mn_logit_baseline = sm.MNLogit(Y_combined_sorted, X_combined_sorted)
mn_logit_baseline_res = mn_logit_baseline.fit(method = "newton", full_output = True)
mn_logit_baseline_res.summary()

Optimization terminated successfully.
         Current function value: 0.961287
         Iterations 7


0,1,2,3
Dep. Variable:,y,No. Observations:,1306.0
Model:,MNLogit,Df Residuals:,1302.0
Method:,MLE,Df Model:,0.0
Date:,"Wed, 20 Mar 2024",Pseudo R-squ.:,0.05837
Time:,11:27:20,Log-Likelihood:,-1255.4
converged:,True,LL-Null:,-1333.3
Covariance Type:,nonrobust,LLR p-value:,

y=1,coef,std err,z,P>|z|,[0.025,0.975]
x1,0.0125,0.003,4.846,0.000,0.007,0.018
y=2,coef,std err,z,P>|z|,[0.025,0.975]
x1,0.0040,0.003,1.404,0.160,-0.002,0.010
y=3,coef,std err,z,P>|z|,[0.025,0.975]
x1,-0.0294,0.005,-5.709,0.000,-0.039,-0.019
y=4,coef,std err,z,P>|z|,[0.025,0.975]
x1,0.0429,0.002,19.309,0.000,0.039,0.047


In [27]:
theta_ppi_ci = ppi_multiclass_logistic_ci(
            X=X_sorted,
            Y=Y_sorted,
            Yhat=Yhat_lab,
            X_unlabeled=X_unlab,
            Yhat_unlabeled=Yhat_unlab,
            optimizer_options = {'disp': True, 'maxiter':1000},
        )

IndexError: index 262 is out of bounds for axis 0 with size 262

In [37]:
Y_sorted.shape

(1306,)

# Site: geographic regions (mexico, ap, up, dar, bohol, pemba)
# Model: ai prediction model (classic, BERT, GPT4)
# Inference: type of inference (Baseline, Naive, PPI++)

# Loop through all permutations, compute PE and CI, save results 

In [None]:
sites = df['site'].unique()
models = ['KNN', 'SVM', 'NB', 'bert', 'gpt4_zs']
inferences = ['baseline', 'naive', 'ppi_plus_plus']
results_list = []
column_names=  [
    'site', 'model', 'inference',
    'baseline_pe', 'baseline_lb', 'baseline_ub',
    'naive_pe', 'naive_lb', 'naive_ub',
    'ppi_plus_plus_pe', 'ppi_plus_plus_lb', 'ppi_plus_plus_ub'
]

np.random.seed(42)
for site in sites:
    for model in models:
        for inference in inferences:
            # read in dataframe for site and model
            load_df = pd.read_csv(f'../data/results/{site}_{model}.csv')
            
            # baseline predictions: Y_lab ~ X_lab
            baseline_pe = 1
            baseline_lb = 2
            baseline_ub = 3
            
            # split data 80/20 into labeled and unlabeled for naive and ppi++ inference
            X_lab, Y_lab, Yhat_lab, X_unlab, Yhat_unlab = data_split(load_df)
            
            # naive predictions: 20% Y_lab,X_lab and 80% Y_unlab,X_unlab
            naive_pe = 4
            naive_lb = 5
            naive_ub = 6
            
            # ppi++ predictions: 20% Y_lab,X_lab and 80% Y_unlab,X_unlab
            ppi_plus_plus_pe = 7
            ppi_plus_plus_lb = 8
            ppi_plus_plus_ub = 9
            
            result = [
                site, model, inference,
                baseline_pe, baseline_lb, baseline_ub,
                naive_pe, naive_lb, naive_ub,
                ppi_plus_plus_pe, ppi_plus_plus_lb, ppi_plus_plus_ub
            ]
            
            results_list.append(result)

# Create DataFrame
results_df = pd.DataFrame(results_list, columns=column_names)

# write to results folder
results_df.to_csv('../data/results/estimation_results.csv', index=False)

In [None]:
results_df

In [None]:
# select PE, LB and UB for given site and model and inference
site = 'mexico'
model = 'KNN'
inference = 'baseline'

results_df[(results_df['site']==site) & (results_df['model']==model)][['site',
                                                                       'model',
                                                                       'inference',
                                                                       f'{inference}_pe', 
                                                                       f'{inference}_lb', 
                                                                       f'{inference}_ub']]

# For each site/model permutation, plot the PE + CI for the three estimation procedures for each model

In [None]:
# lhat = 0 for Classical Point Estimate
ppi_plusplus_multi.ppi_multi_class_pointestimate(
    X = X_lab,
    Y = Y_lab,
    Yhat = Yhat_lab,
    X_unlabeled = X_unlab,
    Yhat_unlabeled = Yhat_unlab,
    lhat = 0,
    coord = None,
    optimizer_options=None
)

In [None]:
# lhat = 1 for Classical PPI Point Estimate
ppi_plusplus_multi.ppi_multi_class_pointestimate(
    X,
    Y,
    Yhat,
    X_unlabeled,
    Yhat_unlabeled,
    lhat = 1,
    coord = None,
    optimizer_options=None
)

In [None]:
# lhat = None for PPI++ Point Estimate
ppi_plusplus_multi.ppi_multi_class_pointestimate(
    X,
    Y,
    Yhat,
    X_unlabeled,
    Yhat_unlabeled,
    lhat = 0,
    coord = None,
    optimizer_options=None
)

In [None]:
df = pizza_eater