# This is where I estimate baseline, naive, and PPI-corrected inference parameters for each model: classic, BERT, and GPT zero-shot

In [98]:
import os
import sys

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# load custom local packages
# The module itself is imported (before the star imports) so that qualified
# calls such as ppi_plusplus_multi.ppi_multi_class_pointestimate(...) resolve.
import ppi_plusplus_multi
from dataset_utils import dataframe_decorator
from statistics_utils import *
from ppi_plusplus_multi import *

## load data

In [88]:
# Full results table; check_age_dist() reads its 'site' and 'age_yr' columns.
df = pd.read_csv('../src/gpt_nlp/results_df.csv')
# Per-model result files for the Mexico site. data_split() expects each of
# these to carry the three columns Y, X and Y_hat.
mexico_knn = pd.read_csv('../data/results/mexico_KNN.csv')
mexico_svm = pd.read_csv('../data/results/mexico_SVM.csv')
mexico_nb = pd.read_csv('../data/results/mexico_NB.csv')
mexico_bert = pd.read_csv('../data/results/mexico_bert.csv')
mexico_gpt4zs = pd.read_csv('../data/results/mexico_gpt4_zs.csv')

In [90]:
def check_age_dist(site):
    '''
    Print how often each age (in years) occurs for a single site.

    Filters the module-level df down to the rows whose 'site' column equals
    the given site string and prints the value_counts() of their 'age_yr'
    column. Nothing is returned; the counts are only printed.
    '''
    site_mask = df['site'] == site
    age_counts = df[site_mask]['age_yr'].value_counts()
    print(age_counts)

In [91]:
# Print the age_yr value counts for the rows of df whose site is 'mexico'.
check_age_dist('mexico')

age_yr
53    29
69    27
72    27
70    26
55    26
      ..
15     2
12     2
94     1
93     1
14     1
Name: count, Length: 87, dtype: int64


In [92]:
# Value counts of column X in the GPT-4 zero-shot results, for comparison with
# the age_yr counts printed by check_age_dist('mexico').
# NOTE(review): the matching counts suggest X is age in years — confirm.
mexico_gpt4zs['X'].value_counts()

X
53    29
69    27
72    27
70    26
55    26
      ..
15     2
12     2
94     1
93     1
14     1
Name: count, Length: 87, dtype: int64

# Break into 80/20 split for Naive and PPI estimation

In [173]:
def data_split(input_df, test_size=0.2, random_state=42):
    '''
    Split model results into labeled and unlabeled arrays for PPI estimation.

    Rows whose Y_hat equals 'unclassified' are dropped first. The remaining
    rows are shuffled and split with sklearn's train_test_split: the train
    part is treated as the labeled set and the held-out test part as the
    unlabeled set. The shapes of the filtered, labeled and unlabeled frames
    are printed as a sanity check.

    Args:
        input_df (DataFrame): Must contain the columns 'Y', 'X' and 'Y_hat'.
        test_size (float or int): Forwarded to train_test_split; the share
            (or number) of rows placed in the unlabeled set. The default 0.2
            gives an 80/20 labeled/unlabeled split.
        random_state (int): Seed forwarded to train_test_split so the split
            is reproducible. Defaults to 42.

    Returns:
        tuple: (X, Y, Yhat, X_unlabeled, Yhat_unlabeled) where
            X (ndarray): Covariates corresponding to the gold-standard labels.
            Y (ndarray): Gold-standard labels.
            Yhat (ndarray): Predictions corresponding to the gold-standard labels.
            X_unlabeled (ndarray): Covariates corresponding to the unlabeled data.
            Yhat_unlabeled (ndarray): Predictions corresponding to the unlabeled data.
    '''

    # subset to drop unclassified
    subset = input_df[input_df['Y_hat'] != 'unclassified']
    print(subset.shape)

    # labeled/unlabeled split on the entire filtered dataframe; honour the
    # caller's test_size instead of a hard-coded 0.2
    train_df, test_df = train_test_split(
        subset, test_size=test_size, random_state=random_state
    )
    print(train_df.shape)
    print(test_df.shape)

    # Labeled set: X, Y and Yhat must all come from the SAME rows (train_df)
    # so that each prediction lines up with its gold-standard label.
    X = train_df['X'].values
    Y = train_df['Y'].values
    Yhat = train_df['Y_hat'].values

    # Unlabeled set: only covariates and predictions are used.
    X_unlabeled = test_df['X'].values
    Yhat_unlabeled = test_df['Y_hat'].values

    return X, Y, Yhat, X_unlabeled, Yhat_unlabeled

In [174]:
# Split the KNN results into labeled/unlabeled arrays; data_split prints the
# filtered, labeled and unlabeled frame shapes.
X_lab, Y_lab, Yhat_lab, X_unlab, Yhat_unlab = data_split(mexico_knn)

(1306, 3)
(1044, 3)
(262, 3)


In [176]:
# Split the BERT results. NOTE: this assigns to the same five names as the
# KNN split, so the KNN arrays are replaced.
X_lab, Y_lab, Yhat_lab, X_unlab, Yhat_unlab = data_split(mexico_bert)

(1306, 3)
(1044, 3)
(262, 3)


In [177]:
# Split the GPT-4 zero-shot results. NOTE: the same five names are reused, so
# from here on X_lab, Y_lab, Yhat_lab, X_unlab and Yhat_unlab hold this split.
X_lab, Y_lab, Yhat_lab, X_unlab, Yhat_unlab = data_split(mexico_gpt4zs)

(530, 3)
(424, 3)
(106, 3)


In [None]:
# lhat = 0 for Classical Point Estimate
# The *_lab / *_unlab arrays are whatever the most recent data_split(...) call
# returned.
# NOTE(review): presumably lhat=0 gives the predictions no weight — confirm
# against ppi_plusplus_multi.
ppi_plusplus_multi.ppi_multi_class_pointestimate(
    X = X_lab,
    Y = Y_lab,
    Yhat = Yhat_lab,
    X_unlabeled = X_unlab,
    Yhat_unlabeled = Yhat_unlab,
    lhat = 0,
    coord = None,
    optimizer_options=None
)

In [None]:
# lhat = 1 for Classical PPI Point Estimate
# Pass the arrays produced by data_split(...) (X_lab, Y_lab, ...) by keyword;
# bare names X, Y, Yhat, ... are not defined at notebook scope.
ppi_plusplus_multi.ppi_multi_class_pointestimate(
    X=X_lab,
    Y=Y_lab,
    Yhat=Yhat_lab,
    X_unlabeled=X_unlab,
    Yhat_unlabeled=Yhat_unlab,
    lhat=1,
    coord=None,
    optimizer_options=None,
)

In [None]:
# lhat = None for PPI++ Point Estimate
# lhat must be None here: passing 0 would repeat the classical estimate
# instead of running PPI++.
# NOTE(review): presumably None makes the routine pick lhat from the data —
# confirm against ppi_plusplus_multi.
# Pass the arrays produced by data_split(...) (X_lab, Y_lab, ...) by keyword;
# bare names X, Y, Yhat, ... are not defined at notebook scope.
ppi_plusplus_multi.ppi_multi_class_pointestimate(
    X=X_lab,
    Y=Y_lab,
    Yhat=Yhat_lab,
    X_unlabeled=X_unlab,
    Yhat_unlabeled=Yhat_unlab,
    lhat=None,
    coord=None,
    optimizer_options=None,
)