In [159]:
import numpy as np
import pandas as pd

import pyarrow
import joblib

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Seed NumPy's legacy global RNG so the random draws made later in the
# notebook (np.random.normal) are repeatable on a fresh Run All.
np.random.seed(seed=0)

## Load dataset. 

Medical Expenditure Panel Survey (MEPS) dataset, pre-processed with `AIF360`'s `MEPS21` preprocessing.
Prediction task: healthcare utilization, quantified by the number of visits to a healthcare provider.

- label: `UTILIZATION`, where 1: ≥10 visits, 0: <10 visits
- protected attribute: `RACE`, where 1: `White`, 0: `Non-White`

In [119]:
# Read the pre-processed MEPS21 table from disk.
all_data = pd.read_csv('data_raw/meps21_preprocessed.csv')

# Target is the UTILIZATION column; every other column is a feature.
# drop() already returns a new frame, so all_data itself is left untouched.
y = all_data['UTILIZATION']
X = all_data.drop(columns='UTILIZATION')

In [124]:
# test_size=0.9 keeps only 10% of the rows for training; the large held-out
# part is what later gets sliced into batches. Fixed random_state makes the
# shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.9,
    random_state=0,
)

In [125]:
# Build RACE-free views of both splits. drop() returns a new frame, so
# X_train / X_test keep the protected attribute for later use.
X_train_nosens, X_test_nosens = (
    split.drop(columns='RACE') for split in (X_train, X_test)
)

## Train and save a classifier. 

Note that the training is done on a version of the dataset *without* `RACE` as an attribute.

In [160]:
# Fit a default logistic regression on the RACE-free training features.
# With default settings the solver stops at its iteration limit on this data
# (see the ConvergenceWarning in the cell output).
clf = (
    LogisticRegression(random_state=0)
    .fit(X=X_train_nosens, y=y_train)
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [129]:
# Mean accuracy of the fitted classifier on the training split (RACE excluded).
clf.score(X=X_train_nosens, y=y_train)

0.8730057434588385

In [132]:
# Mean accuracy of the fitted classifier on the held-out split (RACE excluded).
clf.score(X=X_test_nosens, y=y_test)

0.8525659200453644

In [67]:
# Persist the fitted classifier; the returned list of written filenames is
# shown as the cell output.
joblib.dump(value=clf, filename='skl_lr.joblib')

['skl_lr.joblib']

In [147]:
# Reload the classifier from disk so the inferences come from the saved artifact.
# NOTE: joblib.load unpickles the file -- only load artifacts you produced or trust.
pretrained = joblib.load('skl_lr.joblib')

# Score the RACE-free training frame built right after the split instead of
# re-deriving it from X_train: X_train's column labels are rewritten in place
# in the "reference data" cell, and feeding relabelled columns to the model
# triggers scikit-learn's feature-name mismatch check when cells are run in
# that order. X_train_nosens keeps the names the model was fitted with.
# Result: one row per training sample, columns = P(class 0), P(class 1).
train_infs = pretrained.predict_proba(X_train_nosens)

## Save "reference" data.

In [133]:
# Rewrite the column labels of X_train in place: remove every '=' and turn
# every '-' into '0'.
# The patterns are passed as literals with regex=False so the result does not
# depend on the pandas version: str.replace defaulted to regex=True before
# pandas 2.0 and to regex=False from 2.0 on, where the old '[=]' / '[-]'
# patterns would be searched for literally and match nothing.
X_train.columns = (
    X_train.columns
    .str.replace('=', '', regex=False)
    .str.replace('-', '0', regex=False)
)

In [151]:
# predict_proba returns a plain array in the row order of X_train, but X_train
# still carries the shuffled original row labels from train_test_split.
# Give the probability frame that same index; with a default RangeIndex the
# index-on-index (inner) merge below would silently drop most rows and pair
# the remaining ones with the wrong predictions.
train_infs_df = pd.DataFrame(train_infs, columns=['p_0', 'p_1'], index=X_train.index)
X_full = X_train.merge(train_infs_df, left_index=True, right_index=True)

# Every training row must survive the merge with exactly one prediction.
assert len(X_full) == len(X_train), "row count changed while attaching predictions"

In [154]:
# Append the ground-truth label as column 'gt'; pandas aligns y_train to
# X_full by row index.
X_full = X_full.assign(gt=y_train)

In [155]:
# Write the reference set (features, predicted probabilities, ground truth)
# to parquet; the row index is not stored.
X_full.to_parquet(
    'fulldata_train.parquet',
    index=False,
)

## Generate and save 8 batches of shifted data.

In [112]:
# one version of shift: 8 batches, only x-drift but 2 attributes are changing
#
# For each batch: take the next slice of the held-out data, add Gaussian noise
# with a batch-specific mean to PCS42 and MCS42, score the shifted rows with
# the saved classifier, and write features + ground truth + probabilities to
# x_shift/fulldata_<i>.parquet (the x_shift/ directory must already exist).
#
# NOTE(review): fulldata_train.parquet is written after '=' / '-' are rewritten
# in X_train's column names, while these batches keep X_test's raw names --
# confirm that downstream consumers expect this.
# NOTE: joblib.load unpickles the file -- only load artifacts you produced or trust.
pretrained = joblib.load('skl_lr.joblib')

n_batches = 8
batch_size = 1763  # rows per batch; batches are consecutive slices from the top of X_test

pcsmus = np.arange(n_batches)  # can do 8 separate batches here
mcsmus = [0.5, 1.25, 2, 1, 0, -1, -2, -2]
for i in range(n_batches):
    # Positional window of this batch inside the held-out split.
    rows = slice(i * batch_size, (i + 1) * batch_size)

    shifted_X = X_test.iloc[rows].copy()
    shape = len(shifted_X)
    # Draw order (PCS42 first, then MCS42) is kept so seeded runs are repeatable.
    shifted_X['PCS42'] = shifted_X['PCS42'] + np.random.normal(pcsmus[i], 1, shape)
    shifted_X['MCS42'] = shifted_X['MCS42'] + np.random.normal(mcsmus[i], 1, shape)

    # The classifier was fitted without RACE, so drop it for inference only;
    # the saved batch keeps RACE, matching the reference data.
    infs = pretrained.predict_proba(shifted_X.drop(columns=['RACE']))
    # Re-use the batch's row labels: X_test carries shuffled original labels,
    # and a default RangeIndex would make the index merge below drop and
    # mis-pair rows.
    infs_df = pd.DataFrame(infs, columns=['p_0', 'p_1'], index=shifted_X.index)

    shifted_X['gt'] = y_test.iloc[rows]

    shifted_X = shifted_X.merge(infs_df, left_index=True, right_index=True)
    assert len(shifted_X) == shape, "row count changed while attaching predictions"

    shifted_X.to_parquet("x_shift/fulldata_" + str(i) + ".parquet", index=False)

In [161]:
# List the output directory to check which batch files were written.
%ls x_shift

[0m[01;32mfulldata_0.parquet[0m*  [01;32mfulldata_3.parquet[0m*  [01;32mfulldata_6.parquet[0m*
[01;32mfulldata_1.parquet[0m*  [01;32mfulldata_4.parquet[0m*  [01;32mfulldata_7.parquet[0m*
[01;32mfulldata_2.parquet[0m*  [01;32mfulldata_5.parquet[0m*
