#### Examples for analytical/tabular methods provided by dsldPy

The goal is for users to apply analytical/tabular methods with simple, intuitive interface. The following functions are included for python:

1. dsldLinear, dsldLogit, and dsldML 
2. dsldTakeALookAround
3. dsldHunting (both C/O hunting functions)
4. dsldFrequencybyS  
5. dsldMatchedAte

In [1]:
# load necessary libraries
import pandas as pd
import numpy as np
import sys, os

# r-conversions
import rpy2.robjects as ro
from rpy2.robjects.packages import importr

# test accuracy
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score

# load dsld package
R = ro.r
R('if ("dsld" %in% loadedNamespaces()) detach("package:dsld", unload=TRUE)')
R('if (!requireNamespace("devtools", quietly=TRUE)) install.packages("devtools")')
R('devtools::load_all("~/Desktop/dsld", quiet=TRUE)')

parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(os.path.join(parent_dir, "dsld"))
from Utils import preprocess_data, read_data

dsld = importr("dsld")

Error importing in API mode: ImportError("dlopen(/Users/adityamittal/miniconda3/lib/python3.12/site-packages/_rinterface_cffi_api.abi3.so, 0x0002): Library not loaded: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.dylib\n  Referenced from: <1F2D8792-55A5-3398-8569-DDFF21A19C12> /Users/adityamittal/miniconda3/lib/python3.12/site-packages/_rinterface_cffi_api.abi3.so\n  Reason: tried: '/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.dylib' (no such file), '/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.dylib' (no such file)")
Trying to import in ABI mode.
R callback write-console: 




*********************



  Navigating dsld:

      Type vignette("Quick_Start",package="dsld") for a quick overview!

      Type vignette("Function_List",package="dsld") for a categorized functi

In [2]:
### dsldLinear, dsldLogit, dsldML examples 

from dsldPyLinear import (
    dsldPyLinear, dsldPyLinearSummary, dsldPyLinearPredict, dsldPyLinearVcov, dsldPyLinearCoef, dsldPyLinearGetData
)

from dsldPyLogit import (
    dsldPyLogit, dsldPyLogitSummary, dsldPyLogitPredict, dsldPyLogitVcov, dsldPyLogitCoef, dsldPyLogitGetData
) 

from dsldPyML import dsldPyML

### data preprocessing

### most dsldPy functions require a R data.frame object as input (NOT pandas dataframe)
### the preprocessing is done in the Utils.py file by the function preprocess_data
### user needs to manually provide the categorical and numerical features (list)
### the function preprocess_data returns a R data.frame object -> this is the required input for the dsldPy functions

# svcensus data
df = read_data('~/Desktop/dsld/data/svcensus.RData')

# preprocess data
cat_features = ['educ', 'occ', 'gender']
num_features= ['age', 'wageinc', 'wkswrkd']
svcensus = preprocess_data(df, cat_features, num_features)

df_10 = df.head(2)
df_10 = df_10[['age', 'educ', 'occ', 'wkswrkd']]
cat_features = ['educ', 'occ']
num_features = ['age','wkswrkd']
svcensus_comparisons_points = preprocess_data(df_10, cat_features, num_features)

# compas1 data
df = read_data('~/Desktop/dsld/data/compas1.RData')

# preprocess data
cat_features = ["sex", "two_year_recid", "race"]
num_features = ["age","juv_fel_count","decile_score","juv_misd_count","juv_other_count","priors_count","c_jail_in","c_jail_out","c_offense_date","screening_date","in_custody","out_custody"]
compas1 = preprocess_data(df, cat_features, num_features)

df_10 = df.head(2)
df_10 = df_10[["sex", "age","juv_fel_count","decile_score","juv_misd_count","juv_other_count","priors_count","c_jail_in","c_jail_out","c_offense_date","screening_date","in_custody","out_custody"]]
cat_features = ["sex"]
num_features = ["age","juv_fel_count","decile_score","juv_misd_count","juv_other_count","priors_count","c_jail_in","c_jail_out","c_offense_date","screening_date","in_custody","out_custody"]
compas1_comparisons_points = preprocess_data(df_10, cat_features, num_features)

In [3]:
### 1. ------------------------------ dsldPyLinear/dsldPyLogit/dsldPyML ------------------------------

## dsldPyLinear - interactions = True
a = dsldPyLinear(svcensus, 'wageinc', 'gender', True)

### the object a is a list of R objects --- can be accessed using the following functions 
### note that directly looking at 'a' might not be helpful --- use the following functions to access the results and use in python

# uncomment to see the results of the functions
# dsldPyLinearSummary(a)   
# dsldPyLinearCoef(a)
# dsldPyLinearVcov(a)
# dsldPyLinearGetData(a)

# predict()
preds = dsldPyLinearPredict(a, svcensus_comparisons_points)
preds

### can also work with interactions = False as well
a2 = dsldPyLinear(svcensus, 'wageinc', 'gender', False)

# dsldPyLinearSummary(a2)   
# dsldPyLinearCoef(a2)
# dsldPyLinearVcov(a2)
# dsldPyLinearGetData(a2)

## the predict() method requires newData to include S (which is not done)

[1] "No user sComparisonPts supplied. The following rows \n                    are selected: 6482,13714,4534,3025,15140"


In [4]:
# dsldPyLogit - interactions = False

b = dsldPyLogit(compas1, 'two_year_recid', 'race', interactions=True, yesYVal = "Yes")

### the object b is a list of R objects --- can be accessed using the following functions 
### note that directly looking at 'b' might not be helpful --- use the following functions to access the results and use in python

# uncomment to see the results of the functions
# dsldPyLogitSummary(b)
# dsldPyLogitCoef(b)
# dsldPyLogitVcov(b)
# dsldPyLogitGetData(b)

# predict()
preds = dsldPyLogitPredict(b, compas1_comparisons_points)
preds

### can also work with interactions = False as well
b2 = dsldPyLogit(compas1, 'two_year_recid', 'race', interactions=False, yesYVal = "Yes")

# dsldPyLogitSummary(b2)
# dsldPyLogitCoef(b2)
# dsldPyLogitVcov(b2)
# dsldPyLogitGetData(b2)

[1] "No user sComparisonPts supplied. The following rows \n                    are selected: 3270,4214,2349,373,1896"


R callback write-console: In addition:   
  
R callback write-console: 1: glm.fit: algorithm did not converge 
  
R callback write-console: 2: glm.fit: fitted probabilities numerically 0 or 1 occurred 
  
R callback write-console: 3: glm.fit: fitted probabilities numerically 0 or 1 occurred 
  


In [11]:
## dsldPyML - returns testAcc for each sLevel and dataframe (excluding yName and sName) of predictions
### works for several qeML functions as far as I've tried
c = dsldPyML(svcensus, 'wageinc', 'gender', 'qeKNN',sComparisonPts='rand5')
print(c)

({'testAcc: female': 20348.978775510197, 'testAcc: male': 28391.778000000002},              age      educ  occ  wkswrkd   female     male
3327   36.966051  zzzOther  102     45.0  59448.0  80720.0
3491   46.395153  zzzOther  100     52.0  59076.0  68960.0
16059  36.363451  zzzOther  101     52.0  47636.0  71524.0
17627  30.599976  zzzOther  102     52.0  60440.0  80520.0
10090  41.377617  zzzOther  102     52.0  58084.0  86920.0)


In [None]:
### 2. ------------------------------ dsldTakeALookAround ------------------------------
from dsldPyTakeALookAround import dsldPyTakeALookAround
dsldPyTakeALookAround(svcensus, 'wageinc', 'gender', 4)   


Unnamed: 0,Feature Names,a,b,c
1,age,31249.319276,30951.621761,0.269
2,educ,32598.532572,30154.447135,0.247
3,occ,30206.443122,33318.131905,0.237
4,wkswrkd,26905.509363,29081.132847,0.249
5,"age,educ",30835.132566,30388.810233,0.234
6,"age,occ",30562.037098,30228.283929,0.271
7,"age,wkswrkd",25880.922267,26232.128175,0.253
8,"educ,occ",30600.412121,30243.119416,0.234
9,"educ,wkswrkd",27652.15813,25762.518054,0.243
10,"occ,wkswrkd",26523.200422,26277.287928,0.221


In [None]:
### 3. ------------------------------ dsldHunting ------------------------------

from dsldPyHunting import dsldPyCHunting, dsldPyOHunting

# dsldPyCHunting - C-Hunting
# 'a' is a python dictionary listing columns of dataset in order of importance of Y and S
a = dsldPyCHunting(svcensus, 'wageinc', 'gender')
print(a)

# # dsldPyOHunting - O-Hunting
### 'b' is a list of R objects
### note that directly looking at 'b' might not be helpful --- use ro.r("print")(b) to print the results in R
b = dsldPyOHunting(svcensus, 'wageinc', 'gender')

# to correctly print the results in R
# ro.r("print")(b)

R callback write-console: Loading required namespace: randomForest
  


{'impForY': [('wkswrkd', 718747437.8052691), ('age', 175997763.50289333), ('occ', 82085489.27041695), ('educ', 79027198.5980331)], 'impForS': [('occ', 0.01970223595878264), ('age', 0.0011118602810904878), ('wkswrkd', 0.0009673452497156447), ('educ', 0.0005858137724793742)]}


R callback write-console: In addition:   
R callback write-console: 
  


                       age    educ.14     educ.16 educ.zzzOther    occ.100
gender.female  0.009480006 -0.0114536 -0.04627929     0.0307762  0.1126074
gender.male   -0.009480006  0.0114536  0.04627929    -0.0307762 -0.1126074
                  occ.101     occ.102     occ.106     occ.140    occ.141
gender.female  0.01504479 -0.01446593  0.06434029 -0.04078825 -0.1391859
gender.male   -0.01504479  0.01446593 -0.06434029  0.04078825  0.1391859
                  wkswrkd
gender.female -0.03547912
gender.male    0.03547912


In [None]:
### 4. ------------------------------ dsldFrequencybyS ------------------------------
from dsldPyFrequencybyS import dsldPyFrequencybyS
dsldPyFrequencybyS(svcensus, 'educ', 'gender')

Unnamed: 0,Frequency of zzzOther,Frequency of 14,Frequency of 16
female,0.206805,0.020986,0.772209
male,0.217758,0.041101,0.741141


In [None]:
### 5. ------------------------------ dsldMatchedAte ------------------------------
### this is a mere print function --- there is no return value
### the R object was too messy to convert to python --- so just left it printed in R
from dsldPyMatching import dsldPyMatchedATE
dsldPyMatchedATE(compas1, 'two_year_recid', 'race', 'Caucasian')


Estimate...  -0.022545 
SE.........  0.0071265 
T-stat.....  -3.1635 
p.val......  0.0015587 

Original number of observations..............  5855 
Original number of treated obs...............  2055 
Matched number of observations...............  5855 
Matched number of observations  (unweighted).  5855 

