In [1]:
# General-purpose libraries used throughout the notebook.
import pandas as pd
import numpy as np
import sys

# Put the project's sibling source directories at the front of the module
# search path so the local modules below can be imported. The paths are
# relative, so they depend on the notebook's working directory.
sys.path.insert(0, '../visualize/')
sys.path.insert(0, '../models/')
sys.path.insert(0, '../features/')

# Project-local helpers. Later cells call build_features, load_features and
# fit_linear, which presumably come from these modules.
# NOTE(review): star imports hide where each name is defined, and a later
# star import can silently shadow a name from an earlier one -- consider
# importing the needed names explicitly.
from tcors_functions import *
from build_features import *
from counterprop import *
from linear_regression import *
from visualize import *

# Seed handed to fit_linear (as random_state) so repeated runs are comparable.
random_state = 42

# Bin width handed to build_features (as step) and to fit_linear (as bin_step).
bin_step = 0.25

Prepare Data

In [2]:
# Build the model features with the configured bin width.
# NOTE(review): presumably this produces the 'model_features.pkl' file read
# by the next cell -- confirm in build_features.
build_features(step=bin_step)

In [3]:
# Load the prepared feature table and preview its first five rows.
df = load_features('model_features.pkl')
df.head(n=5)

Unnamed: 0,screen_id,week,project,site,dose,screen_sex,screen_age,carmine_nicotine,prp_change,prp_change_bin,prp_change_bin_label
37,J-A003,week12,project 1,uvm,15.8,Female,32,16.7,0.4875,"(0.25, 0.5]",5
50,J-A004,week12,project 1,uvm,0.4,Female,38,15.9,0.725225,"(0.5, 0.75]",6
76,J-A007,week12,project 1,uvm,15.8,Female,44,16.2,0.971223,"(0.75, 1.0]",7
89,J-A010,week12,project 1,uvm,15.8,Female,28,15.8,0.895522,"(0.75, 1.0]",7
117,J-A015,week12,project 1,uvm,15.8,Female,25,16.9,1.315152,"(1.25, 1.5]",9


In [4]:
# Column groups: targets and predictors, split into categorical and numeric.
cat_targets = ['prp_change_bin_label']
num_targets = ['prp_change']
cat_predictors = ['project', 'site', 'dose', 'screen_sex']
num_predictors = ['screen_age', 'carmine_nicotine']

X = df[cat_predictors + num_predictors]
y = df[cat_targets + num_targets]

# Predictor frame for the model that must not see the nicotine measurement.
# DataFrame.drop returns a new frame (it is not in-place by default), so the
# result has to be assigned; calling drop and discarding the result would
# leave 'carmine_nicotine' in X_without_nic.
X_without_nic = X.drop(columns=['carmine_nicotine'])
assert 'carmine_nicotine' not in X_without_nic.columns

# Matching list of numeric predictor names without the nicotine column.
num_predictors_withoutnic = num_predictors[:]
num_predictors_withoutnic.remove('carmine_nicotine')


## Linear Regression

In [5]:
# Keyword options common to both fits, so the two models differ only in the
# predictor frame and the numeric predictor list they are given.
fit_options = {'bin_step': bin_step, 'random_state': random_state}

# Fit intended to exclude the nicotine measurement: 'carmine_nicotine' is
# absent from num_predictors_withoutnic.
without_nic = fit_linear(
    X_without_nic, y[['prp_change']], num_predictors_withoutnic, cat_predictors,
    **fit_options
)

# Fit using every predictor, including 'carmine_nicotine'.
with_nic = fit_linear(
    X, y[['prp_change']], num_predictors, cat_predictors,
    **fit_options
)

Performance when treated as a regression problem (cross-validation and test R²)

In [6]:
# Tag each model's score table with its name and stack them for comparison.
# .assign returns a labelled copy, so the avg_scores frames stored on the fit
# results are not modified by this reporting cell (assigning a column directly
# on obj.avg_scores would mutate the fit result itself).
# Assumes avg_scores is a pandas DataFrame, as the rendered table suggests.
wo_nic_avg = without_nic.avg_scores.assign(label='without_nic')
w_nic_avg = with_nic.avg_scores.assign(label='with_nic')

pd.concat([wo_nic_avg, w_nic_avg])

Unnamed: 0,mean_cv_r2,sd_cv_r2,test_r2,label
1,0.09667,0.083564,0.155133,without_nic
1,0.084482,0.085027,0.138708,with_nic


Performance when treated as a classifier (F1 score)

In [12]:
# One row per model: its clf_mets value (shown under the "f1" column) and a
# label naming the model.
pd.DataFrame(
    [
        {"f1": without_nic.clf_mets, "label": "without_nic"},
        {"f1": with_nic.clf_mets, "label": "with_nic"},
    ]
)


Unnamed: 0,f1,label
0,0.165049,without_nic
1,0.174757,with_nic
