In [690]:
import pandas as pd
import numpy as np
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import ElasticNet

In [552]:
readRDS = robjects.r['readRDS']
data = readRDS('Training_Data/master_processed_training_data.RDS')

In [553]:
with localconverter(robjects.default_converter + pandas2ri.converter):
    df = robjects.conversion.rpy2py(data)

In [554]:
data2 = readRDS('Prediction_Data/master_processed_prediction_data.RDS')

In [555]:
with localconverter(robjects.default_converter + pandas2ri.converter):
    pred = robjects.conversion.rpy2py(data2)

In [636]:
titers = df['abtiter_wide']
titers.keys()

('metadata', 'raw_data', 'normalized_data', 'batchCorrected_data')

In [637]:
titers['raw_data'].head()

Unnamed: 0,specimen_id,IgG_PT,IgG_PRN,IgG_FHA,IgG1_PT,IgG1_PRN,IgG1_FHA,IgG1_TT,IgG1_DT,IgG1_OVA,...,IgG3_FHA,IgG3_TT,IgG3_DT,IgG3_OVA,IgG4_PT,IgG4_PRN,IgG4_FHA,IgG4_TT,IgG4_DT,IgG4_OVA
1,1.0,3.736992,2.60235,34.050956,7.334714,2.174783,3.013252,1.428852,2.389153,0.665203,...,0.624148,1.0,3.358159,1.865388,1.061706,11.673594,0.880611,3.29005,1.232849,2.622675
2,19.0,1.096366,7.652635,1.096457,1.424098,3.161591,1.287515,1.37739,1.523941,33.771912,...,1.280732,1.0,0.856594,1.119233,1.0,0.733287,0.057114,0.02482,0.003253,0.053981
3,27.0,2.046671,5.670403,1.048276,3.888604,2.591155,1.269821,1.675259,2.022924,5.777047,...,6.265944,2.33677,0.856594,1.0,6.582579,3.261863,1.089128,1.635454,0.634256,2.021985
4,37.0,3.798007,5.268274,0.084437,7.456313,2.760065,2.864834,1.537432,2.250237,4.130732,...,41.809687,2.514246,14.818885,5.446934,44.804003,1.112574,24.353645,0.920018,1.879391,1.56932
5,45.0,0.213328,0.090176,0.37929,0.084132,0.025479,0.654192,0.87492,0.369367,10.452881,...,7.505362,1.0,0.856594,13.206949,1.0,0.208993,0.98487,3.565218,0.676574,7.648106


In [638]:
subject = df['subject_specimen']
subject.head()

Unnamed: 0,specimen_id,subject_id,actual_day_relative_to_boost,planned_day_relative_to_boost,specimen_type,visit,infancy_vac,biological_sex,ethnicity,race,year_of_birth,date_of_boost,dataset,timepoint
1,1.0,1.0,-3.0,0.0,Blood,1.0,wP,Female,Not Hispanic or Latino,White,5844.0,17056.0,2020_dataset,0.0
2,2.0,1.0,1.0,1.0,Blood,2.0,wP,Female,Not Hispanic or Latino,White,5844.0,17056.0,2020_dataset,1.0
3,3.0,1.0,3.0,3.0,Blood,3.0,wP,Female,Not Hispanic or Latino,White,5844.0,17056.0,2020_dataset,3.0
4,4.0,1.0,7.0,7.0,Blood,4.0,wP,Female,Not Hispanic or Latino,White,5844.0,17056.0,2020_dataset,7.0
5,5.0,1.0,11.0,14.0,Blood,5.0,wP,Female,Not Hispanic or Latino,White,5844.0,17056.0,2020_dataset,14.0


In [639]:
ser = subject['year_of_birth']

new = pd.to_timedelta(ser, unit='D') + pd.Timestamp('1970-1-1') 
new.head(5)

1   1986-01-01
2   1986-01-01
3   1986-01-01
4   1986-01-01
5   1986-01-01
Name: year_of_birth, dtype: datetime64[ns]

In [640]:
IgG = subject.merge(titers['raw_data'], on='specimen_id')

In [641]:
IgG.columns

Index(['specimen_id', 'subject_id', 'actual_day_relative_to_boost',
       'planned_day_relative_to_boost', 'specimen_type', 'visit',
       'infancy_vac', 'biological_sex', 'ethnicity', 'race', 'year_of_birth',
       'date_of_boost', 'dataset', 'timepoint', 'IgG_PT', 'IgG_PRN', 'IgG_FHA',
       'IgG1_PT', 'IgG1_PRN', 'IgG1_FHA', 'IgG1_TT', 'IgG1_DT', 'IgG1_OVA',
       'IgG2_PT', 'IgG2_PRN', 'IgG2_FHA', 'IgG2_TT', 'IgG2_DT', 'IgG2_OVA',
       'IgG3_PT', 'IgG3_PRN', 'IgG3_FHA', 'IgG3_TT', 'IgG3_DT', 'IgG3_OVA',
       'IgG4_PT', 'IgG4_PRN', 'IgG4_FHA', 'IgG4_TT', 'IgG4_DT', 'IgG4_OVA'],
      dtype='object')

In [642]:
IgG = IgG[['subject_id',
           'specimen_id',
           'infancy_vac',
           'biological_sex',
           'year_of_birth',
           'date_of_boost',
           'actual_day_relative_to_boost',
           'planned_day_relative_to_boost',
           'ethnicity',
           'race',
           'dataset',
           'specimen_type',
           'visit',
           'IgG_PT',
           'IgG1_PT',
           'IgG2_PT',
           'IgG3_PT',
           'IgG4_PT']]

In [643]:
IgG.head()

Unnamed: 0,subject_id,specimen_id,infancy_vac,biological_sex,year_of_birth,date_of_boost,actual_day_relative_to_boost,planned_day_relative_to_boost,ethnicity,race,dataset,specimen_type,visit,IgG_PT,IgG1_PT,IgG2_PT,IgG3_PT,IgG4_PT
0,1.0,1.0,wP,Female,5844.0,17056.0,-3.0,0.0,Not Hispanic or Latino,White,2020_dataset,Blood,1.0,3.736992,7.334714,1.0,1.0,1.061706
1,1.0,3.0,wP,Female,5844.0,17056.0,3.0,3.0,Not Hispanic or Latino,White,2020_dataset,Blood,3.0,2.255534,8.284477,1.260385,1.0,1.0
2,1.0,4.0,wP,Female,5844.0,17056.0,7.0,7.0,Not Hispanic or Latino,White,2020_dataset,Blood,4.0,3.250369,6.557696,1.0,1.0,1.415608
3,1.0,5.0,wP,Female,5844.0,17056.0,11.0,14.0,Not Hispanic or Latino,White,2020_dataset,Blood,5.0,10.874112,19.38695,1.08033,1.0,2.689656
4,1.0,6.0,wP,Female,5844.0,17056.0,32.0,30.0,Not Hispanic or Latino,White,2020_dataset,Blood,6.0,12.51386,24.891105,1.08033,1.251655,2.123412


In [644]:
IgG_d14 = IgG[IgG['planned_day_relative_to_boost'] == 14.0]
IgG_d0 = IgG[IgG['planned_day_relative_to_boost'] == 0.0][['subject_id', 'IgG_PT']]
IgG_d3 = IgG[IgG['planned_day_relative_to_boost'] == 3.0][['subject_id', 'IgG_PT']]
IgG_d7 = IgG[IgG['planned_day_relative_to_boost'] == 7.0][['subject_id', 'IgG_PT']]
IgG_d0 = IgG_d0.rename(columns={'subject_id': 'subject_id', 'IgG_PT': 'IgG_PT_d0'})
IgG_d3 = IgG_d3.rename(columns={'subject_id': 'subject_id', 'IgG_PT': 'IgG_PT_d3'})
IgG_d7 = IgG_d7.rename(columns={'subject_id': 'subject_id', 'IgG_PT': 'IgG_PT_d7'})
IgG_d14 = IgG_d14.merge(IgG_d0, on='subject_id')
#IgG_d14 = IgG_d14.merge(IgG_d3, on='subject_id')
#IgG_d14 = IgG_d14.merge(IgG_d7, on='subject_id')
#IgG_d14_add = IgG_d14.merge(IgG_d3, on='subject_id')
#IgG_d14_add = IgG_d14_add.merge(IgG_d7, on='subject_id')

In [645]:
IgG_d14 = IgG_d14[['infancy_vac', 'biological_sex', 'year_of_birth', 'ethnicity', 'race', 'visit', 'IgG_PT_d0', 'IgG_PT']]

In [646]:
#IgG_d14_add = IgG_d14_add[['infancy_vac', 'biological_sex', 'year_of_birth', 'ethnicity', 'race', 'visit', 'IgG_PT_d0', 'IgG_PT_d3', 'IgG_PT_d7', 'IgG_PT']]

In [647]:
IgG_d14.head()

Unnamed: 0,infancy_vac,biological_sex,year_of_birth,ethnicity,race,visit,IgG_PT_d0,IgG_PT
0,wP,Female,5844.0,Not Hispanic or Latino,White,5.0,3.736992,10.874112
1,wP,Female,4748.0,Unknown,White,5.0,1.096366,7.041547
2,wP,Male,6574.0,Not Hispanic or Latino,Asian,5.0,2.046671,7.896541
3,wP,Male,7670.0,Not Hispanic or Latino,Asian,5.0,3.798007,5.327203
4,wP,Female,6574.0,Not Hispanic or Latino,White,5.0,0.213328,9.128886


In [648]:
IgG_d14['infancy_vac'] = IgG_d14['infancy_vac'].map({'wP':0, 'aP':1})

In [649]:
IgG_d14['biological_sex'] = IgG_d14['biological_sex'].map({'Female':0, 'Male':1})

In [650]:
IgG_d14['ethnicity'] = IgG_d14['ethnicity'].map({'Not Hispanic or Latino':0, 
                                                   'Hispanic or Latino':1,
                                                   'Unknown':2})

In [651]:
IgG_d14['race'] = IgG_d14['race'].map({'White':0, 
                                       'Asian':1,
                                       'Unknown or Not Reported': 2,
                                       'More Than One Race': 2,
                                       'Black or African American': 3,
                                       'Native Hawaiian or Other Pacific Islander': 4,
                                       'American Indian/Alaska Native':5
                                      })
IgG_d14.head()

Unnamed: 0,infancy_vac,biological_sex,year_of_birth,ethnicity,race,visit,IgG_PT_d0,IgG_PT
0,0,0,5844.0,0,0,5.0,3.736992,10.874112
1,0,0,4748.0,2,0,5.0,1.096366,7.041547
2,0,1,6574.0,0,1,5.0,2.046671,7.896541
3,0,1,7670.0,0,1,5.0,3.798007,5.327203
4,0,0,6574.0,0,0,5.0,0.213328,9.128886


In [652]:
#IgG_d14 = IgG_d14[['infancy_vac', 'biological_sex', 'year_of_birth', 'ethnicity', 'race', 'visit', 'IgG_PT']

In [653]:
X = IgG_d14.drop('IgG_PT', axis = 1)
y = IgG_d14['IgG_PT']

In [654]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [655]:
X_train

Unnamed: 0,infancy_vac,biological_sex,year_of_birth,ethnicity,race,visit,IgG_PT_d0
14,0,0,3652.0,1,0,5.0,9.970320
84,1,1,10227.0,2,2,5.0,0.901973
66,1,1,10227.0,0,5,5.0,0.385377
9,0,1,4383.0,0,1,5.0,0.132229
16,0,1,8766.0,0,1,5.0,3.478381
...,...,...,...,...,...,...,...
17,0,0,4018.0,0,0,5.0,4.857254
10,1,1,9862.0,0,0,5.0,1.581234
35,0,0,8766.0,0,1,5.0,0.838002
74,0,0,8401.0,0,0,5.0,0.993348


In [698]:
reg = LinearRegression().fit(X, y)

In [703]:
reg.score(X, y)

0.15758053419127804

In [704]:
Y_pred = reg.predict(X)

In [705]:
mse = mean_squared_error(y, Y_pred)
mae = mean_absolute_error(y, Y_pred)
r2 = r2_score(y, Y_pred)
mse, mae, r2

(21.785056742486766, 3.6549103815789135, 0.15758053419127804)

In [694]:
model = ElasticNet()
model.fit(X, y)

In [695]:
Y_pred = model.predict(X)

In [696]:
mse = mean_squared_error(y, Y_pred)
mae = mean_absolute_error(y, Y_pred)
r2 = r2_score(y, Y_pred)
mse, mae, r2

(22.248644536283994, 3.7125745331813738, 0.13965377888269737)

In [707]:
regr_1 = DecisionTreeRegressor(max_depth=2)
regr_2 = DecisionTreeRegressor(max_depth=5)
regr_3 = DecisionTreeRegressor(max_depth=10)
regr_4 = DecisionTreeRegressor(max_depth=20)
regr_1.fit(X, y)
regr_2.fit(X, y)
regr_3.fit(X, y)
regr_4.fit(X, y)

In [708]:
y_1 = regr_1.predict(X)
y_2 = regr_2.predict(X)
y_3 = regr_3.predict(X)
y_4 = regr_4.predict(X)

In [709]:
mse = mean_squared_error(y, y_1)
mae = mean_absolute_error(y, y_1)
r2 = r2_score(y, y_1)
mse, mae, r2

(15.927582684728465, 3.008124622529205, 0.3840867225869997)

In [710]:
mse = mean_squared_error(y, y_2)
mae = mean_absolute_error(y, y_2)
r2 = r2_score(y, y_2)
mse, mae, r2

(11.876035290340502, 2.2756843781814236, 0.5407584463297501)

In [711]:
mse = mean_squared_error(y, y_3)
mae = mean_absolute_error(y, y_3)
r2 = r2_score(y, y_3)
mse, mae, r2

(0.5575911007495251, 0.3323150731788808, 0.9784381742592839)

In [712]:
mse = mean_squared_error(y, y_4)
mae = mean_absolute_error(y, y_4)
r2 = r2_score(y, y_4)
mse, mae, r2

(0.0, 0.0, 1.0)

## 2022 Validation Predictions

In [713]:
titers_pred = pred['abtiter']
titers_pred = titers_pred['processed_similar_to_training'].T.reset_index().rename(columns={'index':'specimen_id'})
titers_pred['specimen_id'] = titers_pred['specimen_id'].astype(int)

In [714]:
subject_pred = pred['subject_specimen']

In [715]:
subject_pred['specimen_id'] = subject_pred['specimen_id'].astype(int)

In [716]:
IgG = subject_pred.merge(titers_pred, on='specimen_id', how = 'outer')

In [717]:
IgG.columns

Index(['specimen_id', 'subject_id', 'actual_day_relative_to_boost',
       'planned_day_relative_to_boost', 'specimen_type', 'visit',
       'infancy_vac', 'biological_sex', 'ethnicity', 'race', 'year_of_birth',
       'date_of_boost', 'dataset', 'timepoint', 'IgG_FHA', 'IgG_PRN', 'IgG_PT',
       'IgG1_DT', 'IgG1_FHA', 'IgG1_OVA', 'IgG1_PRN', 'IgG1_PT', 'IgG1_TT',
       'IgG2_DT', 'IgG2_FHA', 'IgG2_OVA', 'IgG2_PRN', 'IgG2_PT', 'IgG2_TT',
       'IgG3_DT', 'IgG3_FHA', 'IgG3_OVA', 'IgG3_PRN', 'IgG3_PT', 'IgG3_TT',
       'IgG4_DT', 'IgG4_FHA', 'IgG4_OVA', 'IgG4_PRN', 'IgG4_PT', 'IgG4_TT'],
      dtype='object')

In [718]:
IgG = IgG[['subject_id',
           'specimen_id',
           'infancy_vac',
           'biological_sex',
           'year_of_birth',
           'date_of_boost',
           'actual_day_relative_to_boost',
           'planned_day_relative_to_boost',
           'ethnicity',
           'race',
           'dataset',
           'specimen_type',
           'visit',
           'IgG_PT',
           'IgG1_PT',
           'IgG2_PT',
           'IgG3_PT',
           'IgG4_PT']]

In [719]:
IgG = IgG[IgG['planned_day_relative_to_boost'].isin([0])]# == True

In [721]:
IgG_d= IgG[['infancy_vac', 'biological_sex', 'year_of_birth', 'ethnicity', 'race', 'visit', 'IgG_PT']].rename(columns={'IgG_PT': 'IgG_PT_d0'})

In [722]:
IgG_d['infancy_vac'] = IgG_d['infancy_vac'].map({'wP':0, 'aP':1})

In [723]:
IgG_d['biological_sex'] = IgG_d['biological_sex'].map({'Female':0, 'Male':1})

In [724]:
IgG_d['ethnicity'] = IgG_d['ethnicity'].map({'Not Hispanic or Latino':0, 
                                                   'Hispanic or Latino':1,
                                                   'Unknown':2})

In [725]:
IgG_d['race'] = IgG_d['race'].map({'White':0, 
                                       'Asian':1,
                                       'Unknown or Not Reported': 2,
                                       'More Than One Race': 2,
                                       'Black or African American': 3,
                                       'Native Hawaiian or Other Pacific Islander': 4,
                                       'American Indian/Alaska Native':5
                                      })
IgG_d.head()

Unnamed: 0,infancy_vac,biological_sex,year_of_birth,ethnicity,race,visit,IgG_PT_d0
2,0,1,5844.0,0,0,3.0,1.060618
12,0,0,8401.0,0,0,5.0,1.309938
22,1,0,10592.0,1,2,5.0,1.196227
32,1,0,11323.0,0,0,5.0,0.967752
42,1,1,12053.0,0,0,3.0,1.651583


In [726]:
y_1 = regr_1.predict(IgG_d)
y_2 = regr_2.predict(IgG_d)
y_3 = regr_3.predict(IgG_d)
y_4 = regr_4.predict(IgG_d)

In [733]:
y_3

array([10.13880592, 14.66110672,  8.86373049,  1.44478329, 14.8636346 ,
       14.8636346 , 16.29933481,  5.05166119,  7.90473632,  3.42081988,
        2.09827627,  7.37472284,  1.25896521,  2.88217808, 14.66110672,
       14.8636346 ,  0.9488632 , 10.56383046, 14.8636346 ,  2.25639777,
        4.09259703])

In [734]:
rank = [abs(sorted(y_3).index(x)-19) for x in y_3]
rank

[7, 5, 8, 17, 3, 3, 1, 11, 9, 13, 16, 10, 18, 14, 5, 3, 19, 6, 3, 15, 12]

In [735]:
IgG['1.1) IgG-PT-D14-titer-Rank'] = rank

In [738]:
ser = IgG['year_of_birth']

IgG['year_of_birth'] = pd.to_timedelta(ser, unit='D') + pd.Timestamp('1970-1-1') 

In [739]:
IgG[['subject_id', 'year_of_birth', 'biological_sex', 'infancy_vac', '1.1) IgG-PT-D14-titer-Rank']]

Unnamed: 0,subject_id,year_of_birth,biological_sex,infancy_vac,1.1) IgG-PT-D14-titer-Rank
2,97.0,1986-01-01,Male,wP,7
12,98.0,1993-01-01,Female,wP,5
22,99.0,1999-01-01,Female,aP,8
32,100.0,2001-01-01,Female,aP,17
42,101.0,2003-01-01,Male,aP,3
52,102.0,2003-01-01,Male,aP,3
62,103.0,1994-01-01,Female,wP,1
72,104.0,1989-01-01,Female,wP,11
81,105.0,1994-01-01,Female,wP,9
91,106.0,1996-01-01,Female,aP,13
