Aggregating data to the level of individuals

In [14]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
from matplotlib import pyplot as plt
import pandas as pd
from bsmcalls import individuals
from bsmcalls import readVCF
from bsmcalls import preprocessing
import statsmodels.api as sm
import fwsel
import re

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Import calls and clinical data and filter for NeuN+ samples

In [2]:
calls, clin = individuals.get_data(merge=False)
calls = calls.loc(axis=0)[:, 'NeuN_pl']

In [3]:
aggcalls = individuals.agg_calls(calls)
data = pd.concat([aggcalls, clin], axis=1)

In [4]:
aggcalls

Variable,nCalls,AF,AF,BaseQRankSum,BaseQRankSum,DP,DP,FS,FS,SOR,...,FILTER/PASS,culprit,culprit,culprit,ChromatinState_DLPFC,ChromatinState_DLPFC,ChromatinState_DLPFC,evolConstrain,evolConstrain,evolConstrain
Transform,count,mean,std,mean,std,mean,std,mean,std,mean,...,entropy,marg_mode,frequency,entropy,marg_mode,frequency,entropy,marg_mode,frequency,entropy
Individual ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
CMC_MSSM_027,56,0.094286,0.110055,1.394286,2.469961,202.892857,27.324766,2.737946,3.867827,0.981536,...,0.872250,QD,0.946429,0.242999,Quies,0.732143,0.965981,0,0.910714,0.300882
CMC_MSSM_055,49,0.046939,0.068380,2.242102,2.001271,255.020408,25.005408,2.351408,2.822638,1.120061,...,0.902886,QD,0.979592,0.099623,Quies,0.632653,1.152910,0,0.979592,0.099623
CMC_MSSM_056,9,0.082222,0.040552,1.131333,2.036603,104.222222,10.802520,1.398000,1.819863,0.813111,...,0.848686,QD,0.888889,0.348832,Quies,0.666667,0.848686,0,1.000000,0.000000
CMC_MSSM_069,12,0.106667,0.032287,-0.287417,2.675964,58.916667,6.141636,12.272083,11.296428,2.714083,...,1.314374,QD,1.000000,0.000000,Quies,0.750000,0.836988,0,0.916667,0.286836
CMC_MSSM_097,96,0.078125,0.131135,1.063500,2.669541,372.781250,58.511034,1.965771,2.715713,0.887000,...,1.254138,QD,0.875000,0.487732,Quies,0.687500,1.030783,0,0.937500,0.233792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CMC_PITT_098,26,0.102308,0.059686,0.961385,1.958133,161.230769,14.475656,2.299462,2.279725,0.935885,...,0.271189,QD,1.000000,0.000000,Quies,0.615385,1.318487,0,0.923077,0.271189
CMC_PITT_101,13,0.123077,0.092681,1.000154,1.917042,130.769231,12.410707,1.480077,1.671059,0.823077,...,0.687092,QD,0.769231,0.687092,Quies,0.769231,0.540204,0,1.000000,0.000000
CMC_PITT_113,33,0.103030,0.159462,0.546909,3.012843,251.727273,22.765479,1.648909,1.838384,0.977273,...,1.047320,QD,0.969697,0.135794,Quies,0.666667,1.110038,0,1.000000,0.000000
CMC_PITT_117,17,0.096471,0.086958,1.080529,1.835480,182.411765,19.419896,3.010176,2.533470,1.063294,...,0.362211,QD,0.941176,0.223718,Quies,0.823529,0.578325,0,1.000000,0.000000


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 85 entries, CMC_MSSM_027 to CMC_PITT_118
Data columns (total 57 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   (nCalls, count)                    85 non-null     int64   
 1   (AF, mean)                         85 non-null     float64 
 2   (AF, std)                          85 non-null     float64 
 3   (BaseQRankSum, mean)               85 non-null     float64 
 4   (BaseQRankSum, std)                85 non-null     float64 
 5   (DP, mean)                         85 non-null     float64 
 6   (DP, std)                          85 non-null     float64 
 7   (FS, mean)                         85 non-null     float64 
 8   (FS, std)                          85 non-null     float64 
 9   (SOR, mean)                        85 non-null     float64 
 10  (SOR, std)                         85 non-null     float64 
 11  (VQSLOD, mean)                 

## Modeling

First let's preprocess the data

In [6]:
data1 = preprocessing.preprocess(data)
# we remove the marginal modes since these are all degenerate variables
cols2drop = [y for y in data1.columns if re.match('.*_marg_mode', y)]
data1 = data1.drop(columns=cols2drop)

### Forward variable selection

In [7]:
best1 = fwsel.forward_selected(data1, 'Dx')

In [8]:
best1.model.formula

'Dx ~ AntipsychAtyp + YearofAutopsy + AntipsychTyp + EV3 + Institution + Benzodiazepines + causeOfDeath + Ethnicity + SOR_std + DP_std + DP_mean + ageOfDeath + evolConstrain_entropy + Alcohol + ReportedGender + REF_frequency + VQSLOD_std + BrainWeightingrams + BaseQRankSum_mean + REF_entropy + FILTERPASS_entropy + Antidepress + szdbCNVcount_mean + EV1 + 1'

In [9]:
best1.summary()

0,1,2,3
Dep. Variable:,Dx,R-squared:,0.817
Model:,OLS,Adj. R-squared:,0.725
Method:,Least Squares,F-statistic:,8.903
Date:,"Wed, 26 Aug 2020",Prob (F-statistic):,3.35e-12
Time:,15:37:32,Log-Likelihood:,18.279
No. Observations:,85,AIC:,21.44
Df Residuals:,56,BIC:,92.28
Df Model:,28,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6140,0.131,4.695,0.000,0.352,0.876
AntipsychAtyp[T.1],0.6069,0.068,8.875,0.000,0.470,0.744
AntipsychTyp[T.1],0.2547,0.095,2.686,0.010,0.065,0.445
Institution[T.Pitt],0.0460,0.099,0.464,0.645,-0.153,0.245
Benzodiazepines[T.1],-0.0692,0.095,-0.728,0.470,-0.260,0.121
causeOfDeath[T.2],0.1150,0.203,0.566,0.573,-0.292,0.522
causeOfDeath[T.3],0.2080,0.217,0.960,0.341,-0.226,0.642
causeOfDeath[T.4],-0.0052,0.276,-0.019,0.985,-0.559,0.548
causeOfDeath[T.5],0.1703,0.117,1.450,0.153,-0.065,0.406

0,1,2,3
Omnibus:,2.634,Durbin-Watson:,1.888
Prob(Omnibus):,0.268,Jarque-Bera (JB):,1.732
Skew:,0.108,Prob(JB):,0.421
Kurtosis:,2.335,Cond. No.,19.8


In [29]:
sel_col = ['AntipsychAtyp', 'Ethnicity', 'Alcohol', 'Antidepress', 'nCalls_count', 'DP_mean', 'DP_std', 'REF_frequency', 'REF_entropy']
y = sm.Logit(data1['Dx'], data1[sel_col]).fit()

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [10]:
%connect_info

{
  "shell_port": 35261,
  "iopub_port": 38857,
  "stdin_port": 41841,
  "control_port": 58381,
  "hb_port": 52999,
  "ip": "127.0.0.1",
  "key": "5184d0bb-65382c8c3fc9341b2e8f0757",
  "transport": "tcp",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-f67795be-7f36-4907-af3c-267cd48c157a.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.
