Selecting variables

In [16]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
import readVCF
from matplotlib import pyplot as plt
import pandas as pd
from bsm import clean
import fwsel

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
# Load the call set through the project-local `readVCF` module. `calls` is the working
# table that every later cell filters, imputes and models.
# Presumably a pandas DataFrame (later cells use .count(), .loc and .apply) — TODO confirm.
calls = readVCF.readVCFs()

Welcome, Attila Jones!



### Filtering

We keep only the variables listed below and remove all others.

In [18]:
# First group of column names to keep (original, un-prettified spellings).
v1 = [
    'AF',
    'ALT',
    'BaseQRankSum',
    'DP',
    'FILTER/PASS',
    'FS',
    'GWASpval',
    'REF',
    'ReadPosRankSum',
    'SOR',
    'VQSLOD',
    'chromatinState_DLPFC',
    'culprit',
    'evolConstrain',
    'szdbCNVcount',
]
# Second group of column names to keep. 'Dx' comes first; the modelling cells slice the
# columns up to 'Dx', so the order of these lists matters.
v2 = [
    'Dx',
    'AntipsychAtyp',
    'AntipsychTyp',
    'Institution',
    'EV.3',
]

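The following operations perform the filtering and also prettify variable names (replace whitespace and `./\:` characters with `_`). `Dx` is then passed through `clean.dummify_var`, and the final line shows the number of non-missing values for each remaining variable.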

In [19]:
# Step 1: keep only the columns named in v1 and v2.
calls = clean.preselect(calls, vnames=v1 + v2)
# Step 2: prettify the column names (e.g. 'FILTER/PASS' -> 'FILTER_PASS', 'EV.3' -> 'EV_3').
calls = clean.prettify_colnames(calls)
# Step 3: pass the 'Dx' column through the project's dummify helper.
calls = clean.dummify_var(calls, vname='Dx')
# Non-null count per column; columns below the full row count contain missing data.
calls.count()

AF                      3301
ALT                     3301
BaseQRankSum            3301
DP                      3301
FILTER_PASS             3301
FS                      3301
GWASpval                3301
REF                     3301
ReadPosRankSum          3298
SOR                     3301
VQSLOD                  3301
chromatinState_DLPFC    3301
culprit                 3301
evolConstrain           3301
szdbCNVcount            3301
Dx                      3301
AntipsychAtyp           3301
AntipsychTyp            3301
Institution             3301
EV_3                    3154
dtype: int64

### Imputing variables with missing data

In [20]:
# Impute the two columns whose non-null counts were below the full 3301 rows in the
# previous cell's output.
# NOTE(review): in the recorded output below, EV_3 rises from 3154 to 3301 non-null values
# but ReadPosRankSum is still at 3298, i.e. it does not appear to have been imputed — confirm.
# NOTE(review): v1/v2 still hold the pre-prettify names ('FILTER/PASS', 'EV.3') while the
# columns are now 'FILTER_PASS' / 'EV_3'; check how clean.impute_vars uses these arguments.
calls = clean.impute_vars(calls, vnames=['ReadPosRankSum', 'EV_3'], v1=v1, v2=v2)
# Non-null count per column, to check the result of the imputation.
calls.count()

AF                      3301
ALT                     3301
BaseQRankSum            3301
DP                      3301
FILTER_PASS             3301
FS                      3301
GWASpval                3301
REF                     3301
ReadPosRankSum          3298
SOR                     3301
VQSLOD                  3301
chromatinState_DLPFC    3301
culprit                 3301
evolConstrain           3301
szdbCNVcount            3301
Dx                      3301
AntipsychAtyp           3301
AntipsychTyp            3301
Institution             3301
EV_3                    3301
dtype: int64

## Forward variable selection

In [6]:
# Forward selection with 'Dx' as the response. The label slice `:'Dx'` is inclusive, so the
# frame passed in holds every column up to and including 'Dx' (this relies on column order).
# NOTE(review): execution counts restart here (In [6] after In [20]), so the outputs from this
# point on come from a different run than the cells above — restart the kernel and run all
# cells in order before trusting them.
best1 = fwsel.forward_selected(calls.loc[:, :'Dx'], 'Dx')

In [7]:
# Show the formula of the model returned by forward selection.
best1.model.formula

'Dx ~ culprit + DP + ALT + AF + FILTER_PASS + VQSLOD + BaseQRankSum + SOR + REF + GWASpval + 1'

In [8]:
# Display the regression summary of the selected model.
# NOTE(review): the summary reports an OLS fit with 'Dx' as the dependent variable; 'Dx' went
# through dummify_var, so it is presumably a 0/1 indicator — confirm a linear model is intended.
best1.summary()

0,1,2,3
Dep. Variable:,Dx,R-squared:,0.047
Model:,OLS,Adj. R-squared:,0.04
Method:,Least Squares,F-statistic:,7.284
Date:,"Fri, 14 Aug 2020",Prob (F-statistic):,2.34e-22
Time:,19:23:54,Log-Likelihood:,-1929.0
No. Observations:,3301,AIC:,3904.0
Df Residuals:,3278,BIC:,4044.0
Df Model:,22,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.0287,0.120,8.550,0.000,0.793,1.265
culprit[T.FS],-0.3283,0.142,-2.319,0.020,-0.606,-0.051
culprit[T.MQ],-0.1283,0.093,-1.380,0.168,-0.311,0.054
culprit[T.MQRankSum],-0.6091,0.178,-3.420,0.001,-0.958,-0.260
culprit[T.QD],-0.3199,0.086,-3.713,0.000,-0.489,-0.151
culprit[T.ReadPosRankSum],-0.2100,0.104,-2.016,0.044,-0.414,-0.006
culprit[T.SOR],-0.3166,0.093,-3.387,0.001,-0.500,-0.133
ALT[T.C],0.0689,0.030,2.320,0.020,0.011,0.127
ALT[T.G],0.0823,0.032,2.606,0.009,0.020,0.144

0,1,2,3
Omnibus:,763.739,Durbin-Watson:,0.092
Prob(Omnibus):,0.0,Jarque-Bera (JB):,609.459
Skew:,-0.957,Prob(JB):,4.5500000000000005e-133
Kurtosis:,2.125,Cond. No.,8270.0


In [9]:
# TODO: use or delete this scratch list of the predictors in best1.model.formula.
# v1_1 = ['culprit', 'DP', 'ALT', 'AF', 'FILTER_PASS', 'VQSLOD', 'BaseQRankSum', 'SOR', 'REF', 'GWASpval']

### Standardization does not affect which variables are selected

In [10]:
def _zscore_if_numeric(column):
    """Return `column` centred and scaled by its own mean and standard deviation.

    Only columns whose dtype is float64 or int64 are transformed; any other
    column is returned unchanged.
    """
    is_float_or_int = column.dtype == 'float64' or column.dtype == 'int64'
    if not is_float_or_int:
        return column
    return (column - column.mean()) / column.std()


# Standardize column by column, then repeat the forward selection on the same column slice.
stdcalls = calls.apply(_zscore_if_numeric, axis=0)
best2 = fwsel.forward_selected(stdcalls.loc[:, :'Dx'], 'Dx')
# True when both runs selected the same model formula.
best2.model.formula == best1.model.formula

True

In [11]:
# Display the summary of the model fitted on the standardized data, for comparison with
# best1.summary().
best2.summary()

0,1,2,3
Dep. Variable:,Dx,R-squared:,0.047
Model:,OLS,Adj. R-squared:,0.04
Method:,Least Squares,F-statistic:,7.284
Date:,"Fri, 14 Aug 2020",Prob (F-statistic):,2.34e-22
Time:,19:24:00,Log-Likelihood:,-1929.0
No. Observations:,3301,AIC:,3904.0
Df Residuals:,3278,BIC:,4044.0
Df Model:,22,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.9985,0.103,9.663,0.000,0.796,1.201
culprit[T.FS],-0.3283,0.142,-2.319,0.020,-0.606,-0.051
culprit[T.MQ],-0.1283,0.093,-1.380,0.168,-0.311,0.054
culprit[T.MQRankSum],-0.6091,0.178,-3.420,0.001,-0.958,-0.260
culprit[T.QD],-0.3199,0.086,-3.713,0.000,-0.489,-0.151
culprit[T.ReadPosRankSum],-0.2100,0.104,-2.016,0.044,-0.414,-0.006
culprit[T.SOR],-0.3166,0.093,-3.387,0.001,-0.500,-0.133
ALT[T.C],0.0689,0.030,2.320,0.020,0.011,0.127
ALT[T.G],0.0823,0.032,2.606,0.009,0.020,0.144

0,1,2,3
Omnibus:,763.739,Durbin-Watson:,0.092
Prob(Omnibus):,0.0,Jarque-Bera (JB):,609.459
Skew:,-0.957,Prob(JB):,4.5500000000000005e-133
Kurtosis:,2.125,Cond. No.,54.0


In [12]:
# Removed: leftover `%connect_info` debugging cell. Its saved output embedded this kernel's
# connection key and port numbers; run the magic interactively when a console needs to
# attach, rather than keeping it (and its output) in the saved notebook.
