# Import packages

In [1]:
from __future__ import division
import pandas as pd
import numpy as np
import tat_neuro_functions as tnf
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Import data

In [2]:
# Read the preprocessed table from disk.
preprocessed_csv = '/Users/Greg/Projects/HIV_Neuro_NGS/DataModeling/dataframes/sklearn_preprocessed_df2.csv'
df = pd.read_csv(preprocessed_csv)

# Columns handed to the column-selection helper, grouped by name prefix.
# The concatenation order is the order in which the columns are requested.
clin_cols = (
    ['GDS', 'TMHDS', 'Age', 'Gender']
    + ['log10_VL', 'log10_pVL']
    + ['CD4', 'nCD4', 'CD8', 'nCD8']
    + ['Years_seropositive']
    + ['Race_Black', 'Race_Native_Hawaiian', 'Race_White', 'Race_Multiple']
    + ['Exposure_MSM', 'Exposure_IDU', 'Exposure_blood_transfusion',
       'Exposure_heterosexual', 'Exposure_heterosexual_and_IDU', 'Exposure_other']
)

# NOTE(review): select_dataframe_columns is defined in tat_neuro_functions and is
# not visible here; presumably it returns `df` restricted to `clin_cols` -- confirm.
clin_df = tnf.select_dataframe_columns(df, clin_cols)
clin_df.head()

Unnamed: 0,GDS,TMHDS,Age,Gender,log10_VL,log10_pVL,CD4,nCD4,CD8,nCD8,...,Race_Black,Race_Native_Hawaiian,Race_White,Race_Multiple,Exposure_MSM,Exposure_IDU,Exposure_blood_transfusion,Exposure_heterosexual,Exposure_heterosexual_and_IDU,Exposure_other
0,0.583333,4.0,59.0,Male,1.30103,2.994317,797.0,400.0,1131.0,1060.0,...,1,0,0,0,0,1,0,0,0,0
1,1.416667,4.0,59.0,Male,1.30103,2.672098,1167.0,448.0,663.0,478.0,...,1,0,0,0,1,0,0,0,0,0
2,0.583333,7.0,63.0,Male,1.531479,4.455606,881.0,403.0,1599.0,888.0,...,1,0,0,0,1,0,0,0,0,0
3,0.0,6.5,68.0,Male,1.30103,4.595199,771.0,553.0,958.0,958.0,...,1,0,0,0,0,0,0,0,1,0
4,1.333333,8.0,54.0,Male,1.30103,4.898034,561.0,224.0,1607.0,1288.0,...,1,0,0,0,1,1,0,0,0,1


# Prepare for pipeline

In [3]:
# Modeling setup: the column to predict, the cutoff passed alongside it to the
# preparation helper, and the predictor columns.
target = 'GDS'
threshold = 0.5
features = ['Age','Gender','log10_VL','log10_pVL','CD4','nCD4','CD8','nCD8','TMHDS',
            'Years_seropositive','Race_Black', 'Race_Native_Hawaiian', 'Race_White', 'Race_Multiple',
            'Exposure_MSM', 'Exposure_IDU', 'Exposure_blood_transfusion', 'Exposure_heterosexual',
            'Exposure_heterosexual_and_IDU','Exposure_other']

# NOTE(review): run_preparation_pipeline2 is defined in tat_neuro_functions and is
# not visible here. Judging by the recorded output it returns a feature frame and
# a 0/1 label array derived from `target` and `threshold` -- confirm in the helper.
X_df, y = tnf.run_preparation_pipeline2(clin_df, features, target, threshold)

# Single-argument print(...) calls are used throughout so the cell runs on both
# Python 2 and Python 3 and prints the same text on each.
print(y)

# Impaired and nonimpaired split
print('Impaired: %d' % sum(y == 1))
print('Nonimpaired: %d' % sum(y == 0))

X_df.head()

[1 1 1 0 1 1 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 1 0 0 1 0 0 0 1 0 1 1 1
 0 1 0 1 1 1 1 1 0 1 0 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 1 1 0 0 1
 1 0 0 0 0 1 1 0 1 1 1 0 1 0 0 0 1 1 1 0 0 1 1 1 1 0 1 0 1 1 0 0 1 0 0 0 0
 1 0 1 1 1 1]
Impaired: 66
Nonimpaired: 51


Unnamed: 0,Age,log10_VL,log10_pVL,CD4,nCD4,CD8,nCD8,TMHDS,Years_seropositive,Race_Black,Race_Native_Hawaiian,Race_White,Race_Multiple,Exposure_MSM,Exposure_IDU,Exposure_blood_transfusion,Exposure_heterosexual,Exposure_heterosexual_and_IDU,Exposure_other,Gender_Male
0,59.0,1.30103,2.994317,797.0,400.0,1131.0,1060.0,4.0,19.0,1,0,0,0,0,1,0,0,0,0,1
1,59.0,1.30103,2.672098,1167.0,448.0,663.0,478.0,4.0,21.0,1,0,0,0,1,0,0,0,0,0,1
2,63.0,1.531479,4.455606,881.0,403.0,1599.0,888.0,7.0,30.0,1,0,0,0,1,0,0,0,0,0,1
3,68.0,1.30103,4.595199,771.0,553.0,958.0,958.0,6.5,16.0,1,0,0,0,0,0,0,0,1,0,1
4,54.0,1.30103,4.898034,561.0,224.0,1607.0,1288.0,8.0,26.0,1,0,0,0,1,1,0,0,0,1,1


# Variance Filtering

In [4]:
from sklearn.feature_selection import VarianceThreshold

# Drop every column whose variance is not strictly greater than the cutoff.
variance_cutoff = 0.1
selector = VarianceThreshold(threshold=variance_cutoff)
T = selector.fit_transform(X_df)

# Take the surviving column names from the fitted selector's own mask instead of
# recomputing variances by hand, so the names always line up with the columns of T.
keep_col_list = list(X_df.columns[selector.get_support()])

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(keep_col_list)

['Age', 'log10_VL', 'log10_pVL', 'CD4', 'nCD4', 'CD8', 'nCD8', 'TMHDS', 'Years_seropositive', 'Exposure_MSM', 'Exposure_IDU', 'Exposure_heterosexual', 'Gender_Male']


In [5]:
# Wrap the filtered array back into a DataFrame, labelling its columns with the
# names that survived the variance filter. A length mismatch between the array's
# columns and the name list raises here.
X_df = pd.DataFrame(T, columns=keep_col_list)

# Impaired and nonimpaired split
# Single-argument print(...) calls give identical output on Python 2 and Python 3.
print('Impaired: %d' % sum(y == 1))
print('Nonimpaired: %d' % sum(y == 0))

print(X_df.shape)
X_df.head()

Impaired: 66
Nonimpaired: 51
(117, 13)


Unnamed: 0,Age,log10_VL,log10_pVL,CD4,nCD4,CD8,nCD8,TMHDS,Years_seropositive,Exposure_MSM,Exposure_IDU,Exposure_heterosexual,Gender_Male
0,59.0,1.30103,2.994317,797.0,400.0,1131.0,1060.0,4.0,19.0,0.0,1.0,0.0,1.0
1,59.0,1.30103,2.672098,1167.0,448.0,663.0,478.0,4.0,21.0,1.0,0.0,0.0,1.0
2,63.0,1.531479,4.455606,881.0,403.0,1599.0,888.0,7.0,30.0,1.0,0.0,0.0,1.0
3,68.0,1.30103,4.595199,771.0,553.0,958.0,958.0,6.5,16.0,0.0,0.0,0.0,1.0
4,54.0,1.30103,4.898034,561.0,224.0,1607.0,1288.0,8.0,26.0,1.0,1.0,0.0,1.0


# Normalize

In [6]:
# NOTE(review): scale_dataframe is defined in tat_neuro_functions and is not visible
# here; presumably it rescales each column -- confirm the exact transform there.
# X_df is rebound to the scaled result, so the unscaled frame is no longer
# reachable under this name after the cell runs.
X_df = tnf.scale_dataframe(X_df)

# Impaired and nonimpaired split
# Single-argument print(...) calls give identical output on Python 2 and Python 3.
print('Impaired: %d' % sum(y == 1))
print('Nonimpaired: %d' % sum(y == 0))

print(X_df.shape)
X_df.head()

Impaired: 66
Nonimpaired: 51
(117, 13)


Unnamed: 0,Age,log10_VL,log10_pVL,CD4,nCD4,CD8,nCD8,TMHDS,Years_seropositive,Exposure_MSM,Exposure_IDU,Exposure_heterosexual,Gender_Male
0,0.977462,-0.450448,-0.780043,0.286021,0.598424,0.478651,1.112201,-1.571159,0.275913,-0.547723,2.607681,-1.098885,0.776363
1,0.977462,-0.450448,-1.021905,1.232499,0.834713,-0.617639,-0.502773,-1.571159,0.592401,1.825742,-0.383482,-1.098885,0.776363
2,1.496115,-0.203587,0.316819,0.500897,0.613192,1.574941,0.634923,-0.459311,2.016596,1.825742,-0.383482,-1.098885,0.776363
3,2.144432,-0.450448,0.421599,0.219511,1.351597,0.073398,0.829164,-0.644619,-0.198819,-0.547723,-0.383482,-1.098885,0.776363
4,0.329145,-0.450448,0.648911,-0.317679,-0.267972,1.593681,1.744872,-0.088694,1.38362,1.825742,2.607681,-1.098885,0.776363


In [7]:
# Save data
# Attach the label array as a 'GDS' column and write the combined table to CSV.
# NOTE: clin_clean_df is another name for the same object as X_df (no copy is made),
# so X_df also carries the 'GDS' column once this cell has run.
X_df['GDS'] = y
clin_clean_df = X_df

# Single-argument print(...) gives identical output on Python 2 and Python 3.
print(clin_clean_df.shape)

# index=False keeps the row index out of the written file.
clean_csv = '/Users/Greg/Projects/HIV_Neuro_NGS/DataModeling/dataframes/clin_clean_df.csv'
clin_clean_df.to_csv(clean_csv, index=False)
clin_clean_df.head()

(117, 14)


Unnamed: 0,Age,log10_VL,log10_pVL,CD4,nCD4,CD8,nCD8,TMHDS,Years_seropositive,Exposure_MSM,Exposure_IDU,Exposure_heterosexual,Gender_Male,GDS
0,0.977462,-0.450448,-0.780043,0.286021,0.598424,0.478651,1.112201,-1.571159,0.275913,-0.547723,2.607681,-1.098885,0.776363,1
1,0.977462,-0.450448,-1.021905,1.232499,0.834713,-0.617639,-0.502773,-1.571159,0.592401,1.825742,-0.383482,-1.098885,0.776363,1
2,1.496115,-0.203587,0.316819,0.500897,0.613192,1.574941,0.634923,-0.459311,2.016596,1.825742,-0.383482,-1.098885,0.776363,1
3,2.144432,-0.450448,0.421599,0.219511,1.351597,0.073398,0.829164,-0.644619,-0.198819,-0.547723,-0.383482,-1.098885,0.776363,0
4,0.329145,-0.450448,0.648911,-0.317679,-0.267972,1.593681,1.744872,-0.088694,1.38362,1.825742,2.607681,-1.098885,0.776363,1
