# Import packages

In [1]:
import pandas as pd
import numpy as np

# Check dataframes

In [2]:
# Load the combined modelling frame and the genetics-only frame, then print
# their shapes as a quick sanity check.
# NOTE(review): absolute path on the author's machine -- adjust when running elsewhere.
data_dir = '/Users/Greg/Projects/HIV_Neuro_NGS/DataModeling/dataframes'
sklearn_df = pd.read_csv(data_dir + '/sklearn_df.csv')
genetic_df = pd.read_csv(data_dir + '/genetic_df.csv')
# print(...) with a single argument gives the same output on Python 2 and 3.
print(sklearn_df.shape)
print(genetic_df.shape)

(118, 2047)
(118, 2020)


In [3]:
# Columns from position 27 onward are taken as the genetic features
# (the leading 27 columns are skipped).
n_leading_cols = 27
genetic_list = sklearn_df.columns[n_leading_cols:]
genetic_list

Index([u'1A', u'1R', u'1N', u'1D', u'1C', u'1Q', u'1E', u'1G', u'1H', u'1I',
       ...
       u'101L', u'101K', u'101M', u'101F', u'101P', u'101S', u'101T', u'101W',
       u'101Y', u'101V'],
      dtype='object', length=2020)

# Convert "Checked"/"Unchecked" to 1/0

In [4]:
# The first 27 columns are selected for display below.
clin_cols = sklearn_df.columns[:27]
# Encode the checkbox strings as integers across the whole frame:
# 'Unchecked' -> 0 first, then 'Checked' -> 1.
sklearn_df = (
    sklearn_df
    .replace(to_replace='Unchecked', value=0)
    .replace(to_replace='Checked', value=1)
)
sklearn_df.loc[:, clin_cols].head()

Unnamed: 0,Patient,Visit,Age,Gender,ART,VL,iVL,pVL,CD4,iCD4,...,Race_Native_Hawaiian,Race_White,Race_Multiple,Exposure_MSM,Exposure_IDU,Exposure_blood_transfusion,Exposure_heterosexual,Exposure_heterosexual_and_IDU,Exposure_other,GDS
0,A0001,R09,59.0,Male,on,20.0,987.0,987.0,797.0,400.0,...,0,0,0,0,1,0,0,0,0,0.583333
1,A0010,R08,59.0,Male,on,20.0,50.0,470.0,1167.0,448.0,...,0,0,0,1,0,0,0,0,0,1.416667
2,A0012,R02,63.0,Male,on,34.0,2083.0,28550.0,881.0,745.0,...,0,0,0,1,0,0,0,0,0,0.583333
3,A0013,R09,68.0,Male,on,20.0,144.0,39373.0,771.0,564.0,...,0,0,0,0,0,0,0,1,0,0.0
4,A0015,R10,54.0,Male,on,20.0,79074.0,79074.0,561.0,309.0,...,0,0,0,1,1,0,0,0,1,1.333333


# Impute NaNs with mean

In [5]:
# Build the mean-imputed dataset: reload the raw frames, encode the checkbox
# strings as 0/1, fill NaNs with (rounded) column means, add log10 viral-load
# columns, drop the transgender sample and write sklearn_preprocessed_df1.csv.
# NOTE(review): absolute path on the author's machine -- adjust when running elsewhere.
data_dir = '/Users/Greg/Projects/HIV_Neuro_NGS/DataModeling/dataframes'

# Reload from disk, discarding any in-memory changes made to these names.
sklearn_df = pd.read_csv(data_dir + '/sklearn_df.csv')
genetic_df = pd.read_csv(data_dir + '/genetic_df.csv')
sklearn_df = sklearn_df.replace(to_replace='Unchecked', value=0)
sklearn_df = sklearn_df.replace(to_replace='Checked', value=1)
# print(...) with a single argument gives the same output on Python 2 and 3.
print(sklearn_df.shape)
print(genetic_df.shape)

clin_cols = ['Age', 'VL', 'iVL', 'pVL', 'CD4', 'iCD4','nCD4', 'CD8', 'iCD8', 'nCD8', 'TMHDS',
             'Years_seropositive','Race_Black', 'Race_Native_Hawaiian', 'Race_White', 'Race_Multiple',
             'Exposure_MSM', 'Exposure_IDU', 'Exposure_blood_transfusion', 'Exposure_heterosexual',
             'Exposure_heterosexual_and_IDU','Exposure_other', 'GDS']

# Columns from position 27 onward are taken as the genetic features.
genetic_list = sklearn_df.columns[27:]

# Collect one fill value per column, then fill in a single call and assign the
# result back. The previous `sklearn_df[col].fillna(..., inplace=True)` form
# operates on a column selection and is not guaranteed to update the frame
# (it is a no-op under pandas Copy-on-Write), which would leave NaNs in place.
fill_values = {}
for col_name in clin_cols:
    # Clinical columns: mean rounded to a whole number.
    fill_values[col_name] = round(sklearn_df[col_name].mean())
for c in genetic_list:
    # Genetic columns: mean rounded to 4 decimal places.
    fill_values[c] = round(sklearn_df[c].mean(), 4)
sklearn_df = sklearn_df.fillna(value=fill_values)

# perform log transformation on viral loads
def log10_transform(x):
    """Return the base-10 logarithm of x (scalar or array-like) via np.log10."""
    return np.log10(x)

# np.log10 is vectorized, so the whole column is transformed in one call.
VL_t = log10_transform(sklearn_df['VL'])
iVL_t = log10_transform(sklearn_df['iVL'])
pVL_t = log10_transform(sklearn_df['pVL'])
sklearn_df['log10_VL'] = VL_t
sklearn_df['log10_iVL'] = iVL_t
sklearn_df['log10_pVL'] = pVL_t

# drop the transgender sample
# (the mask is True for the rows that are KEPT, i.e. Gender != 'Transgender')
transgender_mask = sklearn_df.Gender != 'Transgender'
sklearn_df = sklearn_df[transgender_mask]

# save to CSV
sklearn_df.to_csv(data_dir + '/sklearn_preprocessed_df1.csv', index=False)

(118, 2047)
(118, 2020)


# Impute NaNs with median

In [6]:
# Build the median-imputed dataset: reload the raw frames, encode the checkbox
# strings as 0/1, fill NaNs with (rounded) column medians, add log10 viral-load
# columns, drop the transgender sample and write sklearn_preprocessed_df2.csv.
# NOTE(review): absolute path on the author's machine -- adjust when running elsewhere.
data_dir = '/Users/Greg/Projects/HIV_Neuro_NGS/DataModeling/dataframes'

# Reload from disk, discarding any in-memory changes made to these names.
sklearn_df = pd.read_csv(data_dir + '/sklearn_df.csv')
genetic_df = pd.read_csv(data_dir + '/genetic_df.csv')
sklearn_df = sklearn_df.replace(to_replace='Unchecked', value=0)
sklearn_df = sklearn_df.replace(to_replace='Checked', value=1)
# print(...) with a single argument gives the same output on Python 2 and 3.
print(sklearn_df.shape)
print(genetic_df.shape)

clin_cols = ['Age', 'VL', 'iVL', 'pVL', 'CD4', 'iCD4','nCD4', 'CD8', 'iCD8', 'nCD8', 'TMHDS',
             'Years_seropositive','Race_Black', 'Race_Native_Hawaiian', 'Race_White', 'Race_Multiple',
             'Exposure_MSM', 'Exposure_IDU', 'Exposure_blood_transfusion', 'Exposure_heterosexual',
             'Exposure_heterosexual_and_IDU','Exposure_other', 'GDS']

# Columns from position 27 onward are taken as the genetic features.
genetic_list = sklearn_df.columns[27:]

# Collect one fill value per column, then fill in a single call and assign the
# result back. The previous `sklearn_df[col].fillna(..., inplace=True)` form
# operates on a column selection and is not guaranteed to update the frame
# (it is a no-op under pandas Copy-on-Write), which would leave NaNs in place.
fill_values = {}
for col_name in clin_cols:
    # Clinical columns: median rounded to a whole number.
    fill_values[col_name] = round(sklearn_df[col_name].median())
for c in genetic_list:
    # Genetic columns: median rounded to 4 decimal places.
    fill_values[c] = round(sklearn_df[c].median(), 4)
sklearn_df = sklearn_df.fillna(value=fill_values)

# perform log transformation on viral loads
def log10_transform(x):
    """Return the base-10 logarithm of x (scalar or array-like) via np.log10."""
    return np.log10(x)

# np.log10 is vectorized, so the whole column is transformed in one call.
VL_t = log10_transform(sklearn_df['VL'])
iVL_t = log10_transform(sklearn_df['iVL'])
pVL_t = log10_transform(sklearn_df['pVL'])
sklearn_df['log10_VL'] = VL_t
sklearn_df['log10_iVL'] = iVL_t
sklearn_df['log10_pVL'] = pVL_t

# drop the transgender sample
# (the mask is True for the rows that are KEPT, i.e. Gender != 'Transgender')
transgender_mask = sklearn_df.Gender != 'Transgender'
sklearn_df = sklearn_df[transgender_mask]

# save to CSV
sklearn_df.to_csv(data_dir + '/sklearn_preprocessed_df2.csv', index=False)

(118, 2047)
(118, 2020)


In [8]:
# Shapes of the current in-memory frames.
# print(...) with a single argument gives the same output on Python 2 and 3.
print(sklearn_df.shape)
print(genetic_df.shape)

(117, 2050)
(118, 2020)


In [7]:
# Print every column name of the current frame, one per line.
# Iterating the column Index directly avoids the intermediate list copy, and
# print(...) with a single argument behaves the same on Python 2 and 3.
for c in sklearn_df.columns:
    print(c)

Patient
Visit
Age
Gender
ART
VL
iVL
pVL
CD4
iCD4
nCD4
CD8
iCD8
nCD8
TMHDS
Years_seropositive
Race_Black
Race_Native_Hawaiian
Race_White
Race_Multiple
Exposure_MSM
Exposure_IDU
Exposure_blood_transfusion
Exposure_heterosexual
Exposure_heterosexual_and_IDU
Exposure_other
GDS
1A
1R
1N
1D
1C
1Q
1E
1G
1H
1I
1L
1K
1M
1F
1P
1S
1T
1W
1Y
1V
2A
2R
2N
2D
2C
2Q
2E
2G
2H
2I
2L
2K
2M
2F
2P
2S
2T
2W
2Y
2V
3A
3R
3N
3D
3C
3Q
3E
3G
3H
3I
3L
3K
3M
3F
3P
3S
3T
3W
3Y
3V
4A
4R
4N
4D
4C
4Q
4E
4G
4H
4I
4L
4K
4M
4F
4P
4S
4T
4W
4Y
4V
5A
5R
5N
5D
5C
5Q
5E
5G
5H
5I
5L
5K
5M
5F
5P
5S
5T
5W
5Y
5V
6A
6R
6N
6D
6C
6Q
6E
6G
6H
6I
6L
6K
6M
6F
6P
6S
6T
6W
6Y
6V
7A
7R
7N
7D
7C
7Q
7E
7G
7H
7I
7L
7K
7M
7F
7P
7S
7T
7W
7Y
7V
8A
8R
8N
8D
8C
8Q
8E
8G
8H
8I
8L
8K
8M
8F
8P
8S
8T
8W
8Y
8V
9A
9R
9N
9D
9C
9Q
9E
9G
9H
9I
9L
9K
9M
9F
9P
9S
9T
9W
9Y
9V
10A
10R
10N
10D
10C
10Q
10E
10G
10H
10I
10L
10K
10M
10F
10P
10S
10T
10W
10Y
10V
11A
11R
11N
11D
11C
11Q
11E
11G
11H
11I
11L
11K
11M
11F
11P
11S
11T
11W
11Y
11V
12A
12R
12N
12D
12C
12Q
12