# Import all packages

In [1]:
from __future__ import division
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline

# Load the long-form data

In [8]:
# Relative path to the merged long-form table.
datapath = '../DataProcessing/MergedData/NGS_GDS_abundance.csv'

# Read the table, then report its dimensions followed by its column labels.
df = pd.read_csv(datapath)
for summary in (df.shape, df.columns):
    print(summary)

(9684, 75)
Index(['Patient', 'Visit', 'DateOfVisit', 'Age', 'Gender', 'ART', 'VL', 'iVL',
       'pVL', 'CD4', 'iCD4', 'nCD4', 'CD8', 'iCD8', 'nCD8', 'Race_Asian',
       'Race_Native_American', 'Race_Black', 'Race_Native_Hawaiian',
       'Race_White', 'Race_Multiple', 'Race_Unknown', 'Exposure_MSM',
       'Exposure_IDU', 'Exposure_blood_transfusion', 'Exposure_heterosexual',
       'Exposure_hemophilia', 'Exposure_perinatal', 'Exposure_MSM_and_IDU',
       'Exposure_heterosexual_and_IDU', 'Exposure_other', 'Exposure_unknown',
       'Years_seropositive', 'TMHDS', 'VisitDate', 'MMSE_CNNS_T',
       'WRAT4Reading_Manual_T', 'WAIS4Information_Manual_T',
       'WAIS3DigSymCoding_manual_T', 'GroovedPegDom_Heaton_T',
       'GroovedPegNonDom_Heaton_T', 'TrailA_Heaton_T', 'TrailB_Heaton_T',
       'LetterFluencyFL_SENAS_T', 'CategoryFluency_SENAS_T', 'ROCF_CNNS_T',
       'WorkingMemory_SENAS_T', 'WordListLearning_SENAS_T', 'BVMTimmed_CNNS_T',
       'BVMTdelay_CNNS_T', 'BVMTrecog_CNNS_T'

# Quick data exploration

In [11]:
# Race data: a cell equal to the string 'Checked' marks a selected category,
# so the comparison yields a boolean frame whose column sums count the
# matching rows of the long-form table.
race_features = ['Race_Asian', 'Race_Native_American', 'Race_Black',
                 'Race_Native_Hawaiian', 'Race_White', 'Race_Multiple',
                 'Race_Unknown']
race_df = df[race_features] == 'Checked'
print(race_df.sum(), '\n')

# Exposure data: same 'Checked' encoding and same per-column row counts.
# (A bare `exposure_df.head()` used to sit before the print; because it was
# not the last expression of the cell it was never displayed, so it is gone.)
exposure_features = ['Exposure_MSM', 'Exposure_IDU', 'Exposure_blood_transfusion',
                     'Exposure_heterosexual', 'Exposure_hemophilia', 'Exposure_perinatal',
                     'Exposure_MSM_and_IDU', 'Exposure_heterosexual_and_IDU',
                     'Exposure_other', 'Exposure_unknown']
exposure_df = df[exposure_features] == 'Checked'
print(exposure_df.sum())

Race_Asian                 0
Race_Native_American       0
Race_Black              9186
Race_Native_Hawaiian     100
Race_White               330
Race_Multiple            247
Race_Unknown               0
dtype: int64 

Exposure_MSM                     2199
Exposure_IDU                     1203
Exposure_blood_transfusion        143
Exposure_heterosexual            5358
Exposure_hemophilia                 0
Exposure_perinatal                  0
Exposure_MSM_and_IDU                0
Exposure_heterosexual_and_IDU     683
Exposure_other                    438
Exposure_unknown                    0
dtype: int64


# Select desired features

In [12]:
datapath = '../DataProcessing/MergedData/NGS_GDS_abundance.csv'

# Columns to keep, grouped by meaning. The race and exposure entries are the
# categories that showed non-zero counts in the exploration output above.
C = (
    # visit identifiers and clinical measurements
    ['Patient', 'Visit', 'DateOfVisit', 'Age', 'Gender', 'ART', 'VL', 'iVL', 'pVL',
     'CD4', 'iCD4', 'nCD4', 'CD8', 'iCD8', 'nCD8', 'TMHDS', 'VisitDate',
     'Years_seropositive']
    # race indicators
    + ['Race_Black', 'Race_Native_Hawaiian', 'Race_White', 'Race_Multiple']
    # exposure indicators
    + ['Exposure_MSM', 'Exposure_IDU', 'Exposure_blood_transfusion',
       'Exposure_heterosexual', 'Exposure_heterosexual_and_IDU', 'Exposure_other']
    # score and per-position sequencing metadata
    + ['GDS', 'Prot', 'AAPos', 'Coverage']
    # one abundance column per amino acid (single-letter codes)
    + ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
       'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
)

# Re-read the file so this cell gives the same result however often it is run.
df = pd.read_csv(datapath)[C]
df.head()

Unnamed: 0,Patient,Visit,DateOfVisit,Age,Gender,ART,VL,iVL,pVL,CD4,...,L,K,M,F,P,S,T,W,Y,V
0,A0001,R09,2014-11-10,59.0,Male,on,20.0,987.0,987.0,797.0,...,0.006249,0.002083,0.513425,0.000197,0.002969,0.001033,0.000738,0.428595,0.031623,0.000262
1,A0010,R08,2014-11-12,59.0,Male,on,20.0,50.0,470.0,1167.0,...,0.006289,0.006289,0.477987,0.0,0.0,0.0,0.0,0.509434,0.0,0.0
2,A0012,R02,2014-09-22,63.0,Male,on,34.0,2083.0,28550.0,881.0,...,0.015309,0.003827,0.124658,0.0,0.0,0.002187,0.002187,0.075451,0.002734,0.0
3,A0013,R09,2014-11-10,68.0,Male,on,20.0,144.0,39373.0,771.0,...,0.0,0.004886,0.26873,0.0,0.0,0.0,0.0,0.228013,0.0,0.0
4,A0015,R10,2013-10-22,54.0,Male,on,20.0,79074.0,79074.0,561.0,...,0.012422,0.0,0.447205,0.0,0.0,0.0,0.0,0.298137,0.0,0.0


# Define functions to reformat data

In [13]:
def slice_position(df, pos):
    '''Return the rows of `df` whose 'AAPos' value equals `pos`.

    The original row index is kept; an empty frame comes back when no row
    matches.
    '''
    return df[df['AAPos'] == pos]

def reformat_section(df):
    '''Prefix the amino-acid columns of a single-position slice with its position.

    Parameters
    ----------
    df : DataFrame
        Rows that all share one 'AAPos' value. It must contain the twenty
        single-letter amino-acid columns as well as 'AAPos', 'Coverage',
        'Prot', 'DateOfVisit' and 'VisitDate'.

    Returns
    -------
    DataFrame
        A new frame in which each amino-acid column `X` is renamed to
        `<pos>X` (for example 'A' becomes '12A' at position 12) and the five
        columns listed above are dropped. The input frame is not modified.

    Raises
    ------
    ValueError
        If `df` is empty or spans more than one position. Without this check
        an empty slice failed with a bare IndexError and a multi-position
        frame was silently labelled with its first position only.
    '''
    positions = df['AAPos'].unique()
    if len(positions) != 1:
        raise ValueError(
            'reformat_section expects rows for exactly one AAPos value, '
            'got %d distinct values' % len(positions))

    # AAPos may be stored as a float (e.g. 12.0); go through int for a clean label.
    pos = str(int(positions[0]))
    AAs = ['A','R','N','D','C','Q','E','G','H','I',
           'L','K','M','F','P','S','T','W','Y','V']
    newnames_dict = {item: pos + item for item in AAs}

    # rename() and drop() both return new frames, so no in-place mutation is needed.
    return (df.rename(columns=newnames_dict)
              .drop(columns=['AAPos', 'Coverage', 'Prot', 'DateOfVisit', 'VisitDate']))

# Reformat data into wide form

In [14]:
# Positions 1..tat_length are pivoted into columns.
tat_length = 101

# Per-visit columns present in every position slice; they serve as merge keys
# so each position's abundances line up on the same visit row.
on_columns = [
    'Patient', 'Visit', 'Age', 'Gender', 'ART', 'VL', 'iVL', 'pVL', 'CD4', 'iCD4',
    'nCD4', 'CD8', 'iCD8', 'nCD8', 'Years_seropositive', 'TMHDS', 'GDS',
    'Race_Black', 'Race_Native_Hawaiian', 'Race_White', 'Race_Multiple',
    'Exposure_MSM', 'Exposure_IDU', 'Exposure_blood_transfusion', 'Exposure_heterosexual',
    'Exposure_heterosexual_and_IDU', 'Exposure_other',
]

# Seed the wide table with position 1, then outer-merge every later position
# onto it. The outer join keeps visits that lack data at some positions; their
# missing abundances show up as NaN.
sklearn_df = reformat_section(slice_position(df, 1))
for position in range(2, tat_length + 1):
    sklearn_df = sklearn_df.merge(
        reformat_section(slice_position(df, position)),
        on=on_columns,
        how='outer',
    )

print(sklearn_df.shape)
sklearn_df.head()

(118, 2047)


Unnamed: 0,Patient,Visit,Age,Gender,ART,VL,iVL,pVL,CD4,iCD4,...,101L,101K,101M,101F,101P,101S,101T,101W,101Y,101V
0,A0001,R09,59.0,Male,on,20.0,987.0,987.0,797.0,400.0,...,0.001036,0.0,0.0,0.0,0.000453,0.000356,0.0,0.000129,0.0,0.001975
1,A0010,R08,59.0,Male,on,20.0,50.0,470.0,1167.0,448.0,...,0.0,0.0,0.0,0.0,0.0,0.004473,0.001278,0.0,0.0,0.0
2,A0012,R02,63.0,Male,on,34.0,2083.0,28550.0,881.0,745.0,...,0.003682,0.0,0.0,0.0,0.003399,0.000283,0.00085,0.0,0.0,0.000142
3,A0013,R09,68.0,Male,on,20.0,144.0,39373.0,771.0,564.0,...,0.00241,0.0,0.0,0.0,0.000278,0.003894,0.000278,0.0,0.0,0.0
4,A0015,R10,54.0,Male,on,20.0,79074.0,79074.0,561.0,309.0,...,,,,,,,,,,


# Race and Exposure summary

In [15]:
# Count the 'Checked' entries per race column in the wide table.
race_features_2 = ['Race_Black', 'Race_Native_Hawaiian', 'Race_White', 'Race_Multiple']
rdf2 = sklearn_df[race_features_2].eq('Checked')
print(rdf2.sum(), '\n')

# Same tally for the exposure columns.
exposure_features_2 = [
    'Exposure_MSM', 'Exposure_IDU', 'Exposure_blood_transfusion',
    'Exposure_heterosexual', 'Exposure_heterosexual_and_IDU', 'Exposure_other',
]
edf2 = sklearn_df[exposure_features_2].eq('Checked')
print(edf2.sum())

Race_Black              112
Race_Native_Hawaiian      1
Race_White                4
Race_Multiple             4
dtype: int64 

Exposure_MSM                     28
Exposure_IDU                     15
Exposure_blood_transfusion        2
Exposure_heterosexual            64
Exposure_heterosexual_and_IDU     8
Exposure_other                    5
dtype: int64


# Designate genetic-only and full data sets

In [19]:
all_features = sklearn_df.columns.tolist()

# Everything that is not a per-position amino-acid abundance column.
nongenetic_features = [
    'Patient', 'Visit', 'Age', 'Gender', 'ART', 'VL', 'iVL', 'pVL', 'CD4', 'iCD4',
    'nCD4', 'CD8', 'iCD8', 'nCD8', 'Years_seropositive', 'TMHDS', 'GDS',
    'Race_Black', 'Race_Native_Hawaiian', 'Race_White', 'Race_Multiple',
    'Exposure_MSM', 'Exposure_IDU', 'Exposure_blood_transfusion', 'Exposure_heterosexual',
    'Exposure_heterosexual_and_IDU', 'Exposure_other',
]

# The remaining columns, in their original order, are the genetic features.
genetic_feature_list = [name for name in all_features
                        if name not in nongenetic_features]
len(genetic_feature_list)

2020

In [20]:
# Keep only the genetic columns. Missing abundances are left as NaN here;
# no imputation is applied in this cell.
genetic_df = sklearn_df.loc[:, genetic_feature_list]
print(genetic_df.shape)
genetic_df.head()

(118, 2020)


Unnamed: 0,1A,1R,1N,1D,1C,1Q,1E,1G,1H,1I,...,101L,101K,101M,101F,101P,101S,101T,101W,101Y,101V
0,0.000213,0.002378,0.00018,4.9e-05,0.002477,8.2e-05,0.004051,0.001247,0.0,0.002149,...,0.001036,0.0,0.0,0.0,0.000453,0.000356,0.0,0.000129,0.0,0.001975
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.004473,0.001278,0.0,0.0,0.0
2,0.0,0.00164,0.0,0.0,0.000547,0.0,0.000547,0.000547,0.0,0.770366,...,0.003682,0.0,0.0,0.0,0.003399,0.000283,0.00085,0.0,0.0,0.000142
3,0.0,0.001629,0.0,0.0,0.0,0.0,0.001629,0.0,0.0,0.495114,...,0.00241,0.0,0.0,0.0,0.000278,0.003894,0.000278,0.0,0.0,0.0
4,0.0,0.006211,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.236025,...,,,,,,,,,,


In [21]:
# Write both tables to CSV without the row index.
# NOTE(review): these destinations are absolute, machine-specific paths, whereas
# the input is read through a relative path; consider a configurable output dir.
for frame, destination in (
    (sklearn_df, '/Users/Greg/Projects/HIV_Neuro_NGS/DataModeling/dataframes/sklearn_df.csv'),
    (genetic_df, '/Users/Greg/Projects/HIV_Neuro_NGS/DataModeling/dataframes/genetic_df.csv'),
):
    frame.to_csv(destination, index=False)