# 05. Diversity Analysis

This notebook combines the NGS data of Tat with the corresponding clinical data for each patient.

<p><b>Input:</b></p>
<ul>
<li>Merged clinical and sequencing data
</ul>
<p><b>Output:</b></p>
<ul>
<li>Figures
</ul>

# Import Requirements

In [2]:
from __future__ import division
import pandas as pd
import numpy as np
from scipy import stats
import itertools
from collections import Counter
import operator
import random
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

# Use Arial as the font family for every figure drawn in this notebook.
mpl.rcParams['font.family'] = 'Arial'

# Define functions

In [4]:
def import_data(filepath):
    """Load the CSV at ``filepath`` and keep a fixed, ordered set of columns.

    Parameters
    ----------
    filepath : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The table restricted to the 41 columns listed below, in that
        order. A ``KeyError`` is raised by pandas if any of them is
        missing from the file.
    """
    # Per-visit columns that precede the per-position data.
    visit_columns = ['Patient', 'Visit', 'DateOfVisit', 'Age', 'Gender',
                     'ART', 'VL', 'iVL', 'pVL', 'CD4', 'iCD4', 'nCD4',
                     'CD8', 'iCD8', 'nCD8', 'TMHDS', 'VisitDate', 'GDS']
    # Columns identifying the sequenced position.
    position_columns = ['Prot', 'AAPos', 'Coverage']
    # One column per single-letter residue code.
    residue_columns = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
                       'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']

    table = pd.read_csv(filepath)
    wanted = visit_columns + position_columns + residue_columns
    return table[wanted]

# calculate diversity using a dictionary object as input
def diversity_column(counts, hill):
    """Return the Hill-number diversity of order ``hill`` for a count dict.

    Parameters
    ----------
    counts : dict
        Maps a category (e.g. a residue letter) to its count. Only the
        values are used. Zero-valued entries are ignored.
    hill : int or float
        Order of the Hill number. ``hill == 1`` uses the exponential of
        the Shannon entropy; any other order uses
        ``(sum(p**hill)) ** (1 / (1 - hill))``.

    Returns
    -------
    float
        The diversity value. No validation is performed: callers should
        supply at least one non-zero count.
    """
    # list() is needed on Python 3, where dict.values() is a view that
    # NumPy cannot turn into a float array directly.
    vals = np.array(list(counts.values()), dtype=np.float64)
    # Zero counts must not contribute: 0*log(0) evaluates to NaN and
    # 0**0 evaluates to 1, which would corrupt orders 1 and 0 respectively.
    vals = vals[vals != 0]
    # Convert counts to proportions.
    vals /= vals.sum()
    if hill == 1:
        return np.exp(-np.sum(vals*np.log(vals)))
    else:
        # Float literals keep this true division even without
        # `from __future__ import division` on Python 2.
        return (vals**hill).sum()**(1.0/(1.0-hill))

# drop keys whose value is zero
def modify_dict(d):
    """Delete every key of ``d`` whose value equals zero and return ``d``.

    The dictionary is modified in place; the returned object is the same
    dictionary that was passed in.
    """
    # Collect the keys to remove before deleting anything: removing entries
    # while iterating the live dict raises RuntimeError on Python 3.
    zero_keys = [k for k, v in d.items() if v == 0.0]
    for k in zero_keys:
        del d[k]
    return d

def getIntrapatientDiversity(counts_df, hill_number):
    """Summarise per-row diversity for each value of ``AAPos``.

    For every row of ``counts_df`` the 20 residue-count columns are turned
    into a dict, zero-valued entries are removed with ``modify_dict`` and
    the diversity of order ``hill_number`` is computed with
    ``diversity_column``. The per-row values are then grouped by position.

    Parameters
    ----------
    counts_df : pandas.DataFrame
        Must contain an ``AAPos`` column and the 20 single-letter residue
        columns.
    hill_number : int or float
        Passed through to ``diversity_column``.

    Returns
    -------
    pandas.DataFrame
        One row per position with columns ``Position``, ``Mean Diversity``,
        ``Std Diversity`` (``np.std``), ``Minimum``, ``Maximum`` and
        ``Distribution`` (the list of per-row diversity values).
    """
    residues = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
                'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']

    # Pass 1: one diversity value per input row, tagged with its position.
    per_row = {'Position': [], 'Diversity': []}
    for aa_pos, pos_rows in counts_df.groupby('AAPos'):
        for _, sample in pos_rows.iterrows():
            nonzero_counts = modify_dict(sample[residues].to_dict())
            value = diversity_column(nonzero_counts, hill_number)
            per_row['Position'].append(aa_pos)
            per_row['Diversity'].append(value)
    per_row_df = pd.DataFrame(per_row)

    # Pass 2: collapse the per-row values into per-position statistics.
    summary = {'Position': [], 'Mean Diversity': [], 'Std Diversity': [],
               'Minimum': [], 'Maximum': [], 'Distribution': []}
    for aa_pos, pos_values in per_row_df.groupby('Position'):
        values = pos_values['Diversity']
        summary['Position'].append(int(aa_pos))
        summary['Distribution'].append(list(values))
        summary['Mean Diversity'].append(np.mean(values))
        summary['Std Diversity'].append(np.std(values))
        summary['Minimum'].append(min(values))
        summary['Maximum'].append(max(values))
    return pd.DataFrame(summary)

# Analysis

In [None]:
# Location of the merged clinical + NGS count table. This is a
# machine-specific absolute path; edit it to point at your copy of the data.
DATA_PATH = '/Users/greg/Desktop/FinalTatNeuro/MergedData/NGS_GDS_counts.csv'
# GDS cutoff used to split the rows into two groups.
GDS_THRESHOLD = 0.5

DF = import_data(DATA_PATH)
# Rows with GDS at or above the cutoff / below the cutoff. Rows whose GDS
# is missing fail both comparisons and therefore appear in neither subset.
DF1 = DF[DF['GDS']>=GDS_THRESHOLD]
DF2 = DF[DF['GDS']<GDS_THRESHOLD]
# print() with a single argument behaves the same on Python 2 and Python 3.
print(DF.shape)
print(DF1.shape)
print(DF2.shape)