# Parsing the datasets

## Importing libraries

In [1]:
# Numpy
import numpy as np
from numpy import concatenate, array
from numpy.random import randn
# Decimal precision value to display in the matrix
np.set_printoptions(precision=5, suppress=True)

# Scipy
import scipy
import scipy.stats as stats

# Matplotlib
import matplotlib.pyplot as pyplot
import matplotlib.cm as cm
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
%matplotlib inline
#mpl.rc('figure', figsize=(10, 8))

# DBscan from sklearn
from sklearn import cluster, datasets
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler

# Pandas experiments
import pandas as pd
from pandas import Series, DataFrame, Panel

# Misc
import time
import datetime as dt
import math
print 'All libraries loaded.'

All libraries loaded.


## Plot function definitions

In [2]:
def plot_pdf(data, title):
    """Plot the normal PDF fitted to ``data`` and show the figure.

    The mean and standard deviation of ``data`` parameterise a normal
    distribution; its density is evaluated at every sample and drawn against
    the samples in ascending order.

    Parameters
    ----------
    data : sequence of numbers
        Sample values. The caller's object is left untouched.
    title : str
        Title shown above the plot.
    """
    # np.sort returns a sorted copy. Sorting in place here would reorder the
    # caller's list/array as a side effect of plotting, and would fail for
    # inputs that have no .sort() method (e.g. tuples).
    ordered = np.sort(data)
    pdf = stats.norm.pdf(ordered, np.mean(ordered), np.std(ordered))
    pyplot.plot(ordered, pdf, '-o', linewidth=3)
    pyplot.grid()
    pyplot.title(title)
    pyplot.show()

## Make dataframes from data

In [3]:
# Make dataframes from data

# All three extracts are read from this directory; change it here only.
DATA_DIR = '~/code/independent/datasets'

# eGFR data
# NOTE(review): the date column is spelled 'resultdata' here but 'resultdate'
# in the lab extract below -- confirm against the CSV header.
egfr_df = pd.read_csv(DATA_DIR + '/cdr_gfr_derived.csv', parse_dates=['resultdata'])
# Drop the CSV's own 'gfr' column; after the positional rename below, the
# 'gfr' label refers to the last remaining column.
egfr_df = egfr_df.drop('gfr', axis=1)
egfr_df.columns = ['pid', 'timestamp', 'gender', 'birthyear', 'age', 'gfr']

# Findings data
finding_cols = ['idperson', 'finddate', 'valuename', 'findvalnum']
findings_df = pd.read_csv(DATA_DIR + '/cdr_finding.csv', parse_dates=['finddate'], usecols=finding_cols)
# Select explicitly so the positional rename below lines up with this order.
findings_df = findings_df[finding_cols]
findings_df.columns = ['pid', 'timestamp', 'testname', 'testval']

# Lab reports data
lab_cols = ['idperson', 'resultdate', 'valuename', 'resultvaluenum']
lab_df = pd.read_csv(DATA_DIR + '/cdr_lab_result.csv', parse_dates=['resultdate'], usecols=lab_cols)
# Select explicitly so the positional rename below lines up with this order.
lab_df = lab_df[lab_cols]
lab_df.columns = ['pid', 'timestamp', 'testname', 'testval']
# Make all lab test names uppercase. The vectorised string method leaves
# missing names as NaN instead of raising on them, and always yields a Series
# (the builtin map() returns a lazy iterator on Python 3).
lab_df['testname'] = lab_df['testname'].str.upper()

In [4]:
# For every frame: strip the time part from each timestamp, then use the
# (person ID, timestamp) pair as the row index.
for frame in (egfr_df, findings_df, lab_df):
    frame['timestamp'] = frame['timestamp'].map(pd.datetools.normalize_date)
    frame.set_index(['pid', 'timestamp'], inplace=True)

In [67]:
# NaN values
# Count missing results with pandas' vectorised null check; math.isnan in a
# Python-level loop raises TypeError on non-numeric entries.
total_rowcount = len(lab_df.testval)
nan_rowcount = int(lab_df.testval.isnull().sum())
# Float arithmetic so a small NaN share is not truncated to 0 by integer
# division; an empty frame reports 0 instead of dividing by zero.
nan_percent = 100.0 * nan_rowcount / total_rowcount if total_rowcount else 0.0
print('\n%.3g%% of the values are NaN' % nan_percent)

# Drop NaN rows
# lab_df.dropna(inplace=True)


0% of the values are NaN


## Make tiny versions of the datasets

In [16]:
# Truncate each frame to its first 10000 rows (the "tiny" working copies).
first_rows = slice(10000)
egfr_df = egfr_df[first_rows]
lab_df = lab_df[first_rows]
findings_df = findings_df[first_rows]

## Make the new dataframe (empty)

In [65]:
# The combined frame's columns are the eGFR columns followed by one column
# per distinct finding name and one per distinct lab test name.
unique_findings = set(findings_df['testname'].values)
unique_labtests = set(lab_df['testname'].values)
print('\nFindings: ' + str(list(unique_findings)))
print('Lab tests: ' + str(list(unique_labtests)))
final_col_names = np.concatenate([
    egfr_df.columns.values,
    list(unique_findings),
    list(unique_labtests),
])

# Empty frame that carries only the column layout
combined_df = pd.DataFrame(columns=final_col_names)


Findings: ['FND_BPS']
Lab tests: ['LR_AST', 'LR_MICROCR', 'LR_TRIG', 'LR_GFR', 'LR_VITD 25', 'LR_ALT', 'LR_PHOS', 'LR_GFR_AFRAMER']


## Fill new dataframe based on joins on the other dataframes