## GenericsPrediction - Data Visualization

### Contact: Adrian Lam; ayplam@gmail.com

#### Notes:
* No project is ever complete without some data visualization!
* JSON files are created to allow visualization with D3
* All results can be found at [ayplam.github.io/fdagx](https://ayplam.github.io/fdagx)


In [None]:
import pandas as pd
import json
import numpy as np
from datetime import date, timedelta as td, datetime

%matplotlib inline
import matplotlib
import matplotlib.pylab as plt
import seaborn as sns
matplotlib.rcParams['savefig.dpi'] = 2 * matplotlib.rcParams['savefig.dpi']

def year_frac(pd_series):
    """Convert a Series of datetimes into fractional calendar years.

    Each element becomes ``year + days_elapsed / days_in_year``, where
    ``days_elapsed`` is the whole number of days since 1 January of the
    element's year and ``days_in_year`` is the day count between that
    1 January and the next one.  Example: 2000-07-02 -> 2000.5.
    """
    def _as_fraction(stamp):
        year_start = datetime(stamp.year, 1, 1)
        next_year_start = datetime(stamp.year + 1, 1, 1)
        elapsed_days = float((stamp - year_start).days)
        return stamp.year + elapsed_days / (next_year_start - year_start).days

    return pd_series.apply(_as_fraction)

def unix_time(dt):
    """Return the number of seconds between the Unix epoch and ``dt``.

    Parameters
    ----------
    dt : datetime.datetime or pandas.Series of datetimes
        Naive timestamp(s).  They are subtracted from the naive epoch
        ``1970-01-01 00:00:00``.

    Returns
    -------
    float or pandas.Series of float
        Series input: seconds since the epoch plus 3600.
        Scalar input: ``timedelta.total_seconds()`` with no offset.
    """
    # Same naive value as datetime.utcfromtimestamp(0), without relying on
    # that call (deprecated in recent Python versions).
    epoch = datetime(1970, 1, 1)
    delta = dt - epoch

    if isinstance(delta, pd.Series):
        # Dividing by a one-second timedelta gives plain float seconds.
        # astype('timedelta64[s]') keeps a timedelta dtype on recent pandas,
        # which makes the numeric "+ 3600" (and later "* 1000") fail.
        # NOTE(review): the 3600 s offset is applied to Series input only,
        # presumably a one-hour display shift -- confirm it is intended.
        return delta / np.timedelta64(1, 's') + 3600
    else:
        return delta.total_seconds()

def unix_time_millis(dt):
    """Return ``unix_time(dt)`` scaled from seconds to milliseconds."""
    seconds = unix_time(dt)
    return seconds * 1000

## Visualization of drug history over time

In [None]:
# Drug history, plot #1: cumulative ANDA and NDA counts as two line series.
df_drugcounthistory = pd.read_csv('drugcounthistory.csv', sep=',')

# Running totals of the per-row counts, in file order.
for prefix in ('anda', 'nda'):
    df_drugcounthistory[prefix + 'cumsum'] = df_drugcounthistory[prefix + 'count'].cumsum()

# Parse the date columns in place.
for prefix in ('anda', 'nda'):
    df_drugcounthistory[prefix + 'stt'] = pd.to_datetime(df_drugcounthistory[prefix + 'stt'])

# x-axis values: each date expressed as Unix time in milliseconds.
for prefix in ('anda', 'nda'):
    df_drugcounthistory[prefix + '_unixms'] = unix_time_millis(df_drugcounthistory[prefix + 'stt'])

# One entry per series; every point is an {'x': ..., 'y': ...} pair.
json_data = [
    {
        'key': 'Number of Unique ANDAs',
        'color': '#4455dd',
        'values': [
            {'x': ms, 'y': total}
            for ms, total in zip(df_drugcounthistory['anda_unixms'].values,
                                 df_drugcounthistory['andacumsum'].values)
        ],
    },
    {
        'key': 'Number of Unique NDAs',
        'color': '#ff7f0e',
        'values': [
            {'x': ms, 'y': total}
            for ms, total in zip(df_drugcounthistory['nda_unixms'].values,
                                 df_drugcounthistory['ndacumsum'].values)
        ],
    },
]

# Diagnostics: row count, then the first NDA date and four evenly spaced
# offsets across the NDA date range, all as Unix milliseconds
# (presumably used to pick axis tick positions by hand).
print(len(df_drugcounthistory))
first_nda = df_drugcounthistory['ndastt'][:1]
last_nda = df_drugcounthistory['ndastt'][-1:]
ticks = (last_nda.values - first_nda.values) / 4
for i in (np.arange(4) + 1):
    print(unix_time_millis(first_nda))
    print(unix_time_millis(first_nda + ticks * i).map(str))

with open('/home/vagrant/website/data/drugcounthistory.json', 'w') as outfile:
    json.dump(json_data, outfile)

In [None]:
# Drug history, plot #2: per-year totals for a stacked bar graph.
df_drugcounthistory = pd.read_csv('drugcounthistory.csv', sep=',')

for date_col in ('andastt', 'ndastt'):
    df_drugcounthistory[date_col] = pd.to_datetime(df_drugcounthistory[date_col])

# Calendar years 2000 through 2014 inclusive.
years = np.arange(2000, 2015)
andayrcount = []
ndayrcount = []

for year in years:
    # Boolean masks selecting the rows dated within this calendar year.
    anda_in_year = df_drugcounthistory['andastt'].map(lambda stamp: stamp.year == year)
    nda_in_year = df_drugcounthistory['ndastt'].map(lambda stamp: stamp.year == year)
    andayrcount.append(df_drugcounthistory[anda_in_year]['andacount'].sum())
    ndayrcount.append(df_drugcounthistory[nda_in_year]['ndacount'].sum())

# Human-readable year labels (not written to the JSON below).
years_str = ["Year: " + str(year) for year in years]

# One entry per series; x is the year, y is that year's summed count.
json_data = [
    {
        'key': 'Number of Unique ANDAs',
        'color': '#4455dd',
        'values': [{'x': yr, 'y': total} for yr, total in zip(years, andayrcount)],
    },
    {
        'key': 'Number of Unique NDAs',
        'color': '#ff7f0e',
        'values': [{'x': yr, 'y': total} for yr, total in zip(years, ndayrcount)],
    },
]

with open('/home/vagrant/website/data/drugcounthistory_bar.json', 'w') as outfile:
    json.dump(json_data, outfile)

In [None]:

def unix_time(dt):
    """Return the number of seconds between the Unix epoch and ``dt``.

    This re-defines ``unix_time`` for the cells that follow.  Unlike the
    definition near the top of the notebook, no 3600-second offset is added
    for Series input.

    Parameters
    ----------
    dt : datetime.datetime or pandas.Series of datetimes
        Naive timestamp(s).  They are subtracted from the naive epoch
        ``1970-01-01 00:00:00``.

    Returns
    -------
    float or pandas.Series of float
        Seconds since the epoch.
    """
    # Same naive value as datetime.utcfromtimestamp(0), without relying on
    # that call (deprecated in recent Python versions).
    epoch = datetime(1970, 1, 1)
    delta = dt - epoch

    if isinstance(delta, pd.Series):
        # Dividing by a one-second timedelta gives plain float seconds.
        # astype('timedelta64[s]') keeps a timedelta dtype on recent pandas,
        # so the later "* 1000" would not produce numeric milliseconds.
        return delta / np.timedelta64(1, 's')
    else:
        return delta.total_seconds()


# Author's note: the bar graph csv had UNIQUE NDAs and ANDAs while the FIRST
# graph had ALL NDAs/ANDAs released, regardless of unique or not.
df_drugcounthistory = pd.read_csv('drugcounthistory.csv', sep=',')

df_drugcounthistory['andastt'] = pd.to_datetime(df_drugcounthistory['andastt'])
df_drugcounthistory['ndastt'] = pd.to_datetime(df_drugcounthistory['ndastt'])

# Keep only rows whose ANDA date falls in 2000 or later.  The explicit copy
# makes the filtered frame independent of the original, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
from_2000 = df_drugcounthistory['andastt'].map(lambda x: x.year >= 2000)
df_drugcounthistory = df_drugcounthistory[from_2000].copy()

df_drugcounthistory['anda_unixms'] = unix_time_millis(df_drugcounthistory['andastt'])
df_drugcounthistory['nda_unixms'] = unix_time_millis(df_drugcounthistory['ndastt'])
# Cumulative sums are taken after filtering, so they start from zero at the
# first retained row.
df_drugcounthistory['andacumsum'] = df_drugcounthistory['andacount'].cumsum()
df_drugcounthistory['ndacumsum'] = df_drugcounthistory['ndacount'].cumsum()

# One entry per series; x and y are cast to built-in int before serialising.
json_data = [
    {
        'key': 'Unique ANDAs',
        'color': '#4455dd',
        'values': [
            {'x': int(x), 'y': int(y)}
            for x, y in zip(df_drugcounthistory['anda_unixms'].values,
                            df_drugcounthistory['andacumsum'])
        ],
    },
    {
        'key': 'Unique NDAs',
        'color': '#ff7f0e',
        'values': [
            {'x': int(x), 'y': int(y)}
            for x, y in zip(df_drugcounthistory['nda_unixms'].values,
                            df_drugcounthistory['ndacumsum'])
        ],
    },
]

with open('C:/Users/Adrian Lam/Dropbox/Personal/Website/gxbackground/drugcounthistory_line.json', 'w') as outfile:
    json.dump(json_data, outfile)

In [None]:
# Bar graph for day-of-week releases.

# Day names; presumably the labels for x = 0..6 (not written to the JSON).
dow = ['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday']
xran = xrange(7)
# Hard-coded release counts, one per position in xran.
release_fda_anda = [528,5440,4920,5091,4341,4067,804]
release_mck = [1,27,359,254,227,219,344]

json_dow = [
    {
        'key': 'FDA Releases',
        'color': '#4455dd',
        'values': [{'x': day_idx, 'y': count}
                   for day_idx, count in zip(xran, release_fda_anda)],
    },
    {
        'key': 'McK Releases',
        'color': '#ff7f0e',
        'values': [{'x': day_idx, 'y': count}
                   for day_idx, count in zip(xran, release_mck)],
    },
]

with open('/home/vagrant/website/data/andavsmck_dow.json', 'w') as outfile:
    json.dump(json_dow, outfile)

In [None]:
# Line graph for day-of-week releases.
# NOTE: this cell writes to the same andavsmck_dow.json path as the
# bar-graph cell for day-of-week releases.

# Day names; presumably the labels for x = 0..6 (not written to the JSON).
dow = ['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday']
xran = xrange(7)
# Hard-coded release counts, one per position in xran.
release_fda_anda = [528,5440,4920,5091,4341,4067,804]
release_mck = [1,27,359,254,227,219,344]

json_dow = []
series_specs = (
    ('FDA Releases', '#4455dd', release_fda_anda),
    ('McK Releases', '#ff7f0e', release_mck),
)
for label, colour, counts in series_specs:
    points = [{'x': idx, 'y': n} for idx, n in zip(xran, counts)]
    json_dow.append({'key': label, 'color': colour, 'values': points})

with open('/home/vagrant/website/data/andavsmck_dow.json', 'w') as outfile:
    json.dump(json_dow, outfile)

In [None]:
# Releases over time: 2009 - Current
df_majorsuppliers = pd.read_csv('major_supplier_release_dates.csv', sep=',')
print(df_majorsuppliers.columns)
df_majorsuppliers['startmktdate'] = pd.to_datetime(df_majorsuppliers['startmktdate'])
# Number of rows per market-start date.  groupby sorts its group keys by
# default, so the result is already indexed by ascending date; the earlier
# DataFrame.sort call was redundant and no longer exists in pandas >= 0.20.
s_releases = df_majorsuppliers.groupby(['startmktdate'])['startmktdate'].count()

In [None]:
# Keep launch dates from 2009 onward (despite the "2k" in the name, the
# cutoff year used here is 2009).
s_releases_after_2k = s_releases[s_releases.index.year >= 2009]

ser = [0] * len(s_releases_after_2k)

# Wrap the date index in a one-column frame so year_frac can be applied.
df_year = pd.DataFrame()
df_year['year'] = s_releases_after_2k.index
x = year_frac(df_year['year'])

# Single series: x is the fractional year, y the number of launches on that
# date, and 'series' is always 0.
launch_points = [
    {'x': frac, 'y': count, 'series': 0}
    for frac, count in zip(x, s_releases_after_2k.values)
]
json_data = [{'key': 'Number of Launches 2009 - current',
              'values': launch_points}]

with open('/home/vagrant/website/data/releases09toCurr.json', 'w') as outfile:
    json.dump(json_data, outfile)

In [None]:
# FFT of releases.
# NOTE(review): xVals and fft are not defined in the cells shown here;
# presumably they come from an FFT computed elsewhere in the session --
# confirm before running this cell on its own.

# Only the first 1186 entries of each array are exported.
xdata = list(xVals[:1186])
ydata = np.abs(fft)[:1186]
ser = [0] * len(xdata)

# Each ydata row is indexed with [0], so rows are expected to be sequences
# (presumably a column vector -- verify the shape of fft).
spectrum_points = [
    {'x': freq, 'y': mag[0], 'series': tag}
    for freq, mag, tag in zip(xdata, ydata.tolist(), ser)
]
json_data = [{'key': 'Release Frequency', 'values': spectrum_points}]

with open('/home/vagrant/website/data/frequency.json', 'w') as outfile:
    json.dump(json_data, outfile)

In [None]:
## Number of Releases, Grouped By Day of Month. MCK seriously looks pretty random

In [None]:
df_daymo = pd.read_csv('2006toCurr_GroupByDayInMonth.csv', sep=',')

# Express each count column as a fraction of its own column total so the
# FDA and McK curves can be drawn on one axis.
for source in ('fda', 'mck'):
    counts = df_daymo[source + 'cnt']
    df_daymo[source + '_norm'] = counts / counts.sum()
df_daymo[['fda_norm','mck_norm']].plot()