In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import psycopg2 as pg2

In [2]:
def sql_query_read(table_name, dbname='medicare', user='postgres'):
    query = '''
            SELECT *
            FROM {};
            '''.format(table_name)
    conn = pg2.connect(dbname=dbname, user=user)
    df = pd.read_sql_query(query, conn)

    return df

In [3]:
def null_flag(df, col):
    df[col] = [1 if n is None else 0 for n in df[col]]
    df[col] = df[col].astype('category')

In [4]:
def specialty_avg_cost(col_claim, col_specialty, df):
    """
    Computes the average cost of drugs per specialty
    Returns original data frame with column added

    Parameters
    ----------
    col_claim : list of strings
        first part of column names to be computed
    col_specialty : string
        name of column with the specialty descriptions
    df : pandas df
        data frame containing data

    Returns
    -------
    df : pandas df
        original data frame with additional column 'col_claim_avg' \
        with the average cost per claim for that row's specialty

    Examples
    --------
    """
    df_cost = df[['{}_cost'.format(col_claim), col_specialty]].groupby(col_specialty).sum()
    df_count = df[['{}_count'.format(col_claim), col_specialty]].groupby(col_specialty).sum()
    avg_cost = df_cost.divide(np.array(df_count), axis=1)
    avg_cost.fillna(np.mean(avg_cost), inplace=True)
    
    return pd.merge(df, avg_cost, how='inner', left_on=col_specialty, right_index=True, suffixes=('','_avg'))


def impute_claim_cost(col_claim, col_specialty, df):
    """
    Imputes claim costs for all nulls in a column

    Parameters
    ----------
    col_claim : string
        first part of column names to be computed
    col_specialty : string
        name of column with the specialty descriptions
    df : pandas df
        data frame containing data

    Returns
    -------
    df : pandas df
        data frame with columns imputed

    Examples
    --------
    """
    df = specialty_avg_cost(col_claim, col_specialty, df)
    fill_count_null(col_claim+'_count', df)
    mask = df['{}_cost'.format(col_claim)].isnull()
    df['{}_cost'.format(col_claim)][mask] = \
       df['{}_count'.format(col_claim)][mask] * df[mask].iloc[:,-1]

    return df


def fill_count_null(col_claim, df):
    """
    Imputes claim counts for all nulls in a column.
    Set to 5.5 temporarily. May be modified in the future.

    Parameters
    ----------
    col_claim : string
        first part of column names to be computed
    df : pandas df
        data frame containing data

    Returns
    -------
    none : modified in place

    Examples
    --------
    """
    VALUE = 5.5
    df[col_claim].fillna(VALUE, inplace=True)

In [5]:
def normalize_numerics(df, normalize_col):
    numerics = ['int64', 'float64']
    for col in df[1:]:
        if df[col].dtypes in numerics:
            df[col+'_norm'] = df[col] / df[normalize_col]

In [34]:
def feature_engineer_drug_agg(df):
    #total cost per claim
    df['tot_cost_per_claim'] = df['total_drug_cost'] / df['total_claim_count']
    #total cost per day
    df['total_cost_per_day'] = df['total_drug_cost'] / df['total_day_supply']
    #generic to brand claim count
    df['generic_brand_count_ratio'] = df['generic_claim_count'] / (df['brand_claim_count']+df['generic_claim_count'])
    #generic to brand claim cost
    df['generic_brand_cost_ratio'] = df['generic_claim_cost'] / (df['brand_claim_cost']+df['generic_claim_cost'])
    #brand to total claim ratio
    df['brand_total_claim_ratio'] = df['brand_claim_count'] / df['total_claim_count']
    #replace infinite with 1
    #df.replace(np.inf, 1, inplace=True)

In [6]:
df_npi = sql_query_read('npi_13')

In [7]:
flag_cols = ['bene_count_ge65_redact', 
             'ge65_redact_flag', 
             'brand_redact_flag', 
             'generic_redact_flag', 
             'other_redact_flag', 
             'mapd_redact_flag', 
             'pdp_redact_flag',
             'lis_redact_flag',
             'nonlis_redact_flag']

In [8]:
for col in flag_cols:
    null_flag(df_npi, col)

In [9]:
impute_count_only = ['bene_count',
                      'bene_count_ge65',
                      'total_claim_count_ge65'
                    ]

In [10]:
for col in impute_count_only:
    fill_count_null(col, df_npi)

In [12]:
impute_cost_cols = ['brand_claim',
                    'generic_claim',
                    'other_claim',
                    'mapd_claim',
                    'pdp_claim',
                    'lis_claim',
                    'nonlis_claim']

In [13]:
for col in impute_cost_cols:
    df_npi = impute_claim_cost(col, 'specialty_desc', df_npi)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [15]:
normalize_numerics(df_npi, 'bene_count')

In [35]:
feature_engineer_drug_agg(df_npi)

In [36]:
df_npi.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
npi,1049381,1.499770e+09,2.879001e+08,1.003000e+09,1.245588e+09,1.497978e+09,1.740597e+09,1.993000e+09
bene_count,1049381,1.243218e+02,1.761377e+02,5.500000e+00,1.900000e+01,5.800000e+01,1.650000e+02,2.416400e+04
total_claim_count,1049381,1.303850e+03,3.151893e+03,1.100000e+01,4.900000e+01,1.830000e+02,9.560000e+02,1.915300e+05
total_drug_cost,1049381,9.873998e+04,2.390657e+05,0.000000e+00,1.553850e+03,8.999200e+03,8.097637e+04,1.660254e+07
total_day_supply,1049381,5.016021e+04,1.186718e+05,1.100000e+01,8.850000e+02,4.158000e+03,3.351700e+04,4.339456e+06
bene_count_ge65,1049381,8.660807e+01,1.483653e+02,0.000000e+00,5.500000e+00,2.200000e+01,1.110000e+02,2.050000e+04
bene_count_ge65_redact,1049381,5.700770e-01,4.950651e-01,0.000000e+00,0.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00
total_claim_count_ge65,1049381,9.580694e+02,2.521815e+03,0.000000e+00,1.400000e+01,1.060000e+02,5.990000e+02,1.661710e+05
ge65_redact_flag,1049381,7.843500e-01,4.112727e-01,0.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00
total_drug_cost_ge65,823082,8.248920e+04,1.814131e+05,0.000000e+00,2.020305e+03,1.142712e+04,7.850018e+04,9.916113e+06


## Try Model

In [32]:
df_npi.columns

Index([u'npi', u'last_name', u'first_name', u'middle_initial', u'credentials',
       u'gender', u'entity_code', u'provider_street1', u'prover_street2',
       u'provider_city', u'provider_zip', u'provider_state',
       u'provider_country', u'specialty_desc', u'description_flag',
       u'bene_count', u'total_claim_count', u'total_drug_cost',
       u'total_day_supply', u'bene_count_ge65', u'bene_count_ge65_redact',
       u'total_claim_count_ge65', u'ge65_redact_flag', u'total_drug_cost_ge65',
       u'total_day_supply_ge65', u'brand_claim_count', u'brand_redact_flag',
       u'brand_claim_cost', u'generic_claim_count', u'generic_redact_flag',
       u'generic_claim_cost', u'other_claim_count', u'other_redact_flag',
       u'other_claim_cost', u'mapd_claim_count', u'mapd_redact_flag',
       u'mapd_claim_cost', u'pdp_claim_count', u'pdp_redact_flag',
       u'pdp_claim_cost', u'lis_claim_count', u'lis_redact_flag',
       u'lis_claim_cost', u'nonlis_claim_count', u'nonlis_redact_flag

In [37]:
keep_cols = [u'bene_count', 
             u'total_claim_count', 
             u'total_drug_cost',
             u'total_day_supply', 
             #u'brand_claim_count', 
             u'brand_redact_flag',
             #u'brand_claim_cost', 
             #u'generic_claim_count', 
             u'generic_redact_flag',
             #u'generic_claim_cost', 
             #u'other_claim_count', 
             #u'other_redact_flag',
             #u'other_claim_cost', 
             #u'mapd_claim_count', 
             #u'mapd_redact_flag',
             #u'mapd_claim_cost', 
             #u'pdp_claim_count', 
             #u'pdp_redact_flag',
             #u'pdp_claim_cost', 
             #u'lis_claim_count', 
             #u'lis_redact_flag',
             #u'lis_claim_cost', 
             #u'nonlis_claim_count', 
             #u'nonlis_redact_flag',
             #u'nonlis_claim_cost', 
             #u'brand_claim_cost_avg',
             #u'generic_claim_cost_avg', 
             #u'other_claim_cost_avg',
             #u'mapd_claim_cost_avg', 
             #u'pdp_claim_cost_avg', 
             #u'lis_claim_cost_avg',
             #u'nonlis_claim_cost_avg', 
             u'npi_norm', 
             u'bene_count_norm',
             u'total_claim_count_norm', 
             u'total_drug_cost_norm',
             u'total_day_supply_norm', 
             #u'bene_count_ge65_norm',
             #u'bene_count_ge65_redact_norm', 
             #u'total_claim_count_ge65_norm',
             #u'ge65_redact_flag_norm', 
             #u'total_drug_cost_ge65_norm',
             #u'total_day_supply_ge65_norm', 
             u'brand_claim_count_norm',
             #u'brand_redact_flag_norm', 
             u'brand_claim_cost_norm',
             u'generic_claim_count_norm', 
             #u'generic_redact_flag_norm',
             u'generic_claim_cost_norm', 
             #u'other_claim_count_norm',
             #u'other_redact_flag_norm', 
             #u'other_claim_cost_norm',
             #u'mapd_claim_count_norm', 
             #u'mapd_redact_flag_norm',
             #u'mapd_claim_cost_norm', 
             #u'pdp_claim_count_norm',
             #u'pdp_redact_flag_norm', 
             #u'pdp_claim_cost_norm',
             #u'lis_claim_count_norm', 
             #u'lis_redact_flag_norm',
             #u'lis_claim_cost_norm', 
             #u'nonlis_claim_count_norm',
             #u'nonlis_redact_flag_norm', 
             #u'nonlis_claim_cost_norm',
             #u'brand_claim_cost_avg_norm', 
             #u'generic_claim_cost_avg_norm',
             #u'other_claim_cost_avg_norm', 
             #u'mapd_claim_cost_avg_norm',
             #u'pdp_claim_cost_avg_norm', 
             #u'lis_claim_cost_avg_norm',
             #u'nonlis_claim_cost_avg_norm', 
             u'tot_cost_per_claim',
             u'total_cost_per_day', 
             u'generic_brand_count_ratio',
             u'generic_brand_cost_ratio', 
             u'brand_total_claim_ratio']

In [38]:
X_train, X_test = mf.make_train_test_dataset(df_npi)

NameError: name 'mf' is not defined