In [None]:
!pip install --upgrade pandas
!pip install geopy
!pip install xgboost
!pip install holidays

In [None]:
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import date, datetime
import random
from geopy.distance import distance
import xgboost as xgb
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
tqdm.pandas()
import holidays
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

# Display settings: show every column, and up to 500 rows, when a frame is rendered.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 500)
# Module-level list; it is not referenced again in the visible part of this notebook.
outputs = []

def clean_data(thisDf):
    """Cast the ``ORDER_CODE`` column to string.

    The column is overwritten on the frame that was passed in, and that same
    frame object is returned.

    Note: two TAT_HOUR filters (drop negative values, drop values above the
    99th percentile) used to run here and are currently disabled.
    """
    order_codes_as_text = thisDf['ORDER_CODE'].astype(str)
    thisDf['ORDER_CODE'] = order_codes_as_text
    return thisDf
    
def initial_drop_columns(thisDf):
    """Return a copy of ``thisDf`` without the columns listed below.

    The input frame is not modified. A ``KeyError`` is raised by pandas if any
    of the listed columns is missing from ``thisDf``.
    """
    # 'RECORD_ID' is intentionally kept (it was commented out of this list).
    columns_to_discard = [
        'PERFORMING_LAB_NAME',
        'BU_NAME',
        'BU_LATITUDE',
        'BU_LONGITUDE',
        'ORDERING_LAB_CODE',
        'ORDERING_LAB_NAME',
        'BILLING_LEGAL_ENTITY',
        'ACCOUNT_NUMBER',
        'ACCOUNT_NAME',
        'ACCOUNT_STATE',
        'ACCOUNT_ZIP_CODE',
        'SPECIALTY_DESC',
        'PHYSICIAN_NPI',
        'PHYSICIAN_NAME',
        'BILL_ONLY_INDICATOR',
        'ORDER_UNIT_CODE',
        'ORDER_NAME',
        'ORDER_CODE_MNEMONIC',
        'PUBLISHED_TAT',
        'MAX_TAT',
        'STAT_ROUTINE_INDICATOR',
    ]
    return thisDf.drop(columns=columns_to_discard)



def get_distances(thisDf):
    """Add a ``Distance`` column holding the lab-to-lab distance in whole miles.

    For every row the distance between
    (PERFORMING_LAB_LATITUDE, PERFORMING_LAB_LONGITUDE) and
    (ORDERING_LAB_LATITUDE, ORDERING_LAB_LONGITUDE) is computed with
    ``distance`` and rounded. Rows whose distance cannot be computed get 0
    (best-effort behaviour kept from the original implementation).

    The column is added to the frame that was passed in, and that same frame
    is returned.
    """

    def geo_distance(x):
        try:
            performing_lab = (x['PERFORMING_LAB_LATITUDE'], x['PERFORMING_LAB_LONGITUDE'])
            ordering_lab = (x['ORDERING_LAB_LATITUDE'], x['ORDERING_LAB_LONGITUDE'])
            return round(distance(performing_lab, ordering_lab).miles)
        except Exception:
            # Best effort: unusable coordinates fall back to 0. ``Exception``
            # (not a bare ``except``) so KeyboardInterrupt/SystemExit still
            # propagate and a long run can be interrupted.
            return 0

    try:
        # Progress-bar variant; only available once tqdm.pandas() has been
        # registered and the progress widget can be created.
        thisDf['Distance'] = thisDf.progress_apply(geo_distance, axis=1)
    except Exception:
        # Fall back to plain apply when the progress variant is unavailable.
        # A user interrupt is deliberately NOT caught here, otherwise it would
        # silently restart the whole computation.
        thisDf['Distance'] = thisDf.apply(geo_distance, axis=1)
    return thisDf


def clean_ordercode(thisDf):
    """Add ``ORDER_CODE_N``, a normalised version of ``ORDER_CODE``.

    Each code is converted to ``str`` and mapped by the first matching rule:

    * purely numeric            -> unchanged
    * ends with ``NHD``         -> ``'NHD'``
    * ends with ``PTH``         -> ``'PTH'``
    * ends with ``CALC``        -> ``'CALC'``
    * ends with ``6517``        -> ``'ALBUM'``
    * starts with ``AT``        -> ``'TISSUE'``
    * ends with ``CRL``         -> ``'SBCRL'``
    * starts with ``INTERPRE``  -> ``'INTPR'``
    * anything else             -> unchanged

    The column is added to the frame that was passed in, and that same frame
    is returned.
    """

    def clean_code(raw_code):
        code = str(raw_code)
        if code.isnumeric():
            return code
        # Rule order matters: the first match wins.
        if code.endswith('NHD'):
            return 'NHD'
        if code.endswith('PTH'):
            return 'PTH'
        if code.endswith('CALC'):
            return 'CALC'
        if code.endswith('6517'):
            return 'ALBUM'
        if code.startswith('AT'):
            return 'TISSUE'
        if code.endswith('CRL'):
            return 'SBCRL'
        if code.startswith('INTERPRE'):
            return 'INTPR'
        return code

    # Map over the single column instead of apply(axis=1): a row-wise apply
    # builds a Series per row (slow) and can upcast a numeric ORDER_CODE to
    # float when other columns are float, turning 123 into '123.0'.
    thisDf['ORDER_CODE_N'] = thisDf['ORDER_CODE'].map(clean_code)
    return thisDf

def do_concats(thisDf): 
    """Placeholder step: currently returns ``thisDf`` unchanged.

    The two concatenated features sketched in the comments below are disabled.
    """
    # df_t['Lab_Order'] = df_t['LAB_SYSTEM_ID'].astype(str) + df_t['ORDER_CODE'].astype(str)
    # df_t['Performing_Lab'] = df_t['PERFORMING_LAB_SITE_TYPE'].astype(str) + df_t['PERFORMING_LAB_CODE'].astype(str)

    return thisDf

def update_add_on_exists(thisDf):
    """Add ``Add_On_Exists``: 1 where ``ADD_ON_ORDER_DATE`` is non-null, else 0.

    The column is added to the frame that was passed in, and that same frame
    is returned. The flag is always stored as ``int64``, including for an
    empty frame.
    """
    # Vectorised replacement for isnull() followed by a per-element
    # ``0 if x is True else 1`` lambda.
    thisDf['Add_On_Exists'] = thisDf['ADD_ON_ORDER_DATE'].notna().astype('int64')
    return thisDf

def do_date_stuff(thisDf):
    """Derive date-based features from COLLECTION_DATE and ACCESSION_DATE.

    Both source columns are converted with ``pd.to_datetime`` (overwriting the
    originals), then these columns are added, in this order:

    * ``Collection_DOW`` / ``Accession_DOW`` - day name of each date
    * ``Accession_is_Holiday`` / ``Collection_is_Holiday`` - 1 if the date is
      contained in ``holidays.US()``, else 0
    * ``Collection_Hour`` - hour component of the collection timestamp
    * ``Hours_Collection_to_Accession`` - accession minus collection, in hours
    * ``Bad_Accession_Date`` - 1 if collection is later than accession, else 0

    The frame that was passed in is modified and returned.
    """
    thisDf['COLLECTION_DATE'] = pd.to_datetime(thisDf['COLLECTION_DATE'])
    thisDf['ACCESSION_DATE'] = pd.to_datetime(thisDf['ACCESSION_DATE'])
    collected = thisDf['COLLECTION_DATE']
    accessioned = thisDf['ACCESSION_DATE']

    thisDf['Collection_DOW'] = collected.dt.day_name()
    thisDf['Accession_DOW'] = accessioned.dt.day_name()

    # Holiday membership needs a per-value lookup in the calendar object;
    # produce the 0/1 flag in the same pass instead of a second apply.
    us_holidays = holidays.US()

    def holiday_flag(timestamps):
        return timestamps.apply(lambda ts: int(ts in us_holidays))

    # Accession first, then Collection, to keep the original column order.
    thisDf['Accession_is_Holiday'] = holiday_flag(accessioned)
    thisDf['Collection_is_Holiday'] = holiday_flag(collected)

    thisDf['Collection_Hour'] = collected.dt.hour

    # Elapsed time between collection and accession, expressed in hours.
    elapsed = accessioned - collected
    thisDf['Hours_Collection_to_Accession'] = elapsed.dt.total_seconds()/60/60

    # Vectorised bool -> 0/1 conversion (replaces a per-element
    # ``1 if x is True else 0`` lambda).
    thisDf['Bad_Accession_Date'] = (collected > accessioned).astype('int64')

    return thisDf

def second_drop_columns(thisDf):
    """Return a copy of ``thisDf`` without the raw source columns listed below.

    The input frame is not modified. A ``KeyError`` is raised by pandas if any
    of the listed columns is missing from ``thisDf``.
    """
    # 'PERFORMING_LAB_CODE' is intentionally kept (it was commented out of this list).
    raw_columns = [
        'PERFORMING_LAB_LATITUDE',
        'PERFORMING_LAB_LONGITUDE',
        'ORDERING_LAB_LATITUDE',
        'ORDERING_LAB_LONGITUDE',
        'COLLECTION_DATE',
        'ACCESSION_DATE',
        'ADD_ON_ORDER_DATE',
        'WORKLIST_CODE',
        'PERFORMING_LAB_SITE_TYPE',
        'ORDER_CODE',  # new column used
    ]
    return thisDf.drop(columns=raw_columns)

def get_dummy_vars(thisDf):
    """Return ``thisDf`` passed through ``pd.get_dummies`` with ``drop_first=True``."""
    encoded = pd.get_dummies(thisDf, drop_first=True)
    return encoded

In [None]:
%%time 

s3_bucket = 'dgx-ds-use1-dev-landing-s3/kamal/input'
filename = 'orderdata_split.csv'
data_location = 's3://{}/{}'.format(s3_bucket, filename)

try:
    #df = pd.read_csv('datathon_full.tab.zip', sep='\t')
    df = pd.read_csv(data_location)
    print(df.shape)
except Exception as inst:
    print(inst)
        


In [None]:
# First column cleanup: discard the two leading columns of the loaded frame.
# NOTE: this is positional, so re-running the cell removes two MORE columns;
# reload `df` before running it again.
df = df.drop(columns=df.columns[[0, 1]])
df.head(2)

In [None]:
# samples that will be used for development
#df_t = df.sample(n=2000000, random_state=42)

In [None]:
%%time

# initial data column drops
# initial_drop_columns returns a new frame, so `df` itself keeps all its columns;
# clean_data then casts ORDER_CODE to string on that new frame.
df_t = initial_drop_columns(df)
df_t = clean_data(df_t)

#df.sample(100).to_csv('Sample.csv')

In [None]:
%%time

df_lat_lon = df_t[['PERFORMING_LAB_LATITUDE', 'PERFORMING_LAB_LONGITUDE', 'ORDERING_LAB_LATITUDE', 'ORDERING_LAB_LONGITUDE']]
df_lat_lon.drop_duplicates(inplace=True)
df_lat_lon = get_distances(df_lat_lon)

df_t = pd.merge(
    df_t,
    df_lat_lon,
    on=['PERFORMING_LAB_LATITUDE', 'PERFORMING_LAB_LONGITUDE', 'ORDERING_LAB_LATITUDE', 'ORDERING_LAB_LONGITUDE'],
    how='left'
)

In [None]:
%%time

# Add ORDER_CODE_N, the normalised order code derived from ORDER_CODE.
df_t = clean_ordercode(df_t)
#df.sample(100).to_csv('Sample.csv')

In [None]:
#Distance zscore
#from scipy.stats import zscore
#df['Distancez']=zscore(df['Distance'])


In [None]:
%%time

# Parse COLLECTION_DATE / ACCESSION_DATE and add the day-of-week, holiday,
# hour, elapsed-hours and bad-accession-date features.
df_t = do_date_stuff(df_t)

In [None]:
# Add the Add_On_Exists flag derived from ADD_ON_ORDER_DATE.
df_t = update_add_on_exists(df_t)

In [None]:
# Remove the raw coordinate/date/code columns now that features were derived from them.
# Must run after the feature cells above; re-running it raises KeyError because
# the columns are already gone.
df_t = second_drop_columns(df_t)

In [None]:
# Estimate how many new features the dummy encoding will create: report the
# number of distinct values for every object-typed column.
for col in df_t.columns:
    if df_t[col].dtype != 'object':
        continue
    print('col:', col, 'unique vals: ', df_t[col].nunique())

In [None]:
# Check for nulls: one boolean per column, True if the column contains any missing value.
df_t.isna().any()
#df['ACCNTYPE'].fillna('OTHER', inplace=True)
#df_t.columns[df.isna().any()]

In [None]:
# Preview the rows whose TAT_HOUR is missing.
dfx = df_t[df_t['TAT_HOUR'].isna()]
dfx.head(5)

In [None]:
%%time

df_t['TAT_HOUR'].fillna(round(df_t['TAT_HOUR'].mean()), inplace=True)
df_t['ACCN_PROCESS_TYPE_CODE'].fillna('OTHER', inplace=True)
df_t['MARKET_SEGMENT_DESC'].fillna('OTHER', inplace=True)

In [None]:
# Preview the first rows of the prepared frame.
df_t.head()
#df_t.drop(df_t[[0]], inplace=True)
#df_t.sample(1000).to_csv('Sample.csv', index=False)

In [None]:
# Write the prepared frame to S3 as CSV, without the index column.
bucket = 'dgx-ds-use1-dev-landing-s3'
data_key = 'kamal/input/order_data_prep_job1.csv'
comm_data_location = f's3://{bucket}/{data_key}'
df_t.to_csv(comm_data_location, index=False)