In [None]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import display
from time import gmtime, strftime
import seaborn as sns

# S3 locations used by this notebook.
bucket_name = 'dgx-ds-use1-dev-landing-s3'
prefix = 'kamal/OrderTAT-tf'

# IAM role returned by SageMaker for the current execution context.
role = get_execution_role()

# AWS region resolved by the default boto3 session.
my_region = boto3.session.Session().region_name

In [None]:
# S3 object key of the input CSV and the full s3:// URI built from it.
data_key = 'kamal/input/order_data_prep_job1.csv'
data_location = 's3://{}/{}'.format(bucket_name, data_key)

# Read the CSV into `df`, using the first column as the index.
try:
    df = pd.read_csv(data_location, index_col=0)
except Exception as e:
    # Report the failure, then re-raise: every later cell needs `df`, so
    # swallowing the error here would only resurface as an unrelated
    # NameError further down the notebook.
    print('Data load error: ',e)
    raise
else:
    print('Success: Data loaded into dataframe.')

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Number of non-null TAT_HOUR rows per PERFORMING_REGION, 20 largest first.
(
    df.groupby(['PERFORMING_REGION'])['TAT_HOUR']
    .count()
    .sort_values(ascending=False)
    .head(20)
)

In [None]:
# Print the number of distinct values in every object-dtype column, to gauge
# how many new features a dummy (one-hot) explosion would create.
for col in df.columns:
    if not (df[col].dtype == 'object'):
        continue  # only object-dtype columns are reported
    print('col:', col, 'unique vals: ', df[col].nunique())

In [None]:
# For every object-dtype column, print its 20 most frequent values, ranked by
# the number of non-null TAT_HOUR rows each value has.
for col in df.columns:
    if not (df[col].dtype == 'object'):
        continue  # only object-dtype columns are reported
    print(
        df.groupby([col])['TAT_HOUR']
        .count()
        .sort_values(ascending=False)
        .head(20)
    )

In [None]:
# Rows whose TAT_HOUR is strictly above the 0.998 quantile of the column.
df.loc[df['TAT_HOUR'] > df['TAT_HOUR'].quantile(0.998)]

In [None]:
# Rows with a negative TAT_HOUR.
df.loc[df['TAT_HOUR'] < 0]

In [None]:
# Trimmed frame: keep rows with 0 <= TAT_HOUR <= 0.99 quantile of TAT_HOUR.
dft = df.loc[
    (df['TAT_HOUR'] >= 0)
    & (df['TAT_HOUR'] <= df['TAT_HOUR'].quantile(0.99))
]

In [None]:
# Box plot of TAT_HOUR per MARKET_SEGMENT_DESC, drawn from a random sample of
# the trimmed frame `dft`.
plt.figure(figsize = (15,6))
sns.set(font_scale=1.0)
segment_sample = dft[['TAT_HOUR',  'MARKET_SEGMENT_DESC']]
# Cap the sample at the rows available: DataFrame.sample raises ValueError
# when asked for more rows than exist (without replacement). The fixed seed
# keeps the figure identical across re-runs.
segment_sample = segment_sample.sample(n=min(200000, len(segment_sample)), random_state=42)
sns.boxplot(x="MARKET_SEGMENT_DESC", y="TAT_HOUR", data=segment_sample);

In [None]:
# Box plot of TAT_HOUR per Collection_Hour value, drawn from a random sample
# of the trimmed frame `dft`.
plt.figure(figsize = (10,8))
sns.set(font_scale=1.0)
hour_sample = dft[['TAT_HOUR',  'Collection_Hour']]
# Cap the sample at the rows available: DataFrame.sample raises ValueError
# when asked for more rows than exist (without replacement). The fixed seed
# keeps the figure identical across re-runs.
hour_sample = hour_sample.sample(n=min(200000, len(hour_sample)), random_state=42)
sns.boxplot(x="Collection_Hour", y="TAT_HOUR", data=hour_sample);

In [None]:
# Box plot of TAT_HOUR per PERFORMING_REGION, drawn from a random sample of
# the trimmed frame `dft`.
plt.figure(figsize = (13,6))
sns.set(font_scale=1.0)
region_sample = dft[['TAT_HOUR','PERFORMING_REGION']]
# Cap the sample at the rows available: DataFrame.sample raises ValueError
# when asked for more rows than exist (without replacement). The fixed seed
# keeps the figure identical across re-runs.
region_sample = region_sample.sample(n=min(200000, len(region_sample)), random_state=42)
sns.boxplot(x="PERFORMING_REGION", y="TAT_HOUR", data=region_sample);

In [None]:
# Box plot of TAT_HOUR per Collection_is_Holiday value, drawn from a random
# sample of the trimmed frame `dft`.
plt.figure(figsize = (3,6))
sns.set(font_scale=1.0)
holiday_sample = dft[['TAT_HOUR',  'Collection_is_Holiday']]
# Cap the sample at the rows available: DataFrame.sample raises ValueError
# when asked for more rows than exist (without replacement). The fixed seed
# keeps the figure identical across re-runs.
holiday_sample = holiday_sample.sample(n=min(200000, len(holiday_sample)), random_state=42)
sns.boxplot(x="Collection_is_Holiday", y="TAT_HOUR", data=holiday_sample);

In [None]:
# Box plot of TAT_HOUR per Collection_DOW value, drawn from a random sample
# of the trimmed frame `dft`.
plt.figure(figsize = (8,6))
sns.set(font_scale=1.0)
dow_sample = dft[['TAT_HOUR',  'Collection_DOW']]
# Cap the sample at the rows available: DataFrame.sample raises ValueError
# when asked for more rows than exist (without replacement). The fixed seed
# keeps the figure identical across re-runs.
dow_sample = dow_sample.sample(n=min(200000, len(dow_sample)), random_state=42)
sns.boxplot(x="Collection_DOW", y="TAT_HOUR", data=dow_sample);

In [None]:
# Box plot of TAT_HOUR per LAB_SYSTEM_ID, drawn from a random sample of the
# trimmed frame `dft`.
plt.figure(figsize = (12,6))
sns.set(font_scale=1.0)
lab_system_sample = dft[['TAT_HOUR',  'LAB_SYSTEM_ID']]
# Cap the sample at the rows available: DataFrame.sample raises ValueError
# when asked for more rows than exist (without replacement). The fixed seed
# keeps the figure identical across re-runs.
lab_system_sample = lab_system_sample.sample(n=min(200000, len(lab_system_sample)), random_state=42)
sns.boxplot(x="LAB_SYSTEM_ID", y="TAT_HOUR", data=lab_system_sample);

In [None]:
# Box plot of TAT_HOUR per ORDERING_LAB_SITE_TYPE, drawn from a random sample
# of the trimmed frame `dft`.
plt.figure(figsize = (12,6))
sns.set(font_scale=1.0)
site_type_sample = dft[['TAT_HOUR',  'ORDERING_LAB_SITE_TYPE']]
# Cap the sample at the rows available: DataFrame.sample raises ValueError
# when asked for more rows than exist (without replacement). The fixed seed
# keeps the figure identical across re-runs.
site_type_sample = site_type_sample.sample(n=min(200000, len(site_type_sample)), random_state=42)
sns.boxplot(x="ORDERING_LAB_SITE_TYPE", y="TAT_HOUR", data=site_type_sample);

In [None]:
# Annotated heatmap of the pairwise correlations between TAT_HOUR and three
# other columns, computed on a random sample of the trimmed frame `dft`.
plt.figure(figsize = (8,2))
sns.set(font_scale=1.1)

corr_sample = dft[['TAT_HOUR', 'Distance', 'Collection_Hour', 'Hours_Collection_to_Accession']]
# Cap the sample at the rows available: DataFrame.sample raises ValueError
# when asked for more rows than exist (without replacement). The fixed seed
# keeps the correlation values identical across re-runs.
corr_sample = corr_sample.sample(n=min(200000, len(corr_sample)), random_state=42)
sns.heatmap(corr_sample.corr(), annot=True);

In [None]:
# Pairwise scatter/distribution grid on a random sample of rows.
#
# sns.pairplot is a figure-level function: it builds its own figure, so a
# preceding plt.figure(figsize=...) does not size the grid and only leaves an
# empty extra figure in the cell output. Use pairplot's `height` / `aspect`
# arguments if a specific size is wanted.
sns.set(font_scale=1.1)
# NOTE(review): this cell samples the raw `df`, whereas the other plots use the
# trimmed `dft` - confirm that including the TAT_HOUR outliers is intended.
# NOTE(review): PERFORMING_REGION is selected but not passed as `hue` - confirm
# whether colouring by region was intended.
pair_sample = df[['TAT_HOUR', 'Distance', 'Collection_Hour', 'Hours_Collection_to_Accession', 'PERFORMING_REGION']]
# Cap the sample at the rows available: DataFrame.sample raises ValueError
# when asked for more rows than exist (without replacement). The fixed seed
# keeps the figure identical across re-runs.
pair_sample = pair_sample.sample(n=min(10000, len(pair_sample)), random_state=42)
sns.pairplot(pair_sample);