#   ETL for prices, sales, quant_fe and ecomm data


In [1]:
import pandas as pd
import os
import io
import getpass
import psycopg2
from datetime import datetime


In [2]:
# get resource folder path
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
resource_dir = os.path.join(parent_dir, 'resources')


##   STEP 1 - IMPORT AND PREPARE THE DATA


In [3]:
# load csv files that need transposing into pandas dataframes
df_prices_raw = pd.read_csv(os.path.join(resource_dir,'data3','prices.csv'))
df_quant_fe_raw = pd.read_csv(os.path.join(resource_dir,'data3','quant_fe.csv'))
df_sales_raw = pd.read_csv(os.path.join(resource_dir,'data3','sales.csv'))

# import the e-commerce csv file 
df_ecomm = pd.read_csv(os.path.join(resource_dir,'data7','walmart_com-ecommerce_product_details.csv'))


In [4]:
# check for empty string text ids (keys)
print((df_sales_raw['id'].values == '').sum())
print((df_prices_raw['id'].values == '').sum())
print((df_quant_fe_raw['id'].values == '').sum())


0
0
0


In [5]:
# check that all ids in prices and quant are in sales
print('  length of df_prices_raw: ' + str(len(df_prices_raw['id'])))
print('     df_prices_raw unique: ' + str(df_prices_raw['id'].nunique()))

print('\n   length of df_sales_raw: ' + str(len(df_sales_raw['id'])))
print('      df_sales_raw unique: ' + str(df_sales_raw['id'].nunique()))

print('\nlength of df_quant_fe_raw: ' + str(len(df_sales_raw['id'])))
print('   df_quant_fe_raw unique: ' + str(df_sales_raw['id'].nunique()))

df_ids = df_sales_raw.merge(df_prices_raw, how='outer', on='id')
print('\n       sales outer prices: ' + str(df_ids['id'].nunique()))

df_ids = df_sales_raw.merge(df_quant_fe_raw, how='outer', on='id')
print('\n        sales outer quant: ' + str(df_ids['id'].nunique()))


  length of df_prices_raw: 30490
     df_prices_raw unique: 30490

   length of df_sales_raw: 42840
      df_sales_raw unique: 42840

length of df_quant_fe_raw: 42840
   df_quant_fe_raw unique: 42840

       sales outer prices: 42840

        sales outer quant: 42840


### Sales
####     returns:
####         - df_sales_raw (main table with foreign keys, requires transposition)
####         - df_sales_categories (keyed dimension table)
####         - df_sales_departments (keyed dimension table)
####         - df_sales_stores (keyed dimension table)
####         - df_sales_states (keyed dimension table)
####         - df_sales_items (keyed dimension table)

In [6]:
# create category data from the prices/sales/quant data
df_sales_id = df_sales_raw.groupby('id').size().reset_index()
df_sales_id = df_sales_id.loc[:, ['id']]
df_sales_id.rename(columns = {'id': 'id_desc'}, inplace=True) 
df_sales_id['id'] = df_sales_id.index

df_sales_categories = df_sales_raw[['cat_id']].dropna(axis=0).groupby('cat_id').size().reset_index()
df_sales_categories = df_sales_categories.loc[:, ['cat_id']]
df_sales_categories.rename(columns = {'cat_id': 'cat_desc'}, inplace=True) 
df_sales_categories['cat_id'] = df_sales_categories.index

df_sales_departments = df_sales_raw[['dept_id']].dropna(axis=0).groupby('dept_id').size().reset_index()
df_sales_departments = df_sales_departments.loc[:, ['dept_id']]
df_sales_departments.rename(columns = {'dept_id': 'dept_desc'}, inplace=True) 
df_sales_departments['dept_id'] = df_sales_departments.index

df_sales_stores = df_sales_raw[['store_id']].dropna(axis=0).groupby('store_id').size().reset_index()
df_sales_stores = df_sales_stores.loc[:, ['store_id']]
df_sales_stores.rename(columns = {'store_id': 'store_desc'}, inplace=True) 
df_sales_stores['store_id'] = df_sales_stores.index 

# df_sales_states = df_sales_raw[['state_id']].dropna(axis=0).groupby('state_id').size().reset_index()
# df_sales_states = df_sales_states.loc[:, ['state_id']]
# df_sales_states.rename(columns = {'state_id': 'state_desc'}, inplace=True) 
# df_sales_states['state_id'] = df_sales_states.index 

# manually create the states dataframe - to align the keys with other tables being used!
states = [[6,'CA'],[48,'TX'],[55,'WI'],[99,'X']]
df_sales_states = pd.DataFrame(states, columns = ['state_id', 'state_desc'])
# df_sales_states.index = df_sales_states['state_id']


df_sales_items = df_sales_raw[['item_id']].dropna(axis=0).groupby('item_id').size().reset_index()
df_sales_items = df_sales_items.loc[:, ['item_id']]
df_sales_items.rename(columns = {'item_id': 'item_desc'}, inplace=True) 
df_sales_items['item_id'] = df_sales_items.index


In [7]:
df_sales_states.dtypes



state_id       int64
state_desc    object
dtype: object

In [8]:
# confirm no nulls in new dataframes
print('nulls in df_sales_id\n' + str(len(df_sales_id) - df_sales_id.count()))
print('\nnulls in df_sales_categories\n' + str(len(df_sales_categories) - df_sales_categories.count()))
print('\nnulls in df_sales_departments\n' + str(len(df_sales_departments) - df_sales_departments.count()))
print('\nnulls in df_sales_stores\n' + str(len(df_sales_stores) - df_sales_stores.count()))
print('\nnulls in df_sales_states\n' + str(len(df_sales_states) - df_sales_states.count()))
print('\nnulls in df_sales_items\n' + str(len(df_sales_items) - df_sales_items.count()))

nulls in df_sales_id
id_desc    0
id         0
dtype: int64

nulls in df_sales_categories
cat_desc    0
cat_id      0
dtype: int64

nulls in df_sales_departments
dept_desc    0
dept_id      0
dtype: int64

nulls in df_sales_stores
store_desc    0
store_id      0
dtype: int64

nulls in df_sales_states
state_id      0
state_desc    0
dtype: int64

nulls in df_sales_items
item_desc    0
item_id      0
dtype: int64


In [9]:
# add the foreign keys for categories, departments, items, states and stores to the ***sales*** table
# rename the previous non-numeric ids (delete them after)
df_sales_raw.rename(columns = {'id': 'id_old','cat_id': 'cat_id_old','dept_id': 'dept_id_old','item_id': 'item_id_old','state_id': 'state_id_old','store_id': 'store_id_old'}, inplace=True)

# merge df_sales (left) with df_sales_id on id_old = id_desc to get the new numeric id foreign key
df_sales_raw = df_sales_raw.merge(df_sales_id, how='left', left_on='id_old', right_on='id_desc')
df_sales_raw['id'] = df_sales_raw['id'].astype('Int64')

# merge df_sales (left) with df_sales_categories on cat_id_old = cat_id_desc to get the new numeric cat_id foreign key
df_sales_raw = df_sales_raw.merge(df_sales_categories, how='left', left_on='cat_id_old', right_on='cat_desc')
df_sales_raw['cat_id'] = df_sales_raw['cat_id'].astype('Int64')

# merge df_sales (left) with df_sales_departments on dept_id_old = dept_id_desc to get the new numeric dept_id foreign key
df_sales_raw = df_sales_raw.merge(df_sales_departments, how='left', left_on='dept_id_old', right_on='dept_desc')
df_sales_raw['dept_id'] = df_sales_raw['dept_id'].astype('Int64')

# merge df_sales (left) with df_sales_items on item_id_old = item_id_desc to get the new numeric item_id foreign key
df_sales_raw = df_sales_raw.merge(df_sales_items, how='left', left_on='item_id_old', right_on='item_desc')
df_sales_raw['item_id'] = df_sales_raw['item_id'].astype('Int64')

# merge df_sales (left) with df_sales_states on state_id_old = state_id_desc to get the new numeric state_id foreign key
df_sales_raw = df_sales_raw.merge(df_sales_states, how='left', left_on='state_id_old', right_on='state_desc')
df_sales_raw['state_id'] = df_sales_raw['state_id'].astype('Int64')

# merge df_sales (left) with df_sales_stores on store_id_old = store_id_desc to get the new numeric store_id foreign key
df_sales_raw = df_sales_raw.merge(df_sales_stores, how='left', left_on='store_id_old', right_on='store_desc')
df_sales_raw['store_id'] = df_sales_raw['store_id'].astype('Int64')

df_sales_raw

Unnamed: 0,id_old,item_id_old,dept_id_old,cat_id_old,store_id_old,state_id_old,d_1,d_2,d_3,d_4,...,cat_desc,cat_id,dept_desc,dept_id,item_desc,item_id,state_id,state_desc,store_desc,store_id
0,HOBBIES_1_001_CA_1,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,HOBBIES,1,HOBBIES_1,5,HOBBIES_1_001,1443,6,CA,CA_1,1
1,HOBBIES_1_002_CA_1,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,HOBBIES,1,HOBBIES_1,5,HOBBIES_1_002,1444,6,CA,CA_1,1
2,HOBBIES_1_003_CA_1,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,HOBBIES,1,HOBBIES_1,5,HOBBIES_1_003,1445,6,CA,CA_1,1
3,HOBBIES_1_004_CA_1,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,HOBBIES,1,HOBBIES_1,5,HOBBIES_1_004,1446,6,CA,CA_1,1
4,HOBBIES_1_005_CA_1,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,HOBBIES,1,HOBBIES_1,5,HOBBIES_1_005,1447,6,CA,CA_1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42835,WI_3_X,X,X,X,WI_3,WI,4038,4198,3317,3211,...,X,3,X,10,X,3059,55,WI,WI_3,12
42836,CA_X,X,X,X,CA,CA,14195,13805,10108,11047,...,X,3,X,10,X,3059,6,CA,CA,0
42837,TX_X,X,X,X,TX,TX,9438,9630,6778,7381,...,X,3,X,10,X,3059,48,TX,TX,5
42838,WI_X,X,X,X,WI,WI,8998,8314,6897,6984,...,X,3,X,10,X,3059,55,WI,WI,9


In [10]:
# drop old non-numeric ids and redundant merged descriptions
df_sales_raw.drop('id_old',axis=1,inplace=True)
df_sales_raw.drop('id_desc',axis=1,inplace=True)

df_sales_raw.drop('cat_id_old',axis=1,inplace=True)
df_sales_raw.drop('cat_desc',axis=1,inplace=True)

df_sales_raw.drop('dept_id_old',axis=1,inplace=True)
df_sales_raw.drop('dept_desc',axis=1,inplace=True)

df_sales_raw.drop('item_id_old',axis=1,inplace=True)
df_sales_raw.drop('item_desc',axis=1,inplace=True)

df_sales_raw.drop('state_id_old',axis=1,inplace=True)
df_sales_raw.drop('state_desc',axis=1,inplace=True)

df_sales_raw.drop('store_id_old',axis=1,inplace=True)
df_sales_raw.drop('store_desc',axis=1,inplace=True)

df_sales_raw


Unnamed: 0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_1940,d_1941,start,scale,id,cat_id,dept_id,item_id,state_id,store_id
0,0,0,0,0,0,0,0,0,0,0,...,0,1,901,0.773603,18915,1,5,1443,6,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,143,0.422049,18926,1,5,1444,6,1
2,0,0,0,0,0,0,0,0,0,0,...,0,1,1105,0.511990,18937,1,5,1445,6,1
3,0,0,0,0,0,0,0,0,0,0,...,2,6,36,1.778245,18948,1,5,1446,6,1
4,0,0,0,0,0,0,0,0,0,0,...,1,0,112,1.102354,18959,1,5,1447,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42835,4038,4198,3317,3211,2132,4590,4486,5991,4850,3240,...,4519,4757,0,538.851470,39779,3,10,3059,55,12
42836,14195,13805,10108,11047,9925,11322,12251,16610,14696,11822,...,23103,24644,0,1970.131511,3103,3,10,3059,6,0
42837,9438,9630,6778,7381,5912,9006,6226,9440,9376,7319,...,13681,14815,0,1349.161939,39745,3,10,3059,48,5
42838,8998,8314,6897,6984,3309,8883,9533,11882,8664,6431,...,14734,14879,0,1440.429603,42839,3,10,3059,55,9


### Prices
####     returns:
####         - df_prices_raw (main table with foreign keys, requires transposition - has same primary keying as sales)
####         - df_prices_start (these describe item/store-level data and will be at a different level than the transposed daily data)
####         - df_prices_scale (these describe item/store-level data and will be at a different level than the transposed daily data)

In [11]:
# add the foreign key id for the ***prices*** table
# delete the previous non-numeric ids
df_prices_raw.rename(columns = {'id': 'id_old'}, inplace=True)

# merge df_sales (left) with df_sales_id on id_old = id_desc to get the new numeric id foreign key
df_prices_raw = df_prices_raw.merge(df_sales_id, how='left', left_on='id_old', right_on='id_desc')
df_prices_raw['id'] = df_prices_raw['id'].astype('Int64')
df_prices_raw

Unnamed: 0,id_old,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,...,d_1962,d_1963,d_1964,d_1965,d_1966,d_1967,d_1968,d_1969,id_desc,id
0,FOODS_1_001_CA_1,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,...,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,FOODS_1_001_CA_1,3104
1,FOODS_1_001_CA_2,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,...,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,FOODS_1_001_CA_2,3105
2,FOODS_1_001_CA_3,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,...,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,FOODS_1_001_CA_3,3106
3,FOODS_1_001_CA_4,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,...,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,FOODS_1_001_CA_4,3107
4,FOODS_1_001_TX_1,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,...,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,FOODS_1_001_TX_1,3108
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,HOUSEHOLD_2_516_TX_2,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,...,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,HOUSEHOLD_2_516_TX_2,36645
30486,HOUSEHOLD_2_516_TX_3,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,...,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,HOUSEHOLD_2_516_TX_3,36646
30487,HOUSEHOLD_2_516_WI_1,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,...,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,HOUSEHOLD_2_516_WI_1,36647
30488,HOUSEHOLD_2_516_WI_2,,,,,,,,,,...,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,HOUSEHOLD_2_516_WI_2,36648


In [12]:
df_prices_raw.drop('id_old',axis=1,inplace=True)
df_prices_raw.drop('id_desc',axis=1,inplace=True)

In [13]:
# the start and scale columns shouldn't be transposed in this table because they're item-level, not item/day-level 
# (the information would be duplicated 1,941 times)
df_prices_start = df_sales_raw[['id','start']]
df_prices_scale = df_sales_raw[['id','scale']]
df_sales_raw.drop('start',axis=1,inplace=True)
df_sales_raw.drop('scale',axis=1,inplace=True)
df_sales_raw

Unnamed: 0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_1938,d_1939,d_1940,d_1941,id,cat_id,dept_id,item_id,state_id,store_id
0,0,0,0,0,0,0,0,0,0,0,...,3,3,0,1,18915,1,5,1443,6,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,18926,1,5,1444,6,1
2,0,0,0,0,0,0,0,0,0,0,...,2,3,0,1,18937,1,5,1445,6,1
3,0,0,0,0,0,0,0,0,0,0,...,3,0,2,6,18948,1,5,1446,6,1
4,0,0,0,0,0,0,0,0,0,0,...,0,2,1,0,18959,1,5,1447,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42835,4038,4198,3317,3211,2132,4590,4486,5991,4850,3240,...,3398,4126,4519,4757,39779,3,10,3059,55,12
42836,14195,13805,10108,11047,9925,11322,12251,16610,14696,11822,...,16297,17430,23103,24644,3103,3,10,3059,6,0
42837,9438,9630,6778,7381,5912,9006,6226,9440,9376,7319,...,9162,12303,13681,14815,39745,3,10,3059,48,5
42838,8998,8314,6897,6984,3309,8883,9533,11882,8664,6431,...,11504,12819,14734,14879,42839,3,10,3059,55,9


### quant_fe
####     returns:
####         - df_quant_fe (main table with id that matches sales and prices)

In [14]:
# add the foreign key for id to the ***quant_fe*** table
# delete the previous non-numeric ids
df_quant_fe_raw.rename(columns = {'id': 'id_old'}, inplace=True)

# merge df_sales (left) with df_sales_id on id_old = id_desc to get the new numeric id foreign key
df_quant_fe_raw = df_quant_fe_raw.merge(df_sales_id, how='left', left_on='id_old', right_on='id_desc')
df_quant_fe_raw['id'] = df_quant_fe_raw['id'].astype('Int64')
df_quant_fe_raw

Unnamed: 0,id_old,ord,Q5,Q25,Q165,Q250,Q500,Q750,Q835,Q975,Q995,id_desc,id
0,HOBBIES_1_001_CA_1,0,0.000,0.000,0.000,0.00,0.0,0.00,0.000,0.000,0.000,HOBBIES_1_001_CA_1,18915
1,HOBBIES_1_002_CA_1,0,0.000,0.000,0.000,0.00,0.0,0.00,0.000,0.000,0.000,HOBBIES_1_002_CA_1,18926
2,HOBBIES_1_003_CA_1,0,0.000,0.000,0.000,0.00,0.0,0.00,0.000,0.000,0.000,HOBBIES_1_003_CA_1,18937
3,HOBBIES_1_004_CA_1,0,0.000,0.000,0.000,0.00,0.0,0.00,0.935,2.000,2.695,HOBBIES_1_004_CA_1,18948
4,HOBBIES_1_005_CA_1,0,0.000,0.000,0.000,0.00,0.0,0.00,0.000,0.000,0.000,HOBBIES_1_005_CA_1,18959
...,...,...,...,...,...,...,...,...,...,...,...,...,...
942475,WI_3_X,21,2893.460,3008.750,3316.375,3543.00,4050.5,4534.25,4727.535,5663.275,5837.900,WI_3_X,39779
942476,CA_X,21,14520.130,14627.750,15475.185,16064.25,17507.0,21918.25,23075.610,24625.575,24688.700,CA_X,3103
942477,TX_X,21,9267.315,9581.875,10081.985,10454.00,11342.0,12331.50,13250.025,14643.950,15883.330,TX_X,39745
942478,WI_X,21,10130.645,10448.475,11082.840,11803.00,13534.0,14866.25,15378.950,18950.675,19490.245,WI_X,42839


In [15]:
df_quant_fe_raw.drop('id_old',axis=1,inplace=True)
df_quant_fe_raw.drop('id_desc',axis=1,inplace=True)
df_quant_fe_raw

Unnamed: 0,ord,Q5,Q25,Q165,Q250,Q500,Q750,Q835,Q975,Q995,id
0,0,0.000,0.000,0.000,0.00,0.0,0.00,0.000,0.000,0.000,18915
1,0,0.000,0.000,0.000,0.00,0.0,0.00,0.000,0.000,0.000,18926
2,0,0.000,0.000,0.000,0.00,0.0,0.00,0.000,0.000,0.000,18937
3,0,0.000,0.000,0.000,0.00,0.0,0.00,0.935,2.000,2.695,18948
4,0,0.000,0.000,0.000,0.00,0.0,0.00,0.000,0.000,0.000,18959
...,...,...,...,...,...,...,...,...,...,...,...
942475,21,2893.460,3008.750,3316.375,3543.00,4050.5,4534.25,4727.535,5663.275,5837.900,39779
942476,21,14520.130,14627.750,15475.185,16064.25,17507.0,21918.25,23075.610,24625.575,24688.700,3103
942477,21,9267.315,9581.875,10081.985,10454.00,11342.0,12331.50,13250.025,14643.950,15883.330,39745
942478,21,10130.645,10448.475,11082.840,11803.00,13534.0,14866.25,15378.950,18950.675,19490.245,42839


### e-commerce
####     returns:
####         - df_ecomm_prod_cat (main table with foreign keys to the 6 category levels)
####         - df_ecomm_category_level1
####         - df_ecomm_category_level2
####         - df_ecomm_category_level3
####         - df_ecomm_category_level4
####         - df_ecomm_category_level5
####         - df_ecomm_category_level6

In [16]:
# create category data from the ecomm data
df_ecomm_prod_cat = df_ecomm[['Category']].dropna(axis=0).groupby('Category').size().reset_index()
df_ecomm_prod_cat = df_ecomm_prod_cat.loc[:, ['Category']]
df_ecomm_prod_cat.rename(columns = {'Category': 'category_combined_desc'}, inplace=True) 
df_ecomm_prod_cat.index.name = 'category_id'

# split the combined category column into its 6 levels on the '_' in the combined name
df_ecomm_prod_cat[['category_level1','category_level2','category_level3','category_level4','category_level5','category_level6']] = df_ecomm_prod_cat['category_combined_desc'].str.split('|',5,expand=True)

# category_level1
df_ecomm_category_level1 = df_ecomm_prod_cat[['category_level1']].dropna(axis=0).groupby('category_level1').size().reset_index()
df_ecomm_category_level1 = df_ecomm_category_level1.loc[:, ['category_level1']]
df_ecomm_category_level1.rename(columns = {'category_level1': 'category_level1_desc'}, inplace=True) 
df_ecomm_category_level1['category_level1_id'] = df_ecomm_category_level1.index

# category_level2
df_ecomm_category_level2 = df_ecomm_prod_cat[['category_level2']].dropna(axis=0).groupby('category_level2').size().reset_index()
df_ecomm_category_level2 = df_ecomm_category_level2.loc[:, ['category_level2']]
df_ecomm_category_level2.rename(columns = {'category_level2': 'category_level2_desc'}, inplace=True) 
df_ecomm_category_level2['category_level2_id'] = df_ecomm_category_level2.index

# category_level3
df_ecomm_category_level3 = df_ecomm_prod_cat[['category_level3']].dropna(axis=0).groupby('category_level3').size().reset_index()
df_ecomm_category_level3 = df_ecomm_category_level3.loc[:, ['category_level3']]
df_ecomm_category_level3.rename(columns = {'category_level3': 'category_level3_desc'}, inplace=True) 
df_ecomm_category_level3['category_level3_id'] = df_ecomm_category_level3.index

# category_level4
df_ecomm_category_level4 = df_ecomm_prod_cat[['category_level4']].dropna(axis=0).groupby('category_level4').size().reset_index()
df_ecomm_category_level4 = df_ecomm_category_level4.loc[:, ['category_level4']]
df_ecomm_category_level4.rename(columns = {'category_level4': 'category_level4_desc'}, inplace=True) 
df_ecomm_category_level4['category_level4_id'] = df_ecomm_category_level4.index

# category_level5
df_ecomm_category_level5 = df_ecomm_prod_cat[['category_level5']].dropna(axis=0).groupby('category_level5').size().reset_index()
df_ecomm_category_level5 = df_ecomm_category_level5.loc[:, ['category_level5']]
df_ecomm_category_level5.rename(columns = {'category_level5': 'category_level5_desc'}, inplace=True) 
df_ecomm_category_level5['category_level5_id'] = df_ecomm_category_level5.index

# category_level6
df_ecomm_category_level6 = df_ecomm_prod_cat[['category_level6']].dropna(axis=0).groupby('category_level6').size().reset_index()
df_ecomm_category_level6 = df_ecomm_category_level6.loc[:, ['category_level6']]
df_ecomm_category_level6.rename(columns = {'category_level6': 'category_level6_desc'}, inplace=True) 
df_ecomm_category_level6['category_level6_id'] = df_ecomm_category_level6.index


In [17]:
# rename the previous non-numeric ids (delete them after)
df_ecomm_prod_cat = df_ecomm_prod_cat.merge(df_ecomm_category_level1, how='left', left_on='category_level1', right_on='category_level1_desc')
df_ecomm_prod_cat['category_level1_id'] = df_ecomm_prod_cat['category_level1_id'].astype('Int64')

df_ecomm_prod_cat = df_ecomm_prod_cat.merge(df_ecomm_category_level2, how='left', left_on='category_level2', right_on='category_level2_desc')
df_ecomm_prod_cat['category_level2_id'] = df_ecomm_prod_cat['category_level2_id'].astype('Int64')

df_ecomm_prod_cat = df_ecomm_prod_cat.merge(df_ecomm_category_level3, how='left', left_on='category_level3', right_on='category_level3_desc')
df_ecomm_prod_cat['category_level3_id'] = df_ecomm_prod_cat['category_level3_id'].astype('Int64')

df_ecomm_prod_cat = df_ecomm_prod_cat.merge(df_ecomm_category_level4, how='left', left_on='category_level4', right_on='category_level4_desc')
df_ecomm_prod_cat['category_level4_id'] = df_ecomm_prod_cat['category_level4_id'].astype('Int64')

df_ecomm_prod_cat = df_ecomm_prod_cat.merge(df_ecomm_category_level5, how='left', left_on='category_level5', right_on='category_level5_desc')
df_ecomm_prod_cat['category_level5_id'] = df_ecomm_prod_cat['category_level5_id'].astype('Int64')

df_ecomm_prod_cat = df_ecomm_prod_cat.merge(df_ecomm_category_level6, how='left', left_on='category_level6', right_on='category_level6_desc')
df_ecomm_prod_cat['category_level6_id'] = df_ecomm_prod_cat['category_level6_id'].astype('Int64')

df_ecomm_prod_cat

Unnamed: 0,category_combined_desc,category_level1,category_level2,category_level3,category_level4,category_level5,category_level6,category_level1_desc,category_level1_id,category_level2_desc,category_level2_id,category_level3_desc,category_level3_id,category_level4_desc,category_level4_id,category_level5_desc,category_level5_id,category_level6_desc,category_level6_id
0,"Arts, Crafts & Sewing|Art Supplies|Art Sets","Arts, Crafts & Sewing",Art Supplies,Art Sets,,,,"Arts, Crafts & Sewing",0,Art Supplies,9,Art Sets,66,,,,,,
1,"Arts, Crafts & Sewing|Art Supplies|Drawing & I...","Arts, Crafts & Sewing",Art Supplies,Drawing & Illustration,Art Sketchbooks and Paper,,,"Arts, Crafts & Sewing",0,Art Supplies,9,Drawing & Illustration,289,Art Sketchbooks and Paper,83,,,,
2,"Arts, Crafts & Sewing|Art Supplies|Drawing & I...","Arts, Crafts & Sewing",Art Supplies,Drawing & Illustration,Markers,,,"Arts, Crafts & Sewing",0,Art Supplies,9,Drawing & Illustration,289,Markers,379,,,,
3,"Arts, Crafts & Sewing|Art Supplies|Drawing & I...","Arts, Crafts & Sewing",Art Supplies,Drawing & Illustration,Pastels,,,"Arts, Crafts & Sewing",0,Art Supplies,9,Drawing & Illustration,289,Pastels,454,,,,
4,"Arts, Crafts & Sewing|Art Supplies|Painting|Ai...","Arts, Crafts & Sewing",Art Supplies,Painting,Airbrushing,,,"Arts, Crafts & Sewing",0,Art Supplies,9,Painting,597,Airbrushing,15,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1193,Video Games|Video Game Titles|Shop by Genre|Sp...,Video Games,Video Game Titles,Shop by Genre,Sports Games,,,Video Games,30,Video Game Titles,275,Shop by Genre,725,Sports Games,540,,,,
1194,Video Games|Video Game Titles|Video Game Titles,Video Games,Video Game Titles,Video Game Titles,,,,Video Games,30,Video Game Titles,275,Video Game Titles,805,,,,,,
1195,Walmart for Business|Breakroom Supplies|Breakr...,Walmart for Business,Breakroom Supplies,Breakroom Furniture,,,,Walmart for Business,31,Breakroom Supplies,43,Breakroom Furniture,153,,,,,,
1196,Walmart for Business|Office Organization|Filin...,Walmart for Business,Office Organization,Filing Products,File Folders,,,Walmart for Business,31,Office Organization,187,Filing Products,339,File Folders,252,,,,


In [18]:
# delete the merged descriptions and original descriptions, leaving the ids
df_ecomm_prod_cat.drop('category_combined_desc',axis=1,inplace=True)

df_ecomm_prod_cat.drop('category_level1',axis=1,inplace=True)
df_ecomm_prod_cat.drop('category_level1_desc',axis=1,inplace=True)

df_ecomm_prod_cat.drop('category_level2',axis=1,inplace=True)
df_ecomm_prod_cat.drop('category_level2_desc',axis=1,inplace=True)

df_ecomm_prod_cat.drop('category_level3',axis=1,inplace=True)
df_ecomm_prod_cat.drop('category_level3_desc',axis=1,inplace=True)

df_ecomm_prod_cat.drop('category_level4',axis=1,inplace=True)
df_ecomm_prod_cat.drop('category_level4_desc',axis=1,inplace=True)

df_ecomm_prod_cat.drop('category_level5',axis=1,inplace=True)
df_ecomm_prod_cat.drop('category_level5_desc',axis=1,inplace=True)

df_ecomm_prod_cat.drop('category_level6',axis=1,inplace=True)
df_ecomm_prod_cat.drop('category_level6_desc',axis=1,inplace=True)

df_ecomm_prod_cat['category_comb_id'] = df_ecomm_prod_cat.index 

df_ecomm_prod_cat

Unnamed: 0,category_level1_id,category_level2_id,category_level3_id,category_level4_id,category_level5_id,category_level6_id,category_comb_id
0,0,9,66,,,,0
1,0,9,289,83,,,1
2,0,9,289,379,,,2
3,0,9,289,454,,,3
4,0,9,597,15,,,4
...,...,...,...,...,...,...,...
1193,30,275,725,540,,,1193
1194,30,275,805,,,,1194
1195,31,43,153,,,,1195
1196,31,187,339,252,,,1196


In [19]:
# confirm no nulls in new dataframes
print('nulls in df_ecomm_category_level1\n' + str(len(df_ecomm_category_level1) - df_ecomm_category_level1.count()))
print('\nnulls in df_ecomm_category_level2\n' + str(len(df_ecomm_category_level2) - df_ecomm_category_level2.count()))
print('\nnulls in df_ecomm_category_level3\n' + str(len(df_ecomm_category_level3) - df_ecomm_category_level3.count()))
print('\nnulls in df_ecomm_category_level4\n' + str(len(df_ecomm_category_level4) - df_ecomm_category_level4.count()))
print('\nnulls in df_ecomm_category_level5\n' + str(len(df_ecomm_category_level5) - df_ecomm_category_level5.count()))
print('\nnulls in df_ecomm_category_level6\n' + str(len(df_ecomm_category_level6) - df_ecomm_category_level6.count()))

nulls in df_ecomm_category_level1
category_level1_desc    0
category_level1_id      0
dtype: int64

nulls in df_ecomm_category_level2
category_level2_desc    0
category_level2_id      0
dtype: int64

nulls in df_ecomm_category_level3
category_level3_desc    0
category_level3_id      0
dtype: int64

nulls in df_ecomm_category_level4
category_level4_desc    0
category_level4_id      0
dtype: int64

nulls in df_ecomm_category_level5
category_level5_desc    0
category_level5_id      0
dtype: int64

nulls in df_ecomm_category_level6
category_level6_desc    0
category_level6_id      0
dtype: int64


In [20]:
# replace 'uniq id' with a numeric index, add foreign keys to df_ecomm
df_ecomm

Unnamed: 0,Uniq Id,Crawl Timestamp,Product Url,Product Name,Description,List Price,Sale Price,Brand,Item Number,Gtin,Package Size,Category,Postal Code,Available
0,459b05f3cb7f1cba0a36fdc042ff0056,2019-03-22 17:10:04 +0000,https://www.walmart.com/ip/In-Style-Eyes-Catey...,In Style Eyes Cateye Two Tone Reading Glasses,Stunning Looking Cat Eye Two Tone Reading Glas...,19.99,19.99,In Style Eyes,,9.664782e+10,,Health|Home Health Care|Daily Living Aids,,True
1,6a1bddc2801cbba539be0c182498d4dd,2019-03-22 17:10:04 +0000,https://www.walmart.com/ip/In-Style-Eyes-Catey...,In Style Eyes Cateye Two Tone Reading Glasses,Stunning Looking Cat Eye Two Tone Reading Glas...,19.99,19.99,In Style Eyes,,9.664782e+10,,Health|Home Health Care|Daily Living Aids,,True
2,4d237340ae8361b4bb4f51e8a6128c8b,2019-03-22 17:10:04 +0000,https://www.walmart.com/ip/In-Style-Eyes-Catey...,In Style Eyes Cateye Two Tone Reading Glasses,Stunning Looking Cat Eye Two Tone Reading Glas...,19.99,19.99,In Style Eyes,,9.664782e+10,,Health|Home Health Care|Daily Living Aids,,True
3,d6dbc29d5782a88db9082d81ad04089c,2019-03-22 17:10:04 +0000,https://www.walmart.com/ip/In-Style-Eyes-Catey...,In Style Eyes Cateye Two Tone Reading Glasses,Stunning Looking Cat Eye Two Tone Reading Glas...,19.99,19.99,In Style Eyes,,9.664782e+10,,Health|Home Health Care|Daily Living Aids,,True
4,9e51356d763e53dc622c92e3a86f9ef8,2019-03-22 17:46:46 +0000,https://www.walmart.com/ip/Upper-Crust-Caribbe...,Upper Crust Caribbean Coconut Panko Bread Crum...,| Size information : 1-10 POUND Upper Crust Ca...,39.96,39.96,Upper Crust,,8.907490e+11,,Food|Baking|Baking Mixes,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,654d1b3239d4852eb4e2813ca986fe84,2019-03-11 20:57:31 +0000,https://www.walmart.com/ip/MightySkins-Skin-De...,MightySkins Skin Decal Wrap Compatible with HP...,Do You Want Your HP 17t Laptop 17.3&quot; (201...,19.99,19.99,Mightyskins,,8.248157e+10,"size-hpprobookx36011""(2017)",Electronics|Electronics Learning Center|Ads Fr...,,True
29996,59a420ed8ed429440c1769ad30eef022,2019-03-11 20:57:31 +0000,https://www.walmart.com/ip/MightySkins-Skin-De...,MightySkins Skin Decal Wrap Compatible with HP...,Do You Want Your HP 17t Laptop 17.3&quot; (201...,19.99,19.99,Mightyskins,,7.458392e+11,"size-pavilionx36014""(2018)",Electronics|Electronics Learning Center|Ads Fr...,,True
29997,41e228c9f27d0dfc98b3cbb0824181f8,2019-03-11 20:57:31 +0000,https://www.walmart.com/ip/MightySkins-Skin-De...,MightySkins Skin Decal Wrap Compatible with HP...,Do You Want Your HP 17t Laptop 17.3&quot; (201...,19.99,19.99,Mightyskins,,7.924363e+11,"size-hppavilionx36011""(2017)",Electronics|Electronics Learning Center|Ads Fr...,,True
29998,cbd04f1cb09cff78ba8150ce882d2f6f,2019-03-11 20:57:31 +0000,https://www.walmart.com/ip/MightySkins-Skin-De...,MightySkins Skin Decal Wrap Compatible with HP...,Do You Want Your HP 17t Laptop 17.3&quot; (201...,19.99,19.99,Mightyskins,,8.248178e+10,"size-hpstream11""(2017)",Electronics|Electronics Learning Center|Ads Fr...,,True


# STEP 2 - TRANSFORM THE DATA

In [21]:
# transform ***price*** dataframe (column price data into rows and add a 'days' field) 
for i in range(1, 1970):
    col_name = 'd_' + str(i)
    df_prices_slice = df_prices_raw.loc[:, ['id',col_name]]
    df_prices_slice.rename(columns = {col_name: 'price'}, inplace = True) 
    df_prices_slice['day'] = i
    if i != 1:
        df_prices = df_prices.append(df_prices_slice, ignore_index=True)
    else:
        df_prices = df_prices_slice
    
    # give a status update every 100 columns processed
    if i % 100 == 0:
        print(str(i) + ' columns processed...')

df_prices


100 columns processed...
200 columns processed...
300 columns processed...
400 columns processed...
500 columns processed...
600 columns processed...
700 columns processed...
800 columns processed...
900 columns processed...
1000 columns processed...
1100 columns processed...
1200 columns processed...
1300 columns processed...
1400 columns processed...
1500 columns processed...
1600 columns processed...
1700 columns processed...
1800 columns processed...
1900 columns processed...


Unnamed: 0,id,price,day
0,3104,2.00,1
1,3105,2.00,1
2,3106,2.00,1
3,3107,2.00,1
4,3108,2.00,1
...,...,...,...
60034805,36645,5.94,1969
60034806,36646,5.94,1969
60034807,36647,5.94,1969
60034808,36648,5.94,1969


In [22]:
# transform ***quantile*** dataframe (column data into rows and add a 'quantile' field) 
cols = ['Q5','Q25','Q165','Q250','Q500','Q750','Q835','Q975','Q995']
i = 1
for col_name in cols:
    df_quant_fe_slice = df_quant_fe_raw.loc[:, ['id',col_name]]
    df_quant_fe_slice.rename(columns = {col_name: 'value'}, inplace = True) 
    df_quant_fe_slice['quantile'] = col_name
    if i != 1:
        df_quant_fe = df_quant_fe.append(df_quant_fe_slice, ignore_index=True)
    else:
        df_quant_fe = df_quant_fe_slice
    i = i + 1
    
df_quant_fe


Unnamed: 0,id,value,quantile
0,18915,0.000,Q5
1,18926,0.000,Q5
2,18937,0.000,Q5
3,18948,0.000,Q5
4,18959,0.000,Q5
...,...,...,...
8482315,39779,5837.900,Q995
8482316,3103,24688.700,Q995
8482317,39745,15883.330,Q995
8482318,42839,19490.245,Q995


In [23]:
# transform ***sales*** dataframe (column sales data into rows and add a 'days' field)
for i in range(1, 1942):
    col_name = 'd_' + str(i)
    df_sales_slice = df_sales_raw.loc[:, ['id','item_id','dept_id','cat_id','store_id','state_id',col_name]]
    df_sales_slice.rename(columns = {col_name: 'sales'}, inplace = True) 
    df_sales_slice['day'] = i
    if i != 1:
        df_sales = df_sales.append(df_sales_slice, ignore_index=True)
    else:
        df_sales = df_sales_slice
        
    # give a status update every 100 columns processed
    if i % 100 == 0:
        print(str(i) + ' columns processed...')

df_sales


100 columns processed...
200 columns processed...
300 columns processed...
400 columns processed...
500 columns processed...
600 columns processed...
700 columns processed...
800 columns processed...
900 columns processed...
1000 columns processed...
1100 columns processed...
1200 columns processed...
1300 columns processed...
1400 columns processed...
1500 columns processed...
1600 columns processed...
1700 columns processed...
1800 columns processed...
1900 columns processed...


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,sales,day
0,18915,1443,5,1,1,6,0,1
1,18926,1444,5,1,1,6,0,1
2,18937,1445,5,1,1,6,0,1
3,18948,1446,5,1,1,6,0,1
4,18959,1447,5,1,1,6,0,1
...,...,...,...,...,...,...,...,...
83152435,39779,3059,10,3,12,55,4757,1941
83152436,3103,3059,10,3,0,6,24644,1941
83152437,39745,3059,10,3,5,48,14815,1941
83152438,42839,3059,10,3,9,55,14879,1941


In [24]:
df_prices.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60034810 entries, 0 to 60034809
Data columns (total 3 columns):
 #   Column  Dtype  
---  ------  -----  
 0   id      Int64  
 1   price   float64
 2   day     int64  
dtypes: Int64(1), float64(1), int64(1)
memory usage: 1.4 GB


In [25]:
df_quant_fe.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8482320 entries, 0 to 8482319
Data columns (total 3 columns):
 #   Column    Dtype  
---  ------    -----  
 0   id        Int64  
 1   value     float64
 2   quantile  object 
dtypes: Int64(1), float64(1), object(1)
memory usage: 703.8 MB


In [26]:
df_sales.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83152440 entries, 0 to 83152439
Data columns (total 8 columns):
 #   Column    Dtype
---  ------    -----
 0   id        Int64
 1   item_id   Int64
 2   dept_id   Int64
 3   cat_id    Int64
 4   store_id  Int64
 5   state_id  Int64
 6   sales     int64
 7   day       int64
dtypes: Int64(6), int64(2)
memory usage: 5.4 GB


## Step 3 - Save/Load dataframes to .csv (to avoid re-running the ETL)

In [27]:
# export the dataframes to csv files so that the timely transformations don't need to be redone
df_prices.to_csv(os.path.join(resource_dir, 'clean', 'd3_prices.csv'),index=False)
df_quant_fe.to_csv(os.path.join(resource_dir, 'clean', 'd3_quant_fe.csv'),index=False)
df_sales.to_csv(os.path.join(resource_dir, 'clean', 'd3_sales.csv'),index=False)

df_sales_categories.to_csv(os.path.join(resource_dir, 'clean', 'd3_categories.csv'),index=False)
df_sales_departments.to_csv(os.path.join(resource_dir, 'clean', 'd3_departments.csv'),index=False)
df_sales_stores.to_csv(os.path.join(resource_dir, 'clean', 'd3_stores.csv'),index=False)
df_sales_states.to_csv(os.path.join(resource_dir, 'clean', 'd3_states.csv'),index=False)
df_sales_items.to_csv(os.path.join(resource_dir, 'clean', 'd3_items.csv'),index=False)

df_ecomm.to_csv(os.path.join(resource_dir, 'clean', 'd7_ecomm.csv'),index=False)
df_ecomm_prod_cat.to_csv(os.path.join(resource_dir, 'clean', 'd7_prod_cat.csv'),index=False)

df_ecomm_category_level1.to_csv(os.path.join(resource_dir, 'clean', 'd7_cat_lvl1.csv'),index=False)
df_ecomm_category_level2.to_csv(os.path.join(resource_dir, 'clean', 'd7_cat_lVl2.csv'),index=False)
df_ecomm_category_level3.to_csv(os.path.join(resource_dir, 'clean', 'd7_cat_lvl3.csv'),index=False)
df_ecomm_category_level4.to_csv(os.path.join(resource_dir, 'clean', 'd7_cat_lvl4.csv'),index=False)
df_ecomm_category_level5.to_csv(os.path.join(resource_dir, 'clean', 'd7_cat_lvl5.csv'),index=False)
df_ecomm_category_level6.to_csv(os.path.join(resource_dir, 'clean', 'd7_cat_lvl6.csv'),index=False)


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\habib\\OneDrive\\Documents\\ANALYTICS\\Github\\magic\\Project-1 Liz\\resources\\clean\\d3_prices.csv'

In [None]:
# # To reload the dataframes from here, run this:
# df_prices = pd.read_csv(os.path.join(resource_dir, 'clean', 'd3_prices.csv'))
# df_quant_fe = pd.read_csv(os.path.join(resource_dir, 'clean', 'd3_quant_fe.csv'))
# df_sales = pd.read_csv(os.path.join(resource_dir, 'clean', 'd3_sales.csv'))

# df_sales_categories = pd.read_csv(os.path.join(resource_dir, 'clean', 'd3_categories.csv'))
# df_sales_departments = pd.read_csv(os.path.join(resource_dir, 'clean', 'd3_departments.csv'))
# df_sales_stores = pd.read_csv(os.path.join(resource_dir, 'clean', 'd3_stores.csv'))
# df_sales_states = pd.read_csv(os.path.join(resource_dir, 'clean', 'd3_states.csv'))
# df_sales_items = pd.read_csv(os.path.join(resource_dir, 'clean', 'd3_items.csv'))

# df_ecomm = pd.read_csv(os.path.join(resource_dir, 'clean', 'd7_ecomm.csv'))
# df_ecomm_prod_cat = pd.read_csv(os.path.join(resource_dir, 'clean', 'd7_prod_cat.csv'))

# df_ecomm_category_level1 = pd.read_csv(os.path.join(resource_dir, 'clean', 'd7_cat_lvl1.csv'))
# df_ecomm_category_level2 = pd.read_csv(os.path.join(resource_dir, 'clean', 'd7_cat_lVl2.csv'))
# df_ecomm_category_level3 = pd.read_csv(os.path.join(resource_dir, 'clean', 'd7_cat_lvl3.csv'))
# df_ecomm_category_level4 = pd.read_csv(os.path.join(resource_dir, 'clean', 'd7_cat_lvl4.csv'))
# df_ecomm_category_level5 = pd.read_csv(os.path.join(resource_dir, 'clean', 'd7_cat_lvl5.csv'))
# df_ecomm_category_level6 = pd.read_csv(os.path.join(resource_dir, 'clean', 'd7_cat_lvl6.csv'))

## Step 4. Load data into PostgreSQL

In [None]:
# # this code loops through the folder of cleaned .csv files and loads them to PostgreSQL
# # this is over 20 faster than using sqlalchemy and df.to_sql for long tables

# # files are read to memory using StringIO in the io package
# # 'copy [table] from stdin' in PostgreSQL, which directly from memory on the local computer

# folder_name = os.path.join(resource_dir, 'clean1')

# # Ask the user for which database they want to use and which credentials to access it
# # conn_host = input('host: ')
# # conn_dbname = input('database: ')
# # conn_user = input('username: ')
# # conn_pass = getpass.getpass(prompt='password: ')

# conn_host = 'otto.db.elephantsql.com'
# conn_dbname = 'ofiglsqd'
# conn_user = 'ofiglsqd'
# conn_pass = 'vVojrG9_zzJZCOLXz8rhKWXk6ivvYqAe'

# # loop through .csv files in the output folder
# for file in os.listdir(folder_name):

#     print('\n\n' + str(datetime.utcnow()) + ' ' + str(file) + ' to be loaded')

#     print(str(datetime.utcnow()) + ' reading file to dataframe...')
    
#     # read .csv file into dataframe
#     df = pd.read_csv(os.path.join(folder_name, file), na_values=['nan','NA','NaN'])
    
#     print(str(datetime.utcnow()) + ' completed')
    
#     print(df.info(memory_usage='deep'))
    
#     # 
#     with psycopg2.connect(host=conn_host, dbname=conn_dbname, user=conn_user, password=conn_pass) as conn:
#         conn.autocommit = True

#         table_name = file.split('.csv')[0].lower().replace('-','_')

#         output = io.StringIO()

#         print(str(datetime.utcnow()) + ' reading file to memory using StringIO...')

#         df.to_csv(output, sep='|', header=False, index=False)
#         output.seek(0)

#         print(str(datetime.utcnow()) + ' completed')

#         print(str(datetime.utcnow()) + ' generating the create table statement...')
        
#         qry = pd.io.sql.get_schema(df, table_name, con=conn)

#         qry = qry.replace('CREATE TABLE', 'CREATE TABLE IF NOT EXISTS')

#         for key in df.columns:
#             if pd.api.types.infer_dtype(df[key], skipna=True) == 'boolean':
#                 start = qry.find(key)
#                 end = start + qry[start:].find(',')
#                 print(start, end)
#                 qry = qry[:start] + key + '" BOOLEAN' + qry[end:]
#         try:
#             with conn.cursor() as cur:
#                 print(str(datetime.utcnow()) + ' completed')
#                 print(qry)
                
#                 print(str(datetime.utcnow()) + ' executing the create table statement...')
#                 cur.execute(qry)
#                 print(str(datetime.utcnow()) + ' completed')
                
#                 print(str(datetime.utcnow()) + ' loading table to database...')
#                 cur.copy_expert("""COPY %s FROM STDIN WITH (FORMAT csv, DELIMITER '|', QUOTE '"')""" % table_name, output)
#                 print(str(datetime.utcnow()) + ' completed')

#         except Exception as e:
#             print('Error:\n' + str(e))
