In [1]:
# nest_asyncio patches asyncio so that an event loop can be re-entered while one is
# already running (the Jupyter kernel runs its own loop).
# NOTE(review): presumably needed by code executed through the %run cells below — confirm.
import nest_asyncio
nest_asyncio.apply()

In [2]:
# Run dependencies
# Executes transformation.ipynb inside this kernel, so every name it defines becomes
# available to the cells below.
# NOTE(review): the loading cells presumably rely on it for the DataFrames they write
# (holiday_df, walmart, stock, ...) and for imports such as pandas/os — confirm.
%run transformation.ipynb

500 columns processed...
1000 columns processed...
1500 columns processed...
500 columns processed...
1000 columns processed...
1500 columns processed...


# Loading

In [3]:
# *******************************************LOADING*******************************************
# psycopg2 is used directly by the bulk COPY cell further down.
import psycopg2
# NOTE(review): config.ipynb presumably defines `connection_string` — confirm.
%run config.ipynb
# NOTE(review): `create_engine` is not imported anywhere in this notebook; it is assumed to
# be SQLAlchemy's create_engine brought in by one of the %run notebooks — confirm.
engine=create_engine(f"postgresql://{connection_string}")

In [4]:
# Preview the first rows as a sanity check before loading.
# NOTE(review): the saved output shows Holiday/Year/Date columns, which looks like the
# holiday data rather than sales — confirm this is the frame that was meant to be shown.
df_sales_raw.head()

Unnamed: 0,Holiday,Year,Date
0,New Year's Day,2010,2010-01-01
1,Independence Day,2010,2010-07-04
2,Thanksgiving Day,2010,2010-11-25
3,Christmas Eve,2010,2010-12-24
4,Christmas Day,2010,2010-12-25


In [None]:
def _append_to_postgres(frame, table):
    """Append `frame` to `table` through the shared engine, writing the index as a column."""
    frame.to_sql(name=table, con=engine, if_exists='append', index=True)


# One call per transformed DataFrame, in the same order as before.
_append_to_postgres(holiday_df, "holiday")
_append_to_postgres(walmart, "walmart")
_append_to_postgres(stock, "stock")
_append_to_postgres(state_id, "state_id")
_append_to_postgres(store, "store")
_append_to_postgres(marketShare_final, "marketshare")

In [None]:
# Add primary keys
# Engine.execute() was deprecated in SQLAlchemy 1.4 and removed in 2.0, so the DDL is sent
# through an explicit connection using text() instead. engine.begin() commits when the
# block finishes and rolls back if any statement fails, so the keys are added
# all-or-nothing and a failed run can simply be repeated.
# NOTE(review): none of these tables are created by the to_sql()/COPY cells in this
# notebook — confirm they already exist in the target database.
from sqlalchemy import text

primary_key_statements = (
    'ALTER TABLE "CAfoods" ADD PRIMARY KEY (id)',
    'ALTER TABLE state_category ADD PRIMARY KEY (state, cat_id)',
    'ALTER TABLE walmart_stores ADD PRIMARY KEY (state)',
    'ALTER TABLE walmart_state_cat ADD PRIMARY KEY (state, cat_id)',
    'ALTER TABLE state_store_sale ADD PRIMARY KEY (state)',
)

with engine.begin() as pk_connection:
    for statement in primary_key_statements:
        pk_connection.execute(text(statement))

In [None]:
# Export the cleaned frames to Resources/clean as CSV (without the index column).
# The bulk-load cell below reads every .csv in that folder and names each table after
# its file, so the file names here determine the table names.
# (Each call previously had a stray ")" after the path, which was a SyntaxError.)
df_sales.to_csv('Resources/clean/d3_sales.csv', index=False)
df_price_changes.to_csv('Resources/clean/d3_price_changes.csv', index=False)

df_sales_categories.to_csv('Resources/clean/d3_categories.csv', index=False)
df_sales_departments.to_csv('Resources/clean/d3_departments.csv', index=False)
df_sales_stores.to_csv('Resources/clean/d3_stores.csv', index=False)
df_sales_states.to_csv('Resources/clean/d3_states.csv', index=False)
df_sales_items.to_csv('Resources/clean/d3_items.csv', index=False)

df_ecomm.to_csv('Resources/clean/d7_ecomm.csv', index=False)

In [None]:
# This cell loops through the folder of cleaned .csv files and bulk-loads each one into PostgreSQL.
# According to the original author this is over 20 times faster than using sqlalchemy and
# df.to_sql for long tables.

# Each file is serialised into an in-memory buffer (io.StringIO) and streamed to the server
# with 'COPY [table] FROM STDIN', so the rows go straight from local memory to PostgreSQL.

import re


def _force_boolean_columns(create_sql, frame):
    """Return `create_sql` with every boolean-valued column of `frame` declared BOOLEAN.

    A column counts as boolean when pandas' infer_dtype (ignoring NaN) reports 'boolean'.
    The statement is assumed to list columns as `"name" TYPE` with a single-word type,
    which is the layout the previous string-slicing version of this patch relied on.
    Matching the complete quoted name followed by its type means the table name or a
    longer column name containing the same text is never rewritten, and a boolean
    column in last position (no trailing comma) is handled correctly.
    NOTE(review): the override is presumably needed because the generated type does not
    accept the True/False text that to_csv writes — confirm.
    """
    for column in frame.columns:
        if pd.api.types.infer_dtype(frame[column], skipna=True) != 'boolean':
            continue
        quoted = '"' + str(column).replace('"', '""') + '"'
        # A function replacement avoids backslash/group interpretation of the column name.
        create_sql = re.sub(
            re.escape(quoted) + r'\s+\w+',
            lambda match, quoted=quoted: quoted + ' BOOLEAN',
            create_sql,
            count=1,
        )
    return create_sql


folder_name = os.path.join('Resources', 'clean')

conn_host = 'otto.db.elephantsql.com'
conn_dbname = 'ofiglsqd'
conn_user = 'ofiglsqd'

# The password is prompted for so it is never stored in the notebook.
conn_pass = getpass.getpass(prompt='password: ')

# Only .csv files are loaded (os.listdir also returns checkpoints, hidden files, ...);
# sorting makes the load order reproducible.
csv_files = sorted(name for name in os.listdir(folder_name) if name.lower().endswith('.csv'))

# loop through .csv files in the output folder
for file in csv_files:

    print('\n\n' + str(datetime.utcnow()) + ' ' + str(file) + ' to be loaded')

    print(str(datetime.utcnow()) + ' reading file to dataframe...')

    # read .csv file into dataframe
    df = pd.read_csv(os.path.join(folder_name, file), na_values=['nan','NA','NaN'])

    print(str(datetime.utcnow()) + ' completed')

    # info() prints its report itself and returns None, so it is not wrapped in print()
    df.info(memory_usage='deep')

    # The table is named after the file: extension dropped, lower-cased, '-' -> '_'.
    table_name = os.path.splitext(file)[0].lower().replace('-', '_')
    # Quote the identifier for COPY the same way the CREATE TABLE statement quotes it.
    quoted_table = '"' + table_name.replace('"', '""') + '"'

    print(str(datetime.utcnow()) + ' reading file to memory using StringIO...')

    output = io.StringIO()
    df.to_csv(output, sep='|', header=False, index=False)
    output.seek(0)

    print(str(datetime.utcnow()) + ' completed')

    # psycopg2's `with connection:` only ends the transaction, it does not close the
    # connection, so the connection is closed explicitly once the file is done.
    conn = psycopg2.connect(host=conn_host, dbname=conn_dbname, user=conn_user, password=conn_pass)
    try:
        conn.autocommit = True

        print(str(datetime.utcnow()) + ' generating the create table statement...')

        qry = pd.io.sql.get_schema(df, table_name, con=conn)
        qry = qry.replace('CREATE TABLE', 'CREATE TABLE IF NOT EXISTS', 1)
        qry = _force_boolean_columns(qry, df)

        print(str(datetime.utcnow()) + ' completed')
        print(qry)

        try:
            with conn.cursor() as cur:
                print(str(datetime.utcnow()) + ' executing the create table statement...')
                cur.execute(qry)
                print(str(datetime.utcnow()) + ' completed')

                print(str(datetime.utcnow()) + ' loading table to database...')
                cur.copy_expert("""COPY %s FROM STDIN WITH (FORMAT csv, DELIMITER '|', QUOTE '"')""" % quoted_table, output)
                print(str(datetime.utcnow()) + ' completed')

        except Exception as e:
            # Best effort: report the failure and carry on with the next file.
            print('Error:\n' + str(e))
    finally:
        conn.close()


In [None]:
# Append the financial DataFrames to their PostgreSQL tables.
# All three writes share the same options: use the common engine, append to the table
# and write the DataFrame index as a column.
_financial_to_sql_options = dict(con=engine, if_exists='append', index=True)

revenue_df.to_sql(name='revenue', **_financial_to_sql_options)
netincome_df.to_sql(name='net_income', **_financial_to_sql_options)
earnings_df.to_sql(name='earnings', **_financial_to_sql_options)

In [None]:
# Check tables in database
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0; the Inspector
# returns the same list of table names and is also available on older releases.
from sqlalchemy import inspect

inspect(engine).get_table_names()

In [None]:
# Close connection
# Releases the connections held by the engine created in the loading cell above.
engine.dispose()