### Read sales-records from S3

In [1]:
import boto3
import pandas as pd

# Location of the sales CSV: bucket name and object key.
bucket_name = 'forever21etl'
file_name = 'sales-records.csv'

# Client created without explicit credentials or region; the cells shown
# below do not reference it.
s3 = boto3.client('s3')

# The key pair is read from the first row of a CSV two directories above the
# working directory, then handed to a region-pinned session's S3 client.
access_key = pd.read_csv("../../ankur_accessKeys_s3.csv")
sess = boto3.Session(region_name="us-east-2")
client = sess.client(
    's3',
    aws_access_key_id=access_key["Access key ID"][0],
    aws_secret_access_key=access_key["Secret access key"][0],
)

In [2]:
def read_file_from_s3(bucket_name, file_name, s3_client=None):
    """Download an S3 object and return its contents as text.

    Args:
        bucket_name: Name of the bucket that holds the object.
        file_name: Key of the object inside the bucket.
        s3_client: Optional object exposing
            ``get_object(Bucket=..., Key=...)``. When omitted, the
            notebook-level ``client`` is used, exactly as before.

    Returns:
        The object's body decoded as UTF-8 text.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    if s3_client is None:
        # Fall back to the global. A later cell rebinds `client` to a
        # BigQuery client, so pass the S3 client explicitly when calling
        # this after that cell has run.
        s3_client = client
    obj = s3_client.get_object(Bucket=bucket_name, Key=file_name)
    return obj['Body'].read().decode('utf-8')
# Fetch the raw CSV text; at this point `data` is a plain string.
data = read_file_from_s3(bucket_name=bucket_name, file_name=file_name)

In [3]:
# Persist the downloaded text in the working directory so pandas can read it.
# The encoding is pinned to UTF-8 because the text was decoded as UTF-8 and
# pd.read_csv assumes UTF-8 by default; relying on the platform locale
# encoding can fail or corrupt non-ASCII characters. newline='' turns off
# newline translation so the file keeps the line endings of the S3 object.
with open(file_name, 'w', encoding='utf-8', newline='') as out:
    out.write(data)

In [4]:
# Parse the CSV written by the previous cell. From here on `data` is a
# DataFrame rather than the raw text downloaded from S3.
data = pd.read_csv(filepath_or_buffer=file_name)
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,ID,Region,Country,Item_Type,Sales_Channel,Order_Priority,Order_Date,Order_ID,Ship_Date,Units_Sold,Unit_Price,Unit_Cost,Total_Revenue,Total_Cost,Total_Profit
0,1,Central America and the Caribbean,Antigua and Barbuda,Baby Food,Online,M,12/20/2013,957081544,1/11/2014,552,255.28,159.42,140914.56,87999.84,52914.72
1,2,Central America and the Caribbean,Panama,Snacks,Offline,C,7/5/2010,301644504,7/26/2010,2167,152.58,97.44,330640.86,211152.48,119488.38
2,3,Europe,Czech Republic,Beverages,Offline,C,9/12/2011,478051030,9/29/2011,4778,47.45,31.79,226716.1,151892.62,74823.48
3,4,Asia,North Korea,Cereal,Offline,L,5/13/2010,892599952,6/15/2010,9016,205.7,117.11,1854591.2,1055863.76,798727.44
4,5,Asia,Sri Lanka,Snacks,Offline,C,7/20/2015,571902596,7/27/2015,7542,152.58,97.44,1150758.36,734892.48,415865.88


### Load to BigQuery

In [5]:
from google.cloud import bigquery
import os

In [6]:
# Point the Google client libraries at the service-account key file
# (path is relative to the working directory).
os.environ.update({"GOOGLE_APPLICATION_CREDENTIALS": "test7151991-8eba3fa62ea8.json"})

In [7]:
# BigQuery client built with no explicit arguments, so it relies on the
# credentials configured through the environment in the previous cell.
# NOTE: this rebinds the global `client`, which until now held the boto3 S3
# client that read_file_from_s3 uses; re-running the S3 read after this cell
# will hit the BigQuery client instead.
client = bigquery.Client()
client

<google.cloud.bigquery.client.Client at 0x22a3f7490d0>

In [8]:
# Reference to table `data` in dataset `clothing` of the client's default
# project. Client.dataset() is deprecated, so the DatasetReference is built
# directly; the resulting TableReference is the same.
tableRef = bigquery.DatasetReference(client.project, "clothing").table("data")
tableRef

TableReference(DatasetReference('test7151991', 'clothing'), 'data')

In [11]:
# Replace the table contents on every run. Without an explicit write
# disposition the load job appends, so re-running this cell would duplicate
# every row in the destination table.
load_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
)
bigqueryJob = client.load_table_from_dataframe(data, tableRef, job_config=load_config)
# Block until the load job finishes; result() raises if the job failed.
bigqueryJob.result()

LoadJob<project=test7151991, location=US, id=8ce03922-c9d2-4d9e-8fa1-2d46051c3c68>

### Query BigQuery table

In [12]:
import bigframes.pandas as bpd

In [32]:
# Row count per region in the loaded table; the result has two columns,
# `region` and `cnt`.
query_statement = """              
                    select region, count(*) cnt
                    from test7151991.clothing.data
                    group by region
                    """

In [33]:
# Run the SQL through BigQuery DataFrames. Despite its name, `query` holds
# the result frame, not the SQL text (that is `query_statement`).
query = bpd.read_gbq(query_statement)
# (rows, columns) of the result.
query.shape

(7, 2)

In [35]:
# Show regions ordered from most to fewest rows.
query.sort_values(by="cnt", ascending=False)

Unnamed: 0,region,cnt
0,Europe,1330
2,Sub-Saharan Africa,1285
5,Asia,719
6,Middle East and North Africa,610
4,Central America and the Caribbean,534
3,Australia and Oceania,416
1,North America,106
