In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [10]:
from google.cloud import bigquery
from google.oauth2 import service_account

In [11]:
key_path = '../service_account/gentle-keyword-423715-j0-03be08ad6412.json'

credentials = service_account.Credentials.from_service_account_file(
    key_path,
    scopes=["https://www.googleapis.com/auth/bigquery"]
)

In [12]:
from google.cloud import bigquery

client = bigquery.Client(
    credentials = credentials,
    project=credentials.project_id
)

# SELECTED CATEGORY

In [51]:
df_cat = pd.read_csv('./data_modeling.csv')

for i in ['total_hits', 'total_visits', 'total_page_views', 'total_first_visits', 'total_time_on_site', 'total_transactions']:
    df_cat[i] = df_cat[i].astype('int')

df_cat['date'] = pd.to_datetime(df_cat['date'])
print(df_cat.info())
df_cat.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2170 entries, 0 to 2169
Data columns (total 22 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   category                               2170 non-null   object        
 1   date                                   2170 non-null   datetime64[ns]
 2   total_hits                             2170 non-null   int32         
 3   total_visits                           2170 non-null   int32         
 4   total_page_views                       2170 non-null   int32         
 5   total_first_visits                     2170 non-null   int32         
 6   total_time_on_site                     2170 non-null   int32         
 7   avg_time_on_site                       2170 non-null   float64       
 8   total_transactions                     2170 non-null   int32         
 9   total_revenue                          2170 non-null   float64 

Unnamed: 0,category,date,total_hits,total_visits,total_page_views,total_first_visits,total_time_on_site,avg_time_on_site,total_transactions,total_revenue,...,medium_not_set,medium_affiliate,medium_cpc,medium_cpm,medium_organic,medium_referral,action_Check_out,action_Click_through_of_product_lists,action_Completed_purchase,action_Product_detail_views
0,Accessories,2016-08-01,4566,89,3027,33,87104,563.19585,20,1459300000.0,...,0,0,3,3,57,14,0,146,0,121
1,Accessories,2016-08-02,5619,94,3713,40,107344,815.891196,10,629920000.0,...,0,0,14,0,102,2,0,151,0,133
2,Accessories,2016-08-03,6343,113,4077,58,116221,986.585195,0,0.0,...,0,0,15,2,140,6,0,179,0,162
3,Accessories,2016-08-04,5006,95,3549,68,141102,805.315385,1,122266700.0,...,0,8,2,0,93,13,0,152,0,135
4,Accessories,2016-08-05,6448,110,4578,70,139769,799.491296,28,3990420000.0,...,0,7,20,0,192,0,0,180,0,150


# TOP PRODUCT

In [8]:
df_prod = pd.read_csv('../result/top_product.csv')


print(df_prod.info())
df_prod.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 16 columns):
 #   Column                                 Non-Null Count  Dtype 
---  ------                                 --------------  ----- 
 0   category                               75 non-null     object
 1   product                                75 non-null     object
 2   visits                                 75 non-null     int64 
 3   time_on_site                           75 non-null     int64 
 4   transactions                           75 non-null     int64 
 5   revenue                                75 non-null     int64 
 6   medium_none                            75 non-null     int64 
 7   medium_affiliate                       75 non-null     int64 
 8   medium_cpc                             75 non-null     int64 
 9   medium_cpm                             75 non-null     int64 
 10  medium_organic                         75 non-null     int64 
 11  medium_referral      

Unnamed: 0,category,product,visits,time_on_site,transactions,revenue,medium_none,medium_affiliate,medium_cpc,medium_cpm,medium_organic,medium_referral,action_Check_out,action_Click_through_of_product_lists,action_Completed_purchase,action_Product_detail_views
0,Accessories,8 pc Android Sticker Sheet,366,373644,109,24754460000,228,2,11,2,105,18,0,196,0,170
1,Accessories,Android Sticker Sheet Ultra Removable,465,420577,91,10802510000,220,2,26,7,201,9,0,253,0,212
2,Accessories,Basecamp Explorer Powerbank Flashlight,81,97267,13,3518450000,40,0,0,0,39,2,0,47,0,34
3,Accessories,Google Car Clip Phone Holder,123,98510,14,668220000,78,0,12,6,24,3,0,73,0,50
4,Accessories,Google Device Stand,20,18840,4,78280000,7,0,2,4,7,0,0,10,0,10


In [55]:
# Get the schema of the result
schema = [
    bigquery.SchemaField("category", "STRING"),
    bigquery.SchemaField("product", "STRING"),
    bigquery.SchemaField("total_hits", "INTEGER"),
    bigquery.SchemaField("total_visits", "INTEGER"),
    bigquery.SchemaField("total_page_views", "INTEGER"),
    bigquery.SchemaField("total_first_visits", "INTEGER"),
    bigquery.SchemaField("total_time_on_site", "INTEGER"),
    bigquery.SchemaField("avg_time_on_site", "FLOAT"),
    bigquery.SchemaField("total_transactions", "INTEGER"),
    bigquery.SchemaField("total_revenue", "FLOAT"),
    bigquery.SchemaField("avg_revenue", "FLOAT"),
]

# Define the new table reference
project_id = credentials.project_id
dataset_id = 'dummy'
table_id = 'top-products'
table_ref = client.dataset(dataset_id, project=project_id).table(table_id)

# Delete the existing destination table if it exists
try:
    client.delete_table(table_ref)
    print(f"Deleted table {table_ref}")
except Exception as e:
    print(f"Table {table_ref} does not exist: {e}")
        
# Create a new table with the schema
table = bigquery.Table(table_ref, schema=schema)
table = client.create_table(table)  # API request


# Load DataFrame to BigQuery table
job = client.load_table_from_dataframe(df_prod, table_ref)

# Wait for the load job to complete
job.result()

print(f"Loaded {job.output_rows} rows into {dataset_id}:{table_id}.")

Deleted table gentle-keyword-423715-j0.dummy.top-products
Loaded 82 rows into dummy:top-products.


In [52]:
df_cat['category'].unique()

array(['Accessories', 'Electronics', "Men's", 'Office', 'Shop by Brand',
       "Women's"], dtype=object)

In [53]:
# Get the schema of the result
schema = [
    bigquery.SchemaField("category", "STRING"),
    bigquery.SchemaField("date", "DATE"),
    bigquery.SchemaField("total_hits", "INTEGER"),
    bigquery.SchemaField("total_visits", "INTEGER"),
    bigquery.SchemaField("total_page_views", "INTEGER"),
    bigquery.SchemaField("total_first_visits", "INTEGER"),
    bigquery.SchemaField("total_time_on_site", "INTEGER"),
    bigquery.SchemaField("avg_time_on_site", "FLOAT"),
    bigquery.SchemaField("total_transactions", "INTEGER"),
    bigquery.SchemaField("total_revenue", "FLOAT"),
    bigquery.SchemaField("avg_revenue", "FLOAT"),
    bigquery.SchemaField("medium_none", "INTEGER"),
    bigquery.SchemaField("medium_not_set", "INTEGER"),
    bigquery.SchemaField("medium_affiliate", "INTEGER"),
    bigquery.SchemaField("medium_cpc", "INTEGER"),
    bigquery.SchemaField("medium_cpm", "INTEGER"),
    bigquery.SchemaField("medium_organic", "INTEGER"),
    bigquery.SchemaField("medium_referral", "INTEGER"),
    bigquery.SchemaField("action_Check_out", "INTEGER"),
    bigquery.SchemaField("action_Click_through_of_product_lists", "INTEGER"),
    bigquery.SchemaField("action_Completed_purchase", "INTEGER"),
    bigquery.SchemaField("action_Product_detail_views", "INTEGER")
]

# Define the new table reference
project_id = credentials.project_id
dataset_id = 'dummy'
table_id = 'data-main'
table_ref = client.dataset(dataset_id, project=project_id).table(table_id)

# Delete the existing destination table if it exists
try:
    client.delete_table(table_ref)
    print(f"Deleted table {table_ref}")
except Exception as e:
    print(f"Table {table_ref} does not exist: {e}")
        
# Create a new table with the schema
table = bigquery.Table(table_ref, schema=schema)
table = client.create_table(table)  # API request


# Load DataFrame to BigQuery table
job = client.load_table_from_dataframe(df_cat, table_ref)

# Wait for the load job to complete
job.result()

print(f"Loaded {job.output_rows} rows into {dataset_id}:{table_id}.")

Deleted table gentle-keyword-423715-j0.dummy.data-main
Loaded 2170 rows into dummy:data-main.


In [43]:
# categories = ['Accessories', 'Electronics', 'Office', 'Men\'s', 'Women\'s', 'Shop by Brand']
# for cat in categories:
#     df_exp = pd.read_csv(f'./prediction-revenue-{cat}.csv')
#     df_exp['date'] = pd.to_datetime(df_exp['date'])
#     # Get the schema of the result
#     schema = [
#         bigquery.SchemaField("date", "DATE"),
#         bigquery.SchemaField("total_revenue", "FLOAT"),
#         bigquery.SchemaField("pred_total_revenue", "FLOAT"),
#         bigquery.SchemaField("category", "STRING"),
#     ]

#     # Define the new table reference
#     project_id = credentials.project_id
#     dataset_id = 'dummy'
#     name_table = cat.replace("'", "").replace(" ", "")
#     table_id = f'{name_table}'
#     table_ref = client.dataset(dataset_id, project=project_id).table(table_id)

#     # Delete the existing destination table if it exists
#     try:
#         client.delete_table(table_ref)
#         print(f"Deleted table {table_ref}")
#     except Exception as e:
#         print(f"Table {table_ref} does not exist: {e}")

#     # Create a new table with the schema
#     table = bigquery.Table(table_ref, schema=schema)
#     table = client.create_table(table)  # API request


#     # Load DataFrame to BigQuery table
#     job = client.load_table_from_dataframe(df_exp, table_ref)

#     # Wait for the load job to complete
#     job.result()

#     print(f"Loaded {job.output_rows} rows into {dataset_id}:{table_id}.")

Deleted table gentle-keyword-423715-j0.dummy.Accessories
Loaded 251 rows into dummy:Accessories.
Deleted table gentle-keyword-423715-j0.dummy.Electronics
Loaded 276 rows into dummy:Electronics.
Deleted table gentle-keyword-423715-j0.dummy.Office
Loaded 276 rows into dummy:Office.
Table gentle-keyword-423715-j0.dummy.Mens does not exist: 404 DELETE https://bigquery.googleapis.com/bigquery/v2/projects/gentle-keyword-423715-j0/datasets/dummy/tables/Mens?prettyPrint=false: Not found: Table gentle-keyword-423715-j0:dummy.Mens
Loaded 274 rows into dummy:Mens.
Table gentle-keyword-423715-j0.dummy.Womens does not exist: 404 DELETE https://bigquery.googleapis.com/bigquery/v2/projects/gentle-keyword-423715-j0/datasets/dummy/tables/Womens?prettyPrint=false: Not found: Table gentle-keyword-423715-j0:dummy.Womens
Loaded 264 rows into dummy:Womens.
Table gentle-keyword-423715-j0.dummy.ShopbyBrand does not exist: 404 DELETE https://bigquery.googleapis.com/bigquery/v2/projects/gentle-keyword-423715-j0

In [47]:
df = pd.DataFrame()
categories = ['Accessories', 'Electronics', 'Office', 'Men\'s', 'Women\'s', 'Shop by Brand']
for cat in categories:
    df_exp = pd.read_csv(f'./prediction-revenue-{cat}.csv')
    df = pd.concat([df, df_exp])

df['date'] = pd.to_datetime(df['date'])
print(df.info())
df.head()

Unnamed: 0,date,total_revenue,pred_total_revenue,category
0,2016-08-01,4377900000.0,2533989000.0,Accessories
1,2016-08-02,1889760000.0,2757827000.0,Accessories
2,2016-08-03,0.0,3003203000.0,Accessories
3,2016-08-04,366800000.0,2802602000.0,Accessories
4,2016-08-05,11971260000.0,2828253000.0,Accessories


In [49]:
# Get the schema of the result
schema = [
    bigquery.SchemaField("date", "DATE"),
    bigquery.SchemaField("total_revenue", "FLOAT"),
    bigquery.SchemaField("pred_total_revenue", "FLOAT"),
    bigquery.SchemaField("category", "STRING"),
]

# Define the new table reference
project_id = credentials.project_id
dataset_id = 'dummy'
name_table = cat.replace("'", "").replace(" ", "")
table_id = f'prediction-revenue'
table_ref = client.dataset(dataset_id, project=project_id).table(table_id)

# Delete the existing destination table if it exists
try:
    client.delete_table(table_ref)
    print(f"Deleted table {table_ref}")
except Exception as e:
    print(f"Table {table_ref} does not exist: {e}")

# Create a new table with the schema
table = bigquery.Table(table_ref, schema=schema)
table = client.create_table(table)  # API request


# Load DataFrame to BigQuery table
job = client.load_table_from_dataframe(df, table_ref)

# Wait for the load job to complete
job.result()

print(f"Loaded {job.output_rows} rows into {dataset_id}:{table_id}.")

Deleted table gentle-keyword-423715-j0.dummy.prediction-revenue
Loaded 1617 rows into dummy:prediction-revenue.
