In [1]:
import pandas as pd
from google.cloud import aiplatform as vertexai
from pandas_gbq import to_gbq
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]

BQ_LOCATION = 'US'
REGION = 'us-central1'

In [3]:
# Bare bucket name (no scheme); the "gs://" prefix is added where the
# staging bucket URI is built for vertexai.init.
GCS_BUCKET = "khue-capstone-bucket"
# Dataset identifier in "<project>:<dataset>" form, built from the active project.
BQ_DATASET = f"{PROJECT_ID}:khue_capstone"

In [4]:
# Initialise the Vertex AI SDK for the active project and region, using the
# capstone bucket as the staging location.
vertexai.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=f"gs://{GCS_BUCKET}",
)

In [40]:
%%bigquery hits_df --project $PROJECT_ID
-- Non-bounced sessions (totals.bounces IS NULL) from the November 2016 shards
-- of the public Google Analytics sample, joined to per-hit product details.
WITH target_table AS (
  -- One row per session / hit hour / hit minute / product name / product
  -- category, with the average product price and an add_to_cart flag.
  SELECT
    CONCAT(fullVisitorId, visitId) AS session_id,
    hit.hour AS hour,
    hit.minute AS minute,
    product.v2ProductCategory AS product_category,
    product.v2ProductName AS product_name,
    CAST(AVG(product.localProductPrice) AS INT64) AS product_price,
    -- 1 if any hit in the group has eCommerceAction.action_type = '3', else 0
    MAX(IF(hit.eCommerceAction.action_type = '3', 1, 0)) AS add_to_cart
  FROM
    `bigquery-public-data.google_analytics_sample.ga_sessions_*`,
    UNNEST(hits) AS hit
    LEFT JOIN UNNEST(hit.product) AS product
  WHERE
    _TABLE_SUFFIX BETWEEN '20161101' AND '20161130'
  GROUP BY
    session_id,
    hit.hour,
    hit.minute,
    product_name,
    product_category
)
SELECT DISTINCT
  CONCAT(fullVisitorId, visitId) AS session_id,
  -- date is a 'YYYYMMDD' string; slice it into its parts
  SUBSTR(date, 7, 2) AS day,
  SUBSTR(date, 5, 2) AS month,
  SUBSTR(date, 1, 4) AS year,
  totals.hits,
  totals.pageviews AS page_views,
  totals.bounces,
  totals.timeOnSite AS time_on_site,
  target_table.hour,
  target_table.minute,
  device.deviceCategory AS device,
  geoNetwork.subContinent AS sub_continent,
  geoNetwork.country,
  target_table.product_category,
  target_table.product_name,
  target_table.product_price,
  target_table.add_to_cart
FROM
  `bigquery-public-data.google_analytics_sample.ga_sessions_*`
LEFT JOIN target_table
  ON CONCAT(fullVisitorId, visitId) = target_table.session_id
WHERE
  _TABLE_SUFFIX BETWEEN '20161101' AND '20161130'
  AND totals.bounces IS NULL
ORDER BY
  session_id

Query is running:   0%|          |

Downloading:   0%|          |

In [41]:
# Preview the first rows, then report the frame's dimensions and the number
# of missing values in each column.
display(hits_df.head())
print(hits_df.shape)
print(hits_df.isna().sum())

Unnamed: 0,session_id,day,month,year,hits,page_views,bounces,time_on_site,hour,minute,device,sub_continent,country,product_category,product_name,product_price,add_to_cart
0,00000204243422487471480578901,30,11,2016,17,13,,297,23,56,desktop,South America,Peru,Home/Apparel/Men's/,Google Men's 100% Cotton Short Sleeve Hero Tee...,16990000,0
1,00000204243422487471480578901,30,11,2016,17,13,,297,23,55,desktop,South America,Peru,Home/Bags/Backpacks/,Waterproof Backpack,99990000,0
2,00000204243422487471480578901,30,11,2016,17,13,,297,23,58,desktop,South America,Peru,Home/Electronics/,Aluminum Handy Emergency Flashlight,16990000,0
3,00000204243422487471480578901,30,11,2016,17,13,,297,23,55,desktop,South America,Peru,Home/Bags/Backpacks/,Google Rucksack,69990000,0
4,00000204243422487471480578901,30,11,2016,17,13,,297,23,56,desktop,South America,Peru,Home/Apparel/Men's/,Google Men's Short Sleeve Hero Tee Heather,18990000,0


(1236880, 17)
session_id                0
day                       0
month                     0
year                      0
hits                      0
page_views               10
bounces             1236880
time_on_site            786
hour                      0
minute                    0
device                    0
sub_continent             0
country                   0
product_category      99205
product_name          99205
product_price         99205
add_to_cart               0
dtype: int64


In [42]:
# Show the dtype pandas assigned to each column of the query result.
print(hits_df.dtypes)

session_id          object
day                 object
month               object
year                object
hits                 Int64
page_views           Int64
bounces              Int64
time_on_site         Int64
hour                 Int64
minute               Int64
device              object
sub_continent       object
country             object
product_category    object
product_name        object
product_price        Int64
add_to_cart          Int64
dtype: object


In [54]:
# Start cleaning from a copy so hits_df itself is not modified.
cleaned_hits_df = hits_df.copy()

In [55]:
# Remove the session_id and bounces columns; bounces is null on every row
# because the query only keeps sessions where totals.bounces IS NULL.
cleaned_hits_df = cleaned_hits_df.drop(columns=['session_id', 'bounces'])

In [56]:
# Discard every row that has a missing value in any remaining column.
cleaned_hits_df = cleaned_hits_df.dropna(axis=0, how='any')

In [57]:
# Report the cleaned frame's dimensions and the per-column count of
# missing values.
print(cleaned_hits_df.shape)
print(cleaned_hits_df.isna().sum())

(1136937, 15)
day                 0
month               0
year                0
hits                0
page_views          0
time_on_site        0
hour                0
minute              0
device              0
sub_continent       0
country             0
product_category    0
product_name        0
product_price       0
add_to_cart         0
dtype: int64


In [47]:
# Preview the first five rows of the cleaned frame.
display(cleaned_hits_df.head())

Unnamed: 0,day,month,year,hits,page_views,time_on_site,hour,minute,device,sub_continent,country,product_category,product_name,product_price,add_to_cart
0,30,11,2016,17,13,297,23,56,desktop,South America,Peru,Home/Apparel/Men's/,Google Men's 100% Cotton Short Sleeve Hero Tee...,16990000,0
1,30,11,2016,17,13,297,23,55,desktop,South America,Peru,Home/Bags/Backpacks/,Waterproof Backpack,99990000,0
2,30,11,2016,17,13,297,23,58,desktop,South America,Peru,Home/Electronics/,Aluminum Handy Emergency Flashlight,16990000,0
3,30,11,2016,17,13,297,23,55,desktop,South America,Peru,Home/Bags/Backpacks/,Google Rucksack,69990000,0
4,30,11,2016,17,13,297,23,56,desktop,South America,Peru,Home/Apparel/Men's/,Google Men's Short Sleeve Hero Tee Heather,18990000,0


In [48]:
cleaned_hits_df['product_category'] = cleaned_hits_df['product_category'].str.strip('/') # strip leading/trailing '/' from product_category values (interior '/' separators are kept)

In [49]:
# The date parts arrive as strings; turn each one into a numeric column.
for date_part in ('day', 'month', 'year'):
    cleaned_hits_df[date_part] = pd.to_numeric(cleaned_hits_df[date_part])

# Prices are stored multiplied by 10^6; scale back and keep two decimals.
# NOTE(review): re-running this cell divides product_price again.
price_scale = 10 ** 6
cleaned_hits_df['product_price'] = (cleaned_hits_df['product_price'] / price_scale).round(2)

In [50]:
# Check the dtypes of the converted date-part columns.
print(cleaned_hits_df.dtypes.loc[['day', 'month', 'year']])

day      int64
month    int64
year     int64
dtype: object


In [51]:
# Preview the first five rows after the category and numeric clean-up.
display(cleaned_hits_df.head())

Unnamed: 0,day,month,year,hits,page_views,time_on_site,hour,minute,device,sub_continent,country,product_category,product_name,product_price,add_to_cart
0,30,11,2016,17,13,297,23,56,desktop,South America,Peru,Home/Apparel/Men's,Google Men's 100% Cotton Short Sleeve Hero Tee...,16.99,0
1,30,11,2016,17,13,297,23,55,desktop,South America,Peru,Home/Bags/Backpacks,Waterproof Backpack,99.99,0
2,30,11,2016,17,13,297,23,58,desktop,South America,Peru,Home/Electronics,Aluminum Handy Emergency Flashlight,16.99,0
3,30,11,2016,17,13,297,23,55,desktop,South America,Peru,Home/Bags/Backpacks,Google Rucksack,69.99,0
4,30,11,2016,17,13,297,23,56,desktop,South America,Peru,Home/Apparel/Men's,Google Men's Short Sleeve Hero Tee Heather,18.99,0


In [52]:
# Take an independent copy of the cleaned frame under the name cleaned_df.
cleaned_df = cleaned_hits_df.copy()

In [53]:
# Persist cleaned_df with IPython's %store magic; it can be restored in
# another kernel session with `%store -r cleaned_df`.
%store cleaned_df

Stored 'cleaned_df' (DataFrame)


In [51]:
# Upload the cleaned frame to BigQuery, replacing the table if it exists.
# The project comes from PROJECT_ID rather than a hardcoded literal so the
# write targets the same project the %%bigquery cells query via
# `--project $PROJECT_ID`.
destination_table = f"{PROJECT_ID}.khue_capstone.hits_data"
to_gbq(cleaned_df, destination_table, project_id=PROJECT_ID, if_exists='replace')

100%|██████████| 1/1 [00:00<00:00, 8981.38it/s]


In [5]:
%%bigquery hits_df --project $PROJECT_ID
-- Load every column of khue_capstone.hits_data into the DataFrame hits_df.
-- NOTE(review): this rebinds hits_df, the same name the raw extraction query
-- cell assigns; presumably this is the table written by the to_gbq cell.
SELECT * FROM khue_capstone.hits_data;

Query is running:   0%|          |

Downloading:   0%|          |

In [7]:
# Report the (rows, columns) of the frame currently bound to hits_df.
print(hits_df.shape)

(1136937, 15)
