In [311]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [312]:
from google.cloud import bigquery
from google.oauth2 import service_account

In [313]:
# Location of the service-account key file. The BIGQUERY_KEY_PATH environment
# variable takes precedence so the notebook is not tied to one machine's key
# file; the original relative path remains the fallback.
key_path = os.environ.get(
    'BIGQUERY_KEY_PATH',
    './service_account/gentle-keyword-423715-j0-03be08ad6412.json',
)

# Load the service-account credentials, restricted to the BigQuery scope.
credentials = service_account.Credentials.from_service_account_file(
    key_path,
    scopes=["https://www.googleapis.com/auth/bigquery"]
)

In [314]:
from google.cloud import bigquery

# BigQuery client billed to the project that owns the service account.
client = bigquery.Client(project=credentials.project_id, credentials=credentials)

In [316]:
# Create dataset reference. Client.dataset() is deprecated, so the reference
# is constructed directly (project first, then dataset id).
dataset_ref = bigquery.DatasetReference('bigquery-public-data', 'google_analytics_sample')
# Retrieve dataset from reference (API request)
dataset = client.get_dataset(dataset_ref)

In [None]:
# Smoke test: fetch a handful of rows to confirm the client can reach the
# public dataset.

query = """
SELECT *
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_20160801`
LIMIT 5
"""

df = client.query(query).to_dataframe()
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
df

In [None]:
# Format a schema field, including its nested RECORD sub-fields, for display

def format_schema_field(schema_field, indent=0):
    """Render a schema field, and any nested sub-fields, as indented text.

    Args:
        schema_field: Object exposing ``name``, ``field_type``, ``mode``,
            ``description`` and, for ``RECORD`` fields, ``fields``.
        indent: Indentation level of this field; each level is two spaces.

    Returns:
        A string with one line per field. The mode is appended only when it is
        not ``NULLABLE`` and the description only when it is truthy. Sub-fields
        of a ``RECORD`` follow on their own lines, two levels deeper.
    """
    header = "  " * indent + f"{schema_field.name} ({schema_field.field_type})"

    suffixes = []
    if schema_field.mode != "NULLABLE":
        suffixes.append(schema_field.mode)
    if schema_field.description:
        suffixes.append(schema_field.description)

    lines = [header + "".join(f" - {suffix}" for suffix in suffixes)]

    if schema_field.field_type == "RECORD":
        lines.extend(
            format_schema_field(child, indent + 2) for child in schema_field.fields
        )

    return "\n".join(lines)


# Fetch the table metadata (API request) so its schema can be inspected.
table_ref = dataset_ref.table('ga_sessions_20160801')
table = client.get_table(table_ref)

# Look the nested columns up by name. Positional indexes such as
# table.schema[5] silently point at a different column whenever the schema
# order is not the one that was assumed.
schema_by_name = {field.name: field for field in table.schema}
nested_columns = ['totals', 'trafficSource', 'device', 'geoNetwork', 'customDimensions', 'hits']

# Display schemas: a heading, a blank line, the formatted field, and two
# blank lines between consecutive sections.
print("\n\n\n".join(
    f"SCHEMA field for the '{column}' column:\n\n{format_schema_field(schema_by_name[column])}"
    for column in nested_columns
))

In [None]:
# Build one row of aggregated `totals` metrics per product, restricted to
# United States sessions and to e-commerce action types other than 0, 3 and 4.
# NOTE(review): `totals.*` are read from the session row, which the two UNNESTs
# repeat once per hit/product, so the SUMs presumably count a session several
# times - confirm this is intended.

query = """
SELECT
  product.v2ProductName as product,
  SUM(totals.hits) as total_hits,
  SUM(totals.visits) as total_visits,
  SUM(totals.pageviews) as total_page_views,
  SUM(totals.newVisits) as total_first_visits,
  SUM(totals.timeOnScreen) as total_time_on_screen,
  AVG(totals.timeOnScreen) as avg_time_on_screen,
  SUM(totals.timeOnSite) as total_time_on_site,
  AVG(totals.timeOnSite) as avg_time_on_site,
  SUM(totals.screenviews) as total_screen_views,
  SUM(totals.transactions) as total_transactions,
  SUM(totals.transactionRevenue) as total_revenue,
  AVG(totals.transactionRevenue) as avg_revenue,
  SUM(totals.uniqueScreenviews) as total_unique_screen_views,
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*` h,
    UNNEST(h.hits) hits,
    UNNEST(hits.product) product
WHERE 
  (_TABLE_SUFFIX BETWEEN '20160801' AND '20170801')
  AND
    (
      hits.eCommerceAction.action_type != '0'
      AND
      hits.eCommerceAction.action_type != '3'
      AND
      hits.eCommerceAction.action_type != '4'
    )
  AND
  (geoNetwork.country = 'United States')
GROUP BY product
"""

df_totals = client.query(query).to_dataframe()
# info() prints its own report and returns None; print() around it only added
# a stray "None" line.
df_totals.info()
df_totals.head()

In [None]:
# Drop columns that are entirely null, then replace the remaining nulls with 0.
# Note that fillna(0) applies to every column, so a missing product name also
# becomes the integer 0.
df_totals = df_totals.dropna(axis=1, how='all').fillna(0)

# info() prints its own report and returns None; print() around it only added
# a stray "None" line.
df_totals.info()
df_totals.head()

In [None]:
# Summary statistics of the aggregated per-product metrics
df_totals.describe()

In [None]:
# One row per (hit, product) with the e-commerce action label and the traffic
# medium, using the same date range, action-type and country filters as the
# totals query.
query = """
SELECT
  product.v2ProductName as product,
  CASE 
    WHEN hits.eCommerceAction.action_type = '1' THEN 'Click through of product lists'
    WHEN hits.eCommerceAction.action_type = '2' THEN 'Product detail views'
    WHEN hits.eCommerceAction.action_type = '5' THEN 'Check out'
    WHEN hits.eCommerceAction.action_type = '6' THEN 'Completed purchase'
  END as action,
  trafficSource.medium as medium,
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*` h,
    UNNEST(h.hits) hits,
    UNNEST(hits.product) product
WHERE 
  (_TABLE_SUFFIX BETWEEN '20160801' AND '20170801')
  AND
    (
      hits.eCommerceAction.action_type != '0'
      AND
      hits.eCommerceAction.action_type != '3'
      AND
      hits.eCommerceAction.action_type != '4'
    )
  AND
    (geoNetwork.country = 'United States')
"""

df_hits_traffic = client.query(query).to_dataframe()
# info() prints its own report and returns None; print() around it only added
# a stray "None" line.
df_hits_traffic.info()
df_hits_traffic.head()

In [None]:
# Preview of the frame without the two categorical columns (result not stored)
df_hits_traffic.drop(columns=['medium', 'action'])

In [None]:
# One-hot encode medium and action, then swap the encoded columns in for the
# originals. The result is stored as `df_hit_traffic` (no "s"), a different
# name from the raw `df_hits_traffic`.
categorical_cols = ['medium', 'action']
df_ohe = pd.get_dummies(df_hits_traffic[categorical_cols], dtype=int)
df_hit_traffic = pd.concat(
    [df_hits_traffic.drop(columns=categorical_cols), df_ohe],
    axis=1,
)

def modify_columns(array):
    """Return the labels with parentheses removed and spaces turned into '_'.

    Args:
        array: Iterable of string labels.

    Returns:
        A list with one cleaned label per input label, in the same order.
    """
    cleanup = str.maketrans({'(': None, ')': None, ' ': '_'})
    return [label.translate(cleanup) for label in array]


# Normalise the generated column labels (no parentheses, no spaces)
df_hit_traffic.columns = modify_columns(df_hit_traffic.columns.to_numpy())

df_hit_traffic

In [None]:
# Collapse the one-hot rows into per-product counts of each medium / action.
df_hit_traffic = df_hit_traffic.groupby('product').sum().reset_index()

# info() prints its own report and returns None; print() around it only added
# a stray "None" line.
df_hit_traffic.info()
df_hit_traffic.head()

In [None]:
# Combine the per-product totals with the per-product medium/action counts.
# An outer join keeps products that appear in only one of the two frames.
df = pd.merge(df_totals, df_hit_traffic, how='outer', on='product')
# info() prints its own report and returns None; print() around it only added
# a stray "None" line.
df.info()
df.head()

In [None]:
# Feature matrix: every column except the product label
X = df.drop(columns='product')

In [None]:
from sklearn.ensemble import IsolationForest

# Fix the seed so the same products are flagged on every run; without it the
# forest is built from fresh random draws each time the cell executes.
model = IsolationForest(random_state=42)
model.fit(X)

In [None]:
# Score every product with the fitted model and store the raw prediction next
# to the features (1 / -1; these are mapped to 'Normal' / 'Anomaly' later in
# the notebook).
df_result = df.copy()

# Result
result = model.predict(X)

df_result['anomaly'] = result
# info() prints its own report and returns None; print() around it only added
# a stray "None" line.
df_result.info()
df_result.head()

In [None]:
# Re-score every product and replace the raw 1 / -1 prediction with a label.
result = model.predict(X)

df_result = df.assign(anomaly=result)
df_result['anomaly'] = df_result['anomaly'].map({1: 'Normal', -1: 'Anomaly'})

In [None]:
# Scatter plot total_visits vs total_transactions, coloured by anomaly label
import plotly.express as px

visits = df_result['total_visits']
transactions = df_result['total_transactions']

fig = px.scatter(
    x=visits,
    y=transactions,
    color=df_result['anomaly'],
    title="Sample Scatter Plot",
    labels={'x': 'Total Visits', 'y': 'Total Transactions'},
)

# Render the figure
fig.show()

In [None]:
# Keep only the rows flagged as anomalies. .copy() makes this an independent
# frame, so adding columns to it later does not raise SettingWithCopyWarning.
df_anomaly = df_result[df_result['anomaly'] == 'Anomaly'].copy()
# info() prints its own report and returns None; print() around it only added
# a stray "None" line.
df_anomaly.info()
df_anomaly.head()

In [None]:
# Scatter plot total_visits vs total_transactions for the flagged products
# only, with each point annotated by its product name

anomaly_visits = df_anomaly['total_visits']
anomaly_transactions = df_anomaly['total_transactions']

fig = px.scatter(
    x=anomaly_visits,
    y=anomaly_transactions,
    text=df_anomaly['product'],
    title="Sample Scatter Plot",
    labels={'x': 'Total Visits', 'y': 'Total Transactions'},
)

# Render the figure
fig.show()

In [None]:
# First five flagged products
df_anomaly.head(5)

In [None]:
# df_anomaly['first_visit_rate'] = df_anomaly['total_first_visits'] / df_anomaly['total_visits']

# df_anomaly[['product', 'first_visit_rate']].head()

In [None]:
# Names of the flagged products, ordered by descending transaction count
ranked_anomalies = df_anomaly.sort_values(by='total_transactions', ascending=False)
anomaly = ranked_anomalies['product'].values
anomaly

In [None]:
# Number of flagged products (`anomaly` is the 1-D array built from .values)
anomaly.shape[0]

In [None]:
# Aggregate the flagged products over the most recent tables only.
# NOTE(review): the suffix range below is 20170601..20170801, which looks like
# two months rather than the "3 months" used in the names - confirm.
#
# The product list is passed as an array query parameter instead of being
# pasted into the SQL as a Python tuple repr: the repr of a one-element tuple
# ("('x',)") is not valid SQL, and names are no longer spliced in unescaped.

query = """
SELECT 
  products.v2ProductName as product,
  SUM(totals.visits) as total_visits,
  SUM(totals.newVisits) as total_first_visits,
  SUM(totals.transactions) as total_transactions,
  SUM(totals.transactionRevenue) as total_revenue,
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*` h,
    UNNEST(h.hits) hits,
    UNNEST(hits.product) products
WHERE 
  (_TABLE_SUFFIX BETWEEN '20170601' AND '20170801')
  AND
    (
      hits.eCommerceAction.action_type != '0'
      AND
      hits.eCommerceAction.action_type != '3'
      AND
      hits.eCommerceAction.action_type != '4'
    )
  AND
  (geoNetwork.country = 'United States')
  AND
  products.v2ProductName IN UNNEST(@products)
GROUP BY product
ORDER BY total_first_visits, total_visits DESC
"""

# Only real names are sent: the earlier fillna(0) can leave a non-string
# placeholder in the product column, which a STRING array cannot carry.
job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter(
            "products", "STRING", [name for name in anomaly if isinstance(name, str)]
        )
    ]
)

df_anomaly_3months = client.query(query, job_config=job_config).to_dataframe()
# info() prints its own report and returns None; print() around it only added
# a stray "None" line.
df_anomaly_3months.info()
df_anomaly_3months.head()

In [None]:
# df_anomaly_3months['first_visit_rate'] = df_anomaly_3months['total_first_visits'] / df_anomaly_3months['total_visits']

In [None]:
# df_anomaly_3months = df_anomaly_3months[(df_anomaly_3months['first_visit_rate'] > 0.5) 
#                                         & (df_anomaly_3months['total_visits'] > 800)]

In [None]:
# Twenty flagged products with the most transactions in the recent window
df_anomaly_3months.sort_values('total_transactions', ascending=False).iloc[:20]

In [None]:
# Keep the names of the twenty products with the most transactions
ranked_recent = df_anomaly_3months.sort_values(by='total_transactions', ascending=False)
anomaly_3months = ranked_recent['product'].head(20)
print(len(anomaly_3months))
anomaly_3months

In [None]:
# Fetch the distinct (category, product) pairs of the selected products.
# The product list is passed as an array query parameter instead of being
# pasted into the SQL as a Python tuple repr, which breaks for a single
# product and splices names in unescaped.
query = """
SELECT 
  products.v2ProductCategory as category,
  products.v2ProductName as product,
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*` h,
    UNNEST(h.hits) hits,
    UNNEST(hits.product) products
WHERE 
  (_TABLE_SUFFIX BETWEEN '20170601' AND '20170801')
  AND
    (
      hits.eCommerceAction.action_type != '0'
      AND
      hits.eCommerceAction.action_type != '3'
      AND
      hits.eCommerceAction.action_type != '4'
    )
  AND
  (geoNetwork.country = 'United States')
  AND
  products.v2ProductName IN UNNEST(@products)
  GROUP BY product, category
"""

job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter("products", "STRING", list(anomaly_3months))
    ]
)

df_selection = client.query(query, job_config=job_config).to_dataframe()
# Report on the frame that was just loaded; the original called info() on the
# unrelated `df` and wrapped it in print(), which also printed a stray "None".
df_selection.info()
df_selection.head(20)

In [None]:
def categorized(x):
    """Pick the last known category name from a list of category path segments.

    Args:
        x: Sequence of path segments (e.g. the result of splitting a category
            path on '/'). ``None`` or a float such as NaN is treated as
            "no category", because that is what ``Series.str.split`` yields
            for a missing value.

    Returns:
        The last segment that is one of the known category names, or ``None``
        when no segment matches or the input is missing.
    """
    categories = ('Accessories', 'Electronics', 'Office', 'Men\'s', 'Women\'s',
                  'Kid\'s', 'Bags', 'Lifestyle', 'Apparel', 'Shop by Brand')

    # A missing category arrives as None/NaN rather than as a list; iterating
    # over it would raise TypeError.
    if x is None or isinstance(x, float):
        return None

    matches = [segment for segment in x if segment in categories]
    return matches[-1] if matches else None

# Resolve each category path to its last known category name.
df_selection['final_category'] = df_selection['category'].str.split('/').apply(categorized)

# Keep rows of the selected products that resolved to a known category.
# NOTE(review): the original filtered on `selected`, a name that is not defined
# in any visible cell (NameError on a fresh run). `anomaly_3months`, the list
# the query above was restricted to, is assumed to be the intended set - confirm.
# Both conditions are combined into one mask so the mask matches the frame it
# is applied to.
keep_rows = df_selection['product'].isin(anomaly_3months) & df_selection['final_category'].notna()
df_selection = df_selection[keep_rows]

In [None]:
# Table indexed by product with a single `final_category` column: one row per
# distinct (product, final_category) pair.
category_table = (
    df_selection
    .groupby(['product', 'final_category'])
    .agg({'category': 'first'})
    .reset_index('final_category')
    .drop(columns='category')
)
print(category_table.shape)
category_table

In [None]:
# Updated anomaly value: the products that still have a known category.
# `product` is the index of category_table (only `final_category` was reset
# to a column), so it is read from the index; category_table['product']
# raises KeyError.
anomaly_3months = category_table.index.unique()

In [None]:
# anomaly_6months.to_csv('./result/anomaly_products.csv', index=False)

In [None]:
# anomaly_6months.to_csv('./result/anomaly_products(1).csv', index=False)

In [None]:
# Per-product `totals` aggregates for the selected products over the recent
# window. Fixes relative to the original cell:
#   * the WHERE clause referenced `products`, but the UNNEST alias is `product`;
#   * the job result is converted with .to_dataframe(), since .info() and
#     .head() are DataFrame methods;
#   * the product list is an array query parameter instead of a tuple repr.
query = """
SELECT
  product.v2ProductName as product,
  SUM(totals.hits) as total_hits,
  SUM(totals.visits) as total_visits,
  SUM(totals.pageviews) as total_page_views,
  SUM(totals.newVisits) as total_first_visits,
  SUM(totals.timeOnScreen) as total_time_on_screen,
  AVG(totals.timeOnScreen) as avg_time_on_screen,
  SUM(totals.timeOnSite) as total_time_on_site,
  AVG(totals.timeOnSite) as avg_time_on_site,
  SUM(totals.screenviews) as total_screen_views,
  SUM(totals.transactions) as total_transactions,
  SUM(totals.transactionRevenue) as total_revenue,
  AVG(totals.transactionRevenue) as avg_revenue,
  SUM(totals.uniqueScreenviews) as total_unique_screen_views,
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*` h,
    UNNEST(h.hits) hits,
    UNNEST(hits.product) product
WHERE 
  (_TABLE_SUFFIX BETWEEN '20170601' AND '20170801')
  AND
    (
      hits.eCommerceAction.action_type != '0'
      AND
      hits.eCommerceAction.action_type != '3'
      AND
      hits.eCommerceAction.action_type != '4'
    )
  AND
  (geoNetwork.country = 'United States')
  AND
  (product.v2ProductName IN UNNEST(@products))
GROUP BY product
"""

job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter("products", "STRING", list(anomaly_3months))
    ]
)

df_3months_totals = client.query(query, job_config=job_config).to_dataframe()
# info() prints its own report and returns None; print() around it only added
# a stray "None" line.
df_3months_totals.info()
df_3months_totals.head()

In [None]:
# One row per (hit, product) with the action label and product category for
# the selected products. Fixes relative to the original cell:
#   * the string was not an f-string, so "{tuple(anomaly_3months)}" was sent
#     to BigQuery literally; the list is now an array query parameter;
#   * SELECT and WHERE referenced `products`, but the UNNEST alias is `product`;
#   * the product name is selected as well, because the next cell groups the
#     result by a `product` column.
query = """
SELECT
  product.v2ProductName as product,
  CASE 
    WHEN hits.eCommerceAction.action_type = '1' THEN 'Click through of product lists'
    WHEN hits.eCommerceAction.action_type = '2' THEN 'Product detail views'
    WHEN hits.eCommerceAction.action_type = '5' THEN 'Check out'
    WHEN hits.eCommerceAction.action_type = '6' THEN 'Completed purchase'
  END as action,
  product.v2ProductCategory as category,
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*` h,
    UNNEST(h.hits) hits,
    UNNEST(hits.product) product
WHERE 
  (_TABLE_SUFFIX BETWEEN '20170601' AND '20170801')
  AND
    (
      hits.eCommerceAction.action_type != '0'
      AND
      hits.eCommerceAction.action_type != '3'
      AND
      hits.eCommerceAction.action_type != '4'
    )
  AND
    (geoNetwork.country = 'United States')
  AND
  (product.v2ProductName IN UNNEST(@products))
"""

job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter("products", "STRING", list(anomaly_3months))
    ]
)

df_3months_hits = client.query(query, job_config=job_config).to_dataframe()
# info() prints its own report and returns None; print() around it only added
# a stray "None" line.
df_3months_hits.info()
df_3months_hits.head()

In [None]:
# One-hot encode the action of every hit row and swap it in for the original
# column. The frame must carry a `product` column for the grouping below.
df_ohe = pd.get_dummies(df_3months_hits[['action']], dtype=int)
df_3months_hits = pd.concat([df_3months_hits.drop(['action'], axis=1),
                            df_ohe], axis=1)

# modify_columns is defined in an earlier cell of this notebook; the identical
# second definition that used to sit here only shadowed it.
df_3months_hits.columns = modify_columns(np.array(df_3months_hits.columns))

# Count each action per product. numeric_only=True keeps the free-text
# `category` column out of the sum instead of concatenating its strings.
df_3months_hits = df_3months_hits.groupby('product').sum(numeric_only=True).reset_index()

# info() prints its own report and returns None; print() around it only added
# a stray "None" line.
df_3months_hits.info()
df_3months_hits.head()

In [None]:
# Combine the recent-window totals with the recent-window action counts.
# Both inputs must be DataFrames keyed by `product`. The variable name keeps
# its original spelling.
df_3monhts = pd.merge(df_3months_totals, df_3months_hits, how='outer', on='product')
# info() prints its own report and returns None; print() around it only added
# a stray "None" line.
df_3monhts.info()
df_3monhts.head()

In [None]:
# NOTE(review): `example` and `anomaly_6months` are not assigned in any visible
# cell of this notebook, so this cell presumably depends on leftover kernel
# state and would raise NameError after Restart & Run All - confirm, then
# restore the defining cell or drop this one. The following cell rebuilds
# `df_example` from `df` and `anomaly_3months`.
df_example = example.to_dataframe()

# Keep only the rows whose product is in `anomaly_6months`, renumbering the index
df_example = df_example[df_example['product'].isin(anomaly_6months.values)].reset_index(drop=True)
df_example

In [None]:
# Rows of the merged frame that belong to the selected products, renumbered
is_selected = df['product'].isin(anomaly_3months.values)
df_example = df.loc[is_selected].reset_index(drop=True)
df_example

In [None]:
# Names of the twenty selected products with the highest total revenue
top_revenue = df_example.sort_values(by='total_revenue', ascending=False).head(20)
top_revenue['product'].values

In [None]:
# Column dtypes and non-null counts of the selected products
df_example.info()

In [None]:
# # Wait for the job to complete and get the result
# results = example.result()

# # Get the schema of the result
# # schema = results.schema
# schema = [
#     bigquery.SchemaField("product", "STRING"),
#     bigquery.SchemaField("total_hits", "INTEGER"),
#     bigquery.SchemaField("total_visits", "INTEGER"),
#     bigquery.SchemaField("total_page_views", "INTEGER"),
#     bigquery.SchemaField("total_first_visits", "INTEGER"),
#     bigquery.SchemaField("total_time_on_site", "INTEGER"),
#     bigquery.SchemaField("avg_time_on_site", "FLOAT"),
#     bigquery.SchemaField("total_transactions", "INTEGER"),
#     bigquery.SchemaField("total_revenue", "INTEGER"),
#     bigquery.SchemaField("avg_revenue", "FLOAT"),
#     bigquery.SchemaField("medium_none", "INTEGER"),
#     bigquery.SchemaField("medium_not_set", "INTEGER"),
#     bigquery.SchemaField("medium_affiliate", "INTEGER"),
#     bigquery.SchemaField("medium_cpc", "INTEGER"),
#     bigquery.SchemaField("medium_cpm", "INTEGER"),
#     bigquery.SchemaField("medium_organic", "INTEGER"),
#     bigquery.SchemaField("medium_referral", "INTEGER"),
#     bigquery.SchemaField("action_Check_out", "INTEGER"),
#     bigquery.SchemaField("action_Click_through_of_product_lists", "INTEGER"),
#     bigquery.SchemaField("action_Completed_purchase", "INTEGER"),
#     bigquery.SchemaField("action_Product_detail_views", "INTEGER")
# ]

# # Define the new table reference
# project_id = credentials.project_id
# dataset_id = 'dummy'
# table_id = 'anomaly_selection'
# table_ref = client.dataset(dataset_id, project=project_id).table(table_id)

# # Delete the existing destination table if it exists
# try:
#     client.delete_table(table_ref)
#     print(f"Deleted table {table_ref}")
# except Exception as e:
#     print(f"Table {destination_table_ref} does not exist: {e}")
        
# # Create a new table with the schema
# table = bigquery.Table(table_ref, schema=schema)
# table = client.create_table(table)  # API request


# # Load DataFrame to BigQuery table
# job = client.load_table_from_dataframe(df_example, table_ref)

# # Wait for the load job to complete
# job.result()

# print(f"Loaded {job.output_rows} rows into {dataset_id}:{table_id}.")

In [None]:
# NOTE(review): the only visible assignment to `schema` is inside the
# commented-out cell above, so this presumably raises NameError on a fresh
# kernel - confirm, then drop this cell or restore the definition.
len(schema)

In [None]:
# Inspect a single product
df_example.loc[df_example['product'].eq('Google Twill Cap')]

In [None]:
# Row count per e-commerce action over the full date range, for all countries
# and products. COUNT(*) counts unnested (hit, product) rows; the column is
# merely aliased `users`.
query = """
SELECT
  CASE 
    WHEN hit.eCommerceAction.action_type = '1' THEN 'Click through of product lists'
    WHEN hit.eCommerceAction.action_type = '2' THEN 'Product detail views'
    WHEN hit.eCommerceAction.action_type = '5' THEN 'Check out'
    WHEN hit.eCommerceAction.action_type = '6' THEN 'Completed purchase'
  END action,
  COUNT(*) users
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*` h,
  UNNEST(h.hits) hit,
  UNNEST(hit.product) product
WHERE 
  _TABLE_SUFFIX BETWEEN '20160801' AND '20170801'
  AND
    (
      hit.eCommerceAction.action_type != '0'
      AND
      hit.eCommerceAction.action_type != '3'
      AND
      hit.eCommerceAction.action_type != '4'
    )
    GROUP BY action
    ORDER BY users DESC
"""
funnel_job = client.query(query)
funnel_job.to_dataframe()

In [None]:
# Row count per e-commerce action for the 'Sport Bag' product in United States
# sessions. COUNT(*) counts unnested (hit, product) rows; the column is merely
# aliased `users`.
query = """
SELECT
  CASE 
    WHEN hits.eCommerceAction.action_type = '1' THEN 'Click through of product lists'
    WHEN hits.eCommerceAction.action_type = '2' THEN 'Product detail views'
    WHEN hits.eCommerceAction.action_type = '5' THEN 'Check out'
    WHEN hits.eCommerceAction.action_type = '6' THEN 'Completed purchase'
  END as action,
  COUNT(*) users
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*` h,
    UNNEST(h.hits) hits,
    UNNEST(hits.product) product
WHERE 
  (_TABLE_SUFFIX BETWEEN '20160801' AND '20170801')
  AND
    (
      hits.eCommerceAction.action_type != '0'
      AND
      hits.eCommerceAction.action_type != '3'
      AND
      hits.eCommerceAction.action_type != '4'
    )
  AND
    (geoNetwork.country = 'United States')
  AND
    (product.v2ProductName = 'Sport Bag')
    GROUP BY action
"""

sport_bag_job = client.query(query)
sport_bag_job.to_dataframe()