In [25]:
from google.cloud import bigquery
import pandas as pd

import numpy as np

In [187]:
#get data for a given month as dataframe
def get_month_data(year: int, month: int, select_cols: str): 
    #get correct number of days based on month
    match month:
        case 2: 
            max_days = 28
        case 1 | 3 | 5 | 7 | 8 | 10 | 12:
            max_days = 31
        case 4 | 6 | 9 | 11:
            max_days = 30
    #write select statement given columns 
    statement = f"""SELECT {select_cols}
        FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`, 
        UNNEST(hits) as hits LEFT JOIN UNNEST(product) as product LEFT JOIN UNNEST(promotion) AS promotion 
        WHERE _TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d', '{year}-{month}-01') AND FORMAT_DATE('%Y%m%d', '{year}-{month}-{max_days}')
        AND eventInfo.eventAction IS NOT NULL
        ORDER BY date ASC;""" 
        #initialize bigquery connection
    client = bigquery.Client()
    job_config = bigquery.QueryJobConfig()
    query_job = client.query(statement, job_config=job_config)
    results = query_job.result().to_dataframe()
    return(results)



In [215]:
#flatten JSON records
def flatten(df, *cols):
    for col in cols:
        temp = pd.json_normalize(df[col])
        print(col, len(temp.columns))
        df = pd.concat([df, temp], axis = 1)
        df = df.drop(col, axis = 1)
    return df 

In [232]:
#FIX ???
#drop columns that are than 50% null or only have 1 unique value
def drop_columns(df):
    nulls = [None, '(not set)', '(not provided)', '(none)', '(not available in demo dataset)', np.nan, np.array([])] 
    #remove duplicates
    duplicated = df.columns[df.columns.duplicated()]
    for col in duplicated:
        df = df.drop[col][0]
    for col in (df.columns):
        print(col)
        #remove all nan only columns
        if pd.isna(df[col]).all() or df[col].isin(nulls).all():
            df = df.drop(col, axis =1)
        #remove columns where >50% of values are null: 
        elif (df[col].isin(nulls).value_counts()[False]/len(df.index) < 0.5):
            df = df.drop(col, axis = 1)
        #remove all cols that only have 1 unique value 
        elif (len(df[col].astype(str).unique()) <= 1):
            df = df.drop(col, axis =1)
    return df

In [184]:
#preprocessing pipeline
def preprocess(year, month, select_cols, *flatten_cols):
    month_df = get_month_data(year, month, select_cols)
    flatten_df = flatten(month_df, *flatten_cols)
    dropped_df = drop_columns(flatten_df)
    try:
        dropped_df.to_csv(f'monthly_data_{month}.csv', mode = 'x', index= False)
    except ValueError:
        print('file already exists')

In [190]:
select_cols = """date, visitId, totals, trafficSource, device, geoNetwork, product, promotion hits eventInfo.eventAction AS raw_target, 
        CASE
        WHEN eventInfo.eventAction = 'Add to Cart' THEN 1
        ELSE 0
        END AS target_encoded"""
preprocess(2016, 10, select_cols, 'product', 'promotion', 'hits', 'trafficSource', 'device', 'geoNetwork','totals')

date
visitId
product


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [185]:
preprocess(2017,3, select_cols, 'product', 'promotion', 'hits', 'trafficSource', 'device', 'geoNetwork','totals')

In [208]:
month_df = get_month_data(2016,10, select_cols)

In [218]:
flatten_df = flatten(month_df, 'product', 'promotion', 'hits', 'trafficSource', 'device', 'geoNetwork','totals')

product 19
promotion 4
hits 101
trafficSource 21
device 17
geoNetwork 11
totals 13


In [236]:
flatten_df

Unnamed: 0,date,visitId,raw_target,target_encoded,productSKU,v2ProductName,v2ProductCategory,productVariant,productBrand,productRevenue,...,timeOnSite,bounces,transactions,transactionRevenue,newVisits,screenviews,uniqueScreenviews,timeOnScreen,totalTransactionRevenue,sessionQualityDim
0,20161001,1475337407,Promotion Click,0,,,,,,,...,,,,,1.0,,,,,
1,20161001,1475368375,Promotion Click,0,,,,,,,...,38.0,,,,1.0,,,,,
2,20161001,1475380960,Promotion Click,0,,,,,,,...,3.0,,,,1.0,,,,,
3,20161001,1475341828,Promotion Click,0,,,,,,,...,6.0,,,,1.0,,,,,
4,20161001,1475352999,Promotion Click,0,,,,,,,...,10.0,,,,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66509,20161031,1477969807,Product Click,0,GGOEGBRA037499,Waterproof Backpack,Home/Bags/,(not set),(not set),,...,760.0,,,,1.0,,,,,
66510,20161031,1477963371,Quickview Click,0,GGOEGAAX0037,Google Sunglasses,Home/Accessories/Fun/,(not set),(not set),,...,170.0,,,,1.0,,,,,
66511,20161031,1477963371,Quickview Click,0,GGOEGCGB022199,Fashion Sunglasses & Pouch,Home/Accessories/Fun/,(not set),(not set),,...,170.0,,,,1.0,,,,,
66512,20161031,1477937149,Quickview Click,0,GGOEGAAX0360,Google Women's Fleece Hoodie,Home/Apparel/Women's/,(not set),(not set),,...,232.0,,1.0,44790000.0,,,,,52790000.0,


In [235]:
flatten_df.drop('customDimensions', axis = 1)

Unnamed: 0,date,visitId,raw_target,target_encoded,productSKU,v2ProductName,v2ProductCategory,productVariant,productBrand,productRevenue,...,timeOnSite,bounces,transactions,transactionRevenue,newVisits,screenviews,uniqueScreenviews,timeOnScreen,totalTransactionRevenue,sessionQualityDim
0,20161001,1475337407,Promotion Click,0,,,,,,,...,,,,,1.0,,,,,
1,20161001,1475368375,Promotion Click,0,,,,,,,...,38.0,,,,1.0,,,,,
2,20161001,1475380960,Promotion Click,0,,,,,,,...,3.0,,,,1.0,,,,,
3,20161001,1475341828,Promotion Click,0,,,,,,,...,6.0,,,,1.0,,,,,
4,20161001,1475352999,Promotion Click,0,,,,,,,...,10.0,,,,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66509,20161031,1477969807,Product Click,0,GGOEGBRA037499,Waterproof Backpack,Home/Bags/,(not set),(not set),,...,760.0,,,,1.0,,,,,
66510,20161031,1477963371,Quickview Click,0,GGOEGAAX0037,Google Sunglasses,Home/Accessories/Fun/,(not set),(not set),,...,170.0,,,,1.0,,,,,
66511,20161031,1477963371,Quickview Click,0,GGOEGCGB022199,Fashion Sunglasses & Pouch,Home/Accessories/Fun/,(not set),(not set),,...,170.0,,,,1.0,,,,,
66512,20161031,1477937149,Quickview Click,0,GGOEGAAX0360,Google Women's Fleece Hoodie,Home/Apparel/Women's/,(not set),(not set),,...,232.0,,1.0,44790000.0,,,,,52790000.0,


In [231]:
drop_df = drop_columns(flatten_df)

date
visitId
raw_target
target_encoded


  return f(comps_array, values)


productSKU
v2ProductName
v2ProductCategory
productVariant


  return f(comps_array, values)
  return f(comps_array, values)
  return f(comps_array, values)


productBrand
productRevenue
localProductRevenue
productPrice
localProductPrice
productQuantity
productRefundAmount
localProductRefundAmount
isImpression
isClick
customDimensions


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().