In [25]:
from google.cloud import bigquery
import pandas as pd

import numpy as np

In [151]:
#get data for a given month as dataframe
def get_month_data(year: int, month: int, select_cols: str):
    """Query one calendar month of the GA sample dataset into a DataFrame.

    The query reads the daily ``ga_sessions_*`` tables whose suffix falls
    inside the requested month, unnests ``hits`` (one row per hit), keeps
    only hits whose ``eventInfo.eventAction`` is not NULL and orders the
    rows by ``date``.

    Parameters
    ----------
    year : int
        Calendar year (1-9999).
    month : int
        Calendar month, 1-12.
    select_cols : str
        SQL select list inserted verbatim after ``SELECT``. It is NOT
        escaped, so only pass trusted, hand-written SQL.

    Returns
    -------
    pandas.DataFrame
        Result of ``query_job.result().to_dataframe()``.

    Raises
    ------
    ValueError
        If ``year`` or ``month`` is not an integer in the valid range.
    """
    # Local imports: only this function needs them.
    import calendar
    import operator

    # Validate before building SQL or opening a client, so a bad argument
    # fails fast with a clear message instead of an unbound local.
    try:
        year = operator.index(year)
        month = operator.index(month)
    except TypeError as exc:
        raise ValueError(
            f"year and month must be integers, got {year!r} and {month!r}"
        ) from exc
    if not 1 <= year <= 9999:
        raise ValueError(f"year must be between 1 and 9999, got {year}")
    if not 1 <= month <= 12:
        raise ValueError(f"month must be between 1 and 12, got {month}")

    # monthrange returns (weekday_of_first_day, days_in_month); it accounts
    # for leap years, so February 29 is no longer silently excluded.
    last_day = calendar.monthrange(year, month)[1]

    # Table suffixes are YYYYMMDD strings; build them zero-padded in Python.
    first_suffix = f"{year:04d}{month:02d}01"
    last_suffix = f"{year:04d}{month:02d}{last_day:02d}"

    #write select statement given columns
    statement = f"""SELECT {select_cols}
        FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`, UNNEST(hits) as hits WHERE
        _TABLE_SUFFIX BETWEEN '{first_suffix}' AND '{last_suffix}'
        AND eventInfo.eventAction IS NOT NULL
        ORDER BY date ASC;"""

    #initialize bigquery connection
    client = bigquery.Client()
    job_config = bigquery.QueryJobConfig()
    query_job = client.query(statement, job_config=job_config)
    return query_job.result().to_dataframe()



In [182]:
#flatten JSON records
def flatten(df, *cols):
    """Expand dict-valued (record) columns into one column per field.

    Each column named in ``cols`` is passed through ``pd.json_normalize``;
    the resulting columns are appended on the right and the original record
    column is removed. Nested dicts become dotted column names (the
    ``json_normalize`` default separator). The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the record columns.
    *cols : str
        Names of the columns to expand, processed in the order given.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``df``.
    """
    for col in cols:
        # A missing record (None) becomes an empty dict so it yields a row
        # of NaNs instead of crashing json_normalize.
        records = [{} if rec is None else rec for rec in df[col]]
        expanded = pd.json_normalize(records)
        # json_normalize numbers its rows 0..n-1. Re-attach the caller's
        # index so the column-wise concat pairs rows by position even when
        # df does not have a default RangeIndex (e.g. after filtering).
        expanded.index = df.index
        # Drop the record column first so a nested field that shares its
        # name with the parent column is not removed along with it.
        df = pd.concat([df.drop(columns=col), expanded], axis=1)
    return df

In [181]:
#drop columns that are more than 50% null or only have 1 unique value
def drop_columns(df):
    """Remove columns that carry little or no information.

    A cell counts as null when it is None/NaN or one of the placeholder
    strings '(not set)', '(not provided)', '(none)' or
    '(not available in demo dataset)'. A column is dropped when:

    * every cell is null, or
    * fewer than 50% of its cells are non-null, or
    * it has at most one distinct value after conversion to ``str``.

    Columns are processed by position, so duplicate column names are
    handled, and cell values are never hashed, so columns holding lists or
    dicts do not raise. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the retained columns, in their
        original order, with the same index as ``df``.
    """
    placeholders = ['(not set)', '(not provided)', '(none)', '(not available in demo dataset)']
    n_rows = len(df.index)
    keep_positions = []
    for pos in range(df.shape[1]):
        column = df.iloc[:, pos]
        # The string view serves both the placeholder check and the
        # distinct-value check, and works for unhashable cells.
        as_text = column.astype(str)
        is_null = column.isna().to_numpy() | as_text.isin(placeholders).to_numpy()
        #remove all-null columns (also covers a frame with zero rows)
        if is_null.all():
            continue
        #remove columns where >50% of values are null
        if (~is_null).sum() / n_rows < 0.5:
            continue
        #remove all cols that only have 1 unique value
        if len(as_text.unique()) <= 1:
            continue
        keep_positions.append(pos)
    # Select once at the end instead of copying the frame on every drop.
    return df.iloc[:, keep_positions]

In [184]:
#preprocessing pipeline
def preprocess(year, month, select_cols, *flatten_cols, output_path=None):
    """Fetch one month of data, flatten it, prune columns and save as CSV.

    Runs ``get_month_data`` -> ``flatten`` -> ``drop_columns`` and writes
    the result without the index. An existing file is never overwritten;
    in that case 'file already exists' is printed and nothing is written.

    Parameters
    ----------
    year, month : int
        Month to fetch; forwarded to ``get_month_data``.
    select_cols : str
        SQL select list; forwarded to ``get_month_data``.
    *flatten_cols : str
        Record columns to expand; forwarded to ``flatten``.
    output_path : str, optional
        Destination CSV. Defaults to ``monthly_data_{month}.csv``. The
        default name does not include the year, so pass an explicit path
        when processing the same month of different years.

    Returns
    -------
    None
    """
    import os  # local import: only needed for the existence check

    if output_path is None:
        output_path = f'monthly_data_{month}.csv'

    # Check before querying so an existing file does not cost a BigQuery
    # job whose result would be thrown away.
    if os.path.exists(output_path):
        print('file already exists')
        return

    month_df = get_month_data(year, month, select_cols)
    flatten_df = flatten(month_df, *flatten_cols)
    dropped_df = drop_columns(flatten_df)
    try:
        # mode='x' is exclusive creation: it still guards against a file
        # that appears between the check above and this write.
        dropped_df.to_csv(output_path, mode='x', index=False)
    except FileExistsError:
        # open(..., 'x') raises FileExistsError, not ValueError.
        print('file already exists')

In [None]:
# Select list shared by every monthly run: the raw session/hit records plus
# the event action (raw_target) and a 0/1 label that is 1 only for
# 'Add to Cart' events (target_encoded).
select_cols = """date, visitId, totals, trafficSource, device, geoNetwork, hits, eventInfo.eventAction AS raw_target, 
        CASE
        WHEN eventInfo.eventAction = 'Add to Cart' THEN 1
        ELSE 0
        END AS target_encoded"""

# Record columns that preprocess expands into one column per field.
struct_cols = ('hits', 'trafficSource', 'device', 'geoNetwork', 'totals')

# October 2016 run; output goes to monthly_data_10.csv.
preprocess(2016, 10, select_cols, *struct_cols)

In [185]:
# March 2017 run; reuses select_cols from the cell above and writes
# monthly_data_3.csv.
preprocess(
    2017,
    3,
    select_cols,
    'hits', 'trafficSource', 'device', 'geoNetwork', 'totals',
)