## BigQuery Loading

This notebook loads the Customer_Transactions data from a local file into a BigQuery project and then queries it back from BigQuery to produce a few summary tables.

- [Import modules](#1)
- [Load data from local to BigQuery](#2)
- [Load data from BigQuery](#3)

### <a id="1"></a>Import modules

In [1]:
# !pip install --upgrade google-cloud-bigquery
from google.cloud import bigquery

import os

import pandas as pd

In [2]:
# Tell the Google client libraries which credentials file to use.
# NOTE(review): "test.json" is a relative path, so it is presumably resolved
# against the notebook's working directory — confirm the key file lives there.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "test.json"

In [3]:
# Create the BigQuery client that the later cells use for queries and loads.
# No project or credentials are passed here, so they come from the environment.
client = bigquery.Client()
client  # display the object to confirm the client was constructed

<google.cloud.bigquery.client.Client at 0x252d68e5b10>

### _Testing Connection_

In [4]:
# Connection smoke test: request five rows from a public dataset before
# touching any project data.
citibike_query = "SELECT * FROM bigquery-public-data.new_york_citibike.citibike_trips LIMIT 5"
citibike_query_job = client.query(citibike_query)
citibike_query_job  # display the submitted query job

QueryJob<project=test7151991, location=US, id=5b591ef2-4b69-456e-ae8a-62f03c9ec574>

In [5]:
# Print every returned row followed by one blank line so rows are easy to
# tell apart in the cell output.
for trip_row in citibike_query_job:
    print(trip_row, end="\n\n")

Row((None, None, None, None, '', None, None, None, '', None, None, None, '', None, '', ''), {'tripduration': 0, 'starttime': 1, 'stoptime': 2, 'start_station_id': 3, 'start_station_name': 4, 'start_station_latitude': 5, 'start_station_longitude': 6, 'end_station_id': 7, 'end_station_name': 8, 'end_station_latitude': 9, 'end_station_longitude': 10, 'bikeid': 11, 'usertype': 12, 'birth_year': 13, 'gender': 14, 'customer_plan': 15})

Row((None, None, None, None, '', None, None, None, '', None, None, None, '', None, '', ''), {'tripduration': 0, 'starttime': 1, 'stoptime': 2, 'start_station_id': 3, 'start_station_name': 4, 'start_station_latitude': 5, 'start_station_longitude': 6, 'end_station_id': 7, 'end_station_name': 8, 'end_station_latitude': 9, 'end_station_longitude': 10, 'bikeid': 11, 'usertype': 12, 'birth_year': 13, 'gender': 14, 'customer_plan': 15})

Row((None, None, None, None, '', None, None, None, '', None, None, None, '', None, '', ''), {'tripduration': 0, 'starttime': 1, 's

### <a id="2"></a>Load data from local to BigQuery

In [6]:
# Directory for the project
mainDir = "folderPath"
# Build the data directory with os.path.join so a path separator is inserted
# whether or not mainDir already ends with one. The trailing "" keeps a final
# separator, because later cells append file names to loadedDataPath by plain
# string concatenation.
loadedDataPath = os.path.join(mainDir, "Loaded_Data", "")

In [7]:
# List of files in loadedDataPath, displayed to confirm the expected input
# files are present before reading them.
os.listdir(loadedDataPath)

['Customers.csv',
 'Customer_Transactions.xlsx',
 'payload_files_directory.csv',
 'Transactions.csv']

In [8]:
# Read the joined Customer Transactions workbook into a pandas DataFrame and
# display its (rows, columns) shape as a quick sanity check.
customerTransactionsFile = loadedDataPath + "Customer_Transactions.xlsx"
Customer_Transactions = pd.read_excel(customerTransactionsFile)
Customer_Transactions.shape

(536885, 15)

In [13]:
# Reference to the destination table: dataset "Customer_Transactions", table
# "Customer_Transactions", in the client's project.
# Client.dataset() is deprecated in google-cloud-bigquery, so the dataset
# reference is built with the DatasetReference constructor instead; the
# resulting TableReference is the same.
datasetRef = bigquery.DatasetReference(client.project, "Customer_Transactions")
tableRef = datasetRef.table("Customer_Transactions")
tableRef

TableReference(DatasetReference('test7151991', 'Customer_Transactions'), 'Customer_Transactions')

In [14]:
# Upload the Customer_Transactions DataFrame to the table named by tableRef.
bigqueryJob = client.load_table_from_dataframe(Customer_Transactions, tableRef)
# Wait on the load job; the notebook then displays the returned job object.
bigqueryJob.result()

LoadJob<project=test7151991, location=US, id=98e3fb74-42be-41d5-9ed7-b2478ef0912e>

### <a id="3"></a>Load data from BigQuery

In [28]:
# !pip install bigframes
import bigframes.pandas as bpd

In [54]:
# Query for the number of rows and the average price per Department, Category, SKU.
# The aliases SKU_Cnt and Price_Avg are the column names the later cell sorts on
# and displays; the count column previously had no alias at all ("AS ,"), which
# is a SQL syntax error, and the average was named Avg_Price instead of Price_Avg.
CT_priceAvg_query = """
                    SELECT Department, Category, SKU, COUNT(SKU) AS SKU_Cnt, AVG(Price) AS Price_Avg
                    FROM test7151991.Customer_Transactions.Customer_Transactions
                    GROUP BY Department, Category, SKU
                    """

In [55]:
# Load data from BigQuery: run the per-SKU aggregate query through
# BigQuery DataFrames (bpd) and keep the result as CT_priceAvg.
CT_priceAvg = bpd.read_gbq(CT_priceAvg_query)
CT_priceAvg.shape  # (rows, columns) of the query result

(39, 5)

In [56]:
# Sort by average price (highest first) and materialise the result as a local
# pandas DataFrame.
# to_pandas() keeps the column labels, so they no longer have to be copied back
# by hand as was needed after wrapping the BigQuery DataFrame in pd.DataFrame().
# reset_index(drop=True) numbers the rows 0..n-1 in the sorted order.
CT_priceAvg_df = (
    CT_priceAvg.sort_values("Price_Avg", ascending=False)
    .to_pandas()
    .reset_index(drop=True)
)
CT_priceAvg_df

Unnamed: 0,Department,Category,SKU,SKU_Cnt,Price_Avg
0,Men,Bottoms,Suit Trousers,5128,55.77
1,Men,Bottoms,Cargo Pants,9276,50.77
2,Men,Bottoms,Dress Pants,9305,45.77
3,Women,Bottoms,Palazzo Pants,9398,41.77
4,Men,Shirts,Henley Shirt,8387,41.77
5,Women,Tops,Hoodie,18960,40.77
6,Men,Bottoms,Chinos,27194,39.77
7,Men,Shirts,Button-Down Shirt,15324,38.77
8,Men,Bottoms,Corduroy Pants,9507,38.77
9,Women,Bottoms,Wide-Leg Pants,22151,37.77


In [31]:
# Query to count discounted transactions, i.e. rows whose Discount column
# is not NULL; the single result column is named Transactions_w_Discounts.
CT_discountTransactions_query = """
                                SELECT COUNT(Transaction_ID) AS Transactions_w_Discounts
                                FROM test7151991.Customer_Transactions.Customer_Transactions
                                WHERE Discount IS NOT NULL;
                                """

In [32]:
# Load data from BigQuery: run the discounted-transactions count query
# through BigQuery DataFrames (bpd).
CT_discountTransactions = bpd.read_gbq(CT_discountTransactions_query)
CT_discountTransactions  # display the one-row result

Unnamed: 0,Transactions_w_Discounts
0,22705


In [59]:
# Query to get total transactions and total prices for each year.
# The year is extracted from the Date column, transactions are counted by
# Transaction_ID, and the summed Price is rounded to 0 decimal places.
CT_yearlyTransactionsPrices_query = """
                              SELECT EXTRACT(YEAR FROM DATE(Date)) AS Year, 
                              COUNT(Transaction_ID) AS Total_Transactions,
                              ROUND(SUM(Price),0) AS Total_Prices
                              FROM test7151991.Customer_Transactions.Customer_Transactions
                              GROUP BY Year;
                              """

In [63]:
# Load data from BigQuery: run the yearly totals query through
# BigQuery DataFrames (bpd).
CT_yearlyTransactionsPrices = bpd.read_gbq(CT_yearlyTransactionsPrices_query)
CT_yearlyTransactionsPrices.shape  # (rows, columns) of the query result

(5, 3)

In [62]:
# Order the yearly totals chronologically and materialise them as a local
# pandas DataFrame.
# to_pandas() keeps the column labels, so they no longer have to be copied back
# by hand as was needed after wrapping the BigQuery DataFrame in pd.DataFrame().
# reset_index(drop=True) numbers the rows 0..n-1 in the sorted order.
CT_yearlyTransactionsPrices_df = (
    CT_yearlyTransactionsPrices.sort_values("Year", ascending=True)
    .to_pandas()
    .reset_index(drop=True)
)
CT_yearlyTransactionsPrices_df

Unnamed: 0,Year,Total_Transactions,Total_Prices
0,2022,299034,9573574.0
1,2023,237400,7512252.0
2,2024,222,7102.0
3,2025,210,6423.0
4,2026,19,549.0


## BigQuery Loading Completed.