## Part 1: County Data (Public Dataset)

In [2]:
from google.cloud import bigquery
# Shared BigQuery client: the cells below issue their queries and load jobs
# through `bq`. No project or credentials are passed explicitly here.
bq = bigquery.Client()

In [3]:
# Sanity check: count the rows in the public counties table.
row_count_sql = """
select count(*) as num_rows 
from bigquery-public-data.geo_us_boundaries.counties
"""
q = bq.query(row_count_sql)
q.to_dataframe()

Unnamed: 0,num_rows
0,3233


In [4]:
#q1

# Look up the geo_id of the county whose name is 'Dane'.
dane_geo_id_sql = """
SELECT geo_id 
FROM bigquery-public-data.geo_us_boundaries.counties
WHERE county_name = 'Dane'
"""
q1 = bq.query(dane_geo_id_sql)
df = q1.to_dataframe()

# The answer is the geo_id of the first returned row.
geo_ids = df['geo_id'].values
result = geo_ids[0]
result

'55025'

In [5]:
#q2

# Five state FIPS codes with the most counties, mapped to their county counts.
q2 = bq.query(
"""
SELECT state_fips_code, COUNT(*) AS num_counties
FROM bigquery-public-data.geo_us_boundaries.counties
GROUP BY state_fips_code
ORDER BY num_counties DESC
LIMIT 5
""")
df = q2.to_dataframe()

# Deliberately not named `dict`: binding that name at notebook level would
# shadow the built-in `dict` type for every cell executed afterwards.
counties_per_state = df.set_index('state_fips_code')['num_counties'].to_dict()
counties_per_state

{'48': 254, '13': 159, '51': 133, '21': 120, '29': 115}

In [6]:
#q3

# Rate applied to billed bytes below. Presumably the on-demand price in USD
# per TiB -- NOTE(review): confirm against the current BigQuery price list.
USD_PER_TIB = 6.25
BYTES_PER_TIB = 1024**4


def run_uncached(sql):
    """Run `sql` with the query cache disabled and wait for it to finish.

    Returns the completed query job, so that statistics such as
    `total_bytes_billed` are populated when the caller reads them.
    """
    job = bq.query(sql, job_config=bigquery.QueryJobConfig(use_query_cache=False))
    # `bq.query` only starts the job; block here so the billing statistics
    # are final (they can still be None while the job is running).
    job.result()
    return job


def cost_usd(job):
    """Convert a finished job's billed bytes into a cost at USD_PER_TIB."""
    return job.total_bytes_billed / BYTES_PER_TIB * USD_PER_TIB


# Re-run the q1 and q2 queries without the cache so they are actually billed.
q1 = run_uncached(
"""
SELECT geo_id 
FROM bigquery-public-data.geo_us_boundaries.counties
WHERE county_name = 'Dane'
""")

q2 = run_uncached(
"""
SELECT state_fips_code, COUNT(*) AS num_counties
FROM bigquery-public-data.geo_us_boundaries.counties
GROUP BY state_fips_code
ORDER BY num_counties DESC
LIMIT 5
""")

cost1 = cost_usd(q1)
cost2 = cost_usd(q2)

cost_dict = {'q1': cost1, 'q2': cost2}
cost_dict

{'q1': 5.9604644775390625e-05, 'q2': 5.9604644775390625e-05}

## Part 2: HMDA Data (Parquet in GCS)

In [18]:
# Dataset object identified by the "project.dataset" string: project
# `comp-sci-544-398616`, dataset `p8` (see the repr displayed below).
ds = bigquery.Dataset("comp-sci-544-398616.p8")
ds

Dataset(DatasetReference('comp-sci-544-398616', 'p8'))

In [19]:
# Create the dataset described by `ds`. exists_ok=True makes the call tolerate
# a dataset that already exists, so this cell can be re-run.
bq.create_dataset(ds, exists_ok=True)

Dataset(DatasetReference('comp-sci-544-398616', 'p8'))

In [20]:
# Load the parquet file from GCS into table `table1`, replacing any
# existing contents (WRITE_TRUNCATE).
source = "gs://cs544_perrito_p8/hdma-wi-2021.parquet"
config = bigquery.LoadJobConfig(
    source_format="PARQUET",
    write_disposition="WRITE_TRUNCATE",
)
hdma = bq.load_table_from_uri(
    source,
    "comp-sci-544-398616.p8.table1",
    job_config=config,
)
# Block until the load job finishes; the finished job is the cell's output.
hdma.result()

LoadJob<project=comp-sci-544-398616, location=US, id=33d06f8d-32c9-4967-8ac7-d9366867875d>

In [21]:
#q4

# Print the ID of every dataset in the project. The loop variable is not
# called `ds` because that name already holds the Dataset object passed to
# `create_dataset`; reusing it would overwrite that object and break a re-run
# of the dataset-creation cell.
for dataset_item in bq.list_datasets("comp-sci-544-398616"):
    print(dataset_item.dataset_id)

p8


In [180]:
#q5

# Materialize the rows of the public counties table with state_fips_code '55'
# as table `p8.wi`.
wi_job = bq.query("""
CREATE OR REPLACE TABLE `comp-sci-544-398616.p8.wi`
AS
SELECT *
FROM bigquery-public-data.geo_us_boundaries.counties
WHERE state_fips_code = '55'
""")
# `bq.query` only starts the job; wait for it so the join below never runs
# against a missing or stale `wi` table.
wi_job.result()

# Count rows of the loaded table per county and keep the ten largest.
# The count is aliased `row_count` (it is a COUNT(*), not a loan term).
q3 = bq.query(
    """
    SELECT hdma.county_code, wi.county_name, COUNT(*) AS row_count
    FROM `comp-sci-544-398616.p8.table1` AS hdma
    JOIN `comp-sci-544-398616.p8.wi` AS wi
    ON hdma.county_code = wi.county_fips_code
    GROUP BY hdma.county_code, wi.county_name
    ORDER BY row_count DESC
    LIMIT 10
    """
)

df2 = q3.to_dataframe()
# Map county name -> row count, in the descending order returned by the query.
dict2 = df2.set_index('county_name')['row_count'].to_dict()
dict2

{'Milwaukee': 46570,
 'Dane': 38557,
 'Waukesha': 34159,
 'Brown': 15615,
 'Racine': 13007,
 'Outagamie': 11523,
 'Kenosha': 10744,
 'Washington': 10726,
 'Rock': 9834,
 'Winnebago': 9310}