# Data Update Test Notebook

This notebook was written for initial testing of loading updated Zillow and ADI data into a pre-existing table. After testing, this functionality will be deployed as a Python script.

In [1]:
import pandas as pd
from google.cloud import bigquery
import pandas_gbq

from data_download_utilities import csv_download, bq_download
from initial_data_engineering_setup import pull_and_merge_data

In [11]:
# State FIPS code (as a string) passed to pull_and_merge_data when loading each
# missing year; '48' is the StateCodeFIPS shown for TX in the Zillow data.
state_fips_code = '48'

In [3]:
# Zillow research CSV of county-level ZHVI values (per the file name in the URL).
# The literal is split across two lines for readability; the value is unchanged.
url = (
    'https://files.zillowstatic.com/research/public_csvs/zhvi/'
    'County_zhvi_uc_sfrcondo_tier_0.0_0.33_sm_sa_month.csv?t=1719616688'
)

# csv_download is a project helper from data_download_utilities; the result is
# displayed as a table with one column per month-end date.
zillow_df = csv_download(url)
zillow_df

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,Metro,StateCodeFIPS,MunicipalCodeFIPS,2000-01-31,...,2023-08-31,2023-09-30,2023-10-31,2023-11-30,2023-12-31,2024-01-31,2024-02-29,2024-03-31,2024-04-30,2024-05-31
0,3101,0,Los Angeles County,county,CA,CA,"Los Angeles-Long Beach-Anaheim, CA",6,37,124912.217391,...,622968.669520,632323.728567,640283.515150,646900.415598,651704.845595,652102.707975,649008.305552,645802.140203,645443.686791,647091.578083
1,139,1,Cook County,county,IL,IL,"Chicago-Naperville-Elgin, IL-IN-WI",17,31,74703.400245,...,179697.073275,181258.034792,182519.288526,183278.357444,183707.225098,183807.935922,184617.333267,186062.801111,188338.280310,190061.319519
2,1090,2,Harris County,county,TX,TX,"Houston-The Woodlands-Sugar Land, TX",48,201,62792.666916,...,201893.354160,202124.843436,202252.005472,202199.851711,202278.715599,202712.332624,203201.534777,204045.570081,204951.684008,205630.885842
3,2402,3,Maricopa County,county,AZ,AZ,"Phoenix-Mesa-Chandler, AZ",4,13,81945.145075,...,348075.444474,350421.502771,352503.270506,354143.374449,355173.368792,355679.347419,356144.405109,357065.041385,358225.879499,359143.128779
4,2841,4,San Diego County,county,CA,CA,"San Diego-Chula Vista-Carlsbad, CA",6,73,130444.002644,...,645270.438699,653422.726440,660237.686939,665258.088915,668322.625249,670558.341135,673675.731617,678699.813944,684599.865145,689612.499144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3000,240,3204,Loup County,county,NE,NE,,31,115,,...,,,,104414.807407,104251.992923,104919.333425,106128.265543,107575.759483,108694.770944,109157.567101
3001,846,3206,Banner County,county,NE,NE,"Scottsbluff, NE",31,7,,...,188365.309979,189329.978255,190464.865275,190741.055744,190697.949048,189965.823131,189547.901080,189877.165166,190931.660439,191769.109187
3002,1648,3207,Daggett County,county,UT,UT,,49,9,,...,,,220912.893659,220538.303035,220249.263259,220171.703311,221442.220032,222949.498827,224425.807479,224542.652354
3003,1432,3208,Thomas County,county,NE,NE,,31,171,,...,,,,64374.757186,63986.350912,64025.250888,64121.239607,64753.424721,65537.876908,66509.107848


In [4]:
# Distinct years found in the Zillow column names, in first-seen order.
# A column counts as a date column when its first four characters are digits
# (e.g. '2000-01-31'); dict.fromkeys drops repeats while keeping order.
year_list = list(
    dict.fromkeys(
        int(col[:4]) for col in zillow_df.columns if col[:4].isdigit()
    )
)
year_list

[2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019,
 2020,
 2021,
 2022,
 2023,
 2024]

In [5]:
# Ask BigQuery which years are present in the public ADI-by-county table.
adi_query = 'SELECT DISTINCT year FROM `bigquery-public-data.broadstreet_adi.area_deprivation_index_by_county`'

# bq_download is a project helper from data_download_utilities; the result is
# indexed by column name below, so it is treated as a DataFrame-like object.
adi_check_df = bq_download(adi_query)

# Only the last expression of a cell is rendered, so the previous stray
# `adi_check_df` line in the middle of the cell had no effect and was removed.
adi_list = adi_check_df['year'].to_list()
adi_list

[2018, 2019, 2020]

In [6]:
# Years for which both the Zillow columns and the ADI table have data.
# The result comes from a set, so its order is not guaranteed.
zillow_year_set = set(year_list)
adi_zillow_match_year_list = [*zillow_year_set.intersection(adi_list)]
adi_zillow_match_year_list

[2018, 2019, 2020]

In [7]:
# Destination BigQuery table that the merged Zillow/ADI data is appended to.
project = 'even-gearbox-427900-t9'
dataset = 'avi_housing'
table_name = 'merged_data'

# Find out which years have already been loaded into the destination table.
current_data_query = f'SELECT DISTINCT year FROM `{project}.{dataset}.{table_name}`'

current_data_df = bq_download(current_data_query)

# The stray mid-cell `current_data_df` expression was never rendered (only a
# cell's last expression is displayed), so it is dropped; the year list is now
# shown instead, matching how the other query cells end.
current_yrs_list = current_data_df['year'].to_list()
current_yrs_list


In [8]:
# Years available in both sources that are not yet in the destination table.
# A set difference has no guaranteed order, and this list drives the order of
# the append writes below, so sort it to make loads chronological and
# reproducible from run to run.
missing_year_list = sorted(set(adi_zillow_match_year_list) - set(current_yrs_list))
missing_year_list

[2018, 2019]

In [14]:
# The destination table id is the same for every year, so build it once.
table_id_combined = f'{dataset}.{table_name}'

# Pull, merge and append one year at a time for every year still missing.
for missing_year in missing_year_list:
    # pull_and_merge_data is given the year as a string.
    dl_year = str(missing_year)
    merged_df = pull_and_merge_data(dl_year, state_fips_code)

    # Append to the existing table; county_geom is declared as GEOGRAPHY.
    pandas_gbq.to_gbq(
        merged_df,
        table_id_combined,
        project_id=project,
        table_schema=[{'name': 'county_geom', 'type': 'GEOGRAPHY'}],
        if_exists='append',
    )

100%|██████████| 1/1 [00:00<00:00, 10459.61it/s]
100%|██████████| 1/1 [00:00<00:00, 11748.75it/s]
