In [10]:
import os

import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account

In [4]:
# Build the BigQuery client for this project. key.json holds the
# service-account information used to connect to the BigQuery API.
project_id = 'zillow-houses-376821'
credentials = service_account.Credentials.from_service_account_file('key.json')
client = bigquery.Client(project = project_id, credentials = credentials)

In [9]:
# https://medium.com/pipeline-a-data-engineering-resource/automate-your-bigquery-schema-definitions-with-5-lines-of-python-7a1996749718
# Automate schema creation

def create_schema(field_list:list, type_list:list):
    """Build a BigQuery schema from parallel lists of column names and types.

    Args:
        field_list: Column names, in table order.
        type_list: BigQuery type names (e.g. "STRING"); type_list[i] is the
            type of field_list[i].

    Returns:
        A list of bigquery.SchemaField objects, one per (name, type) pair,
        in the same order as the inputs.

    Raises:
        ValueError: If the two lists have different lengths.
    """
    field_list = list(field_list)
    type_list = list(type_list)

    # zip() stops at the shorter input, so a length mismatch would otherwise
    # silently yield a schema that is missing columns.
    if len(field_list) != len(type_list):
        raise ValueError(
            "field_list and type_list must have the same length "
            "(got {} and {})".format(len(field_list), len(type_list))
        )

    return [
        bigquery.SchemaField(name, bq_type)
        for name, bq_type in zip(field_list, type_list)
    ]

# Function to load dataframe into BigQuery

def bq_load(df, dataset_id: str, table_id: str, schema, client, location: str = 'US'):
    """Load a DataFrame into a BigQuery table, replacing the table's contents.

    Args:
        df: DataFrame to upload.
        dataset_id: Dataset used to qualify ``table_id`` when ``table_id`` is
            a bare table name (contains no '.').
        table_id: Destination table. An id that already contains a '.'
            (e.g. "project.dataset.table") is passed through unchanged.
        schema: Schema assigned to the load job (e.g. the list returned by
            create_schema()); autodetection is turned off.
        client: BigQuery client used to submit the load job.
        location: Location passed to the load job. Defaults to 'US'.

    Returns:
        Whatever ``job.result()`` returns once the load job has finished.
    """
    # Only a bare string name needs qualifying; anything else (a dotted id or
    # a non-string destination object) is handed to the client as given.
    if isinstance(table_id, str) and '.' not in table_id and dataset_id:
        destination = '{}.{}'.format(dataset_id, table_id)
    else:
        destination = table_id

    job_config = bigquery.LoadJobConfig()
    # WRITE_TRUNCATE overwrites any existing rows in the destination table.
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.autodetect = False
    job_config.schema = schema
    job_config.ignore_unknown_values = False

    job = client.load_table_from_dataframe(
        df,
        destination,
        location=location,
        job_config=job_config,
    )

    # result() waits for the job to complete before returning.
    return job.result()

In [None]:
# Manual schema creation

# schema = [
#     bigquery.SchemaField("zpid", "INTEGER"),
#     bigquery.SchemaField("streetAddress", "STRING"),
#     bigquery.SchemaField("zipcode", "STRING"),
#     bigquery.SchemaField("city", "STRING"),
#     bigquery.SchemaField("state","STRING"),
#     bigquery.SchemaField("latitude", "NUMERIC"),
#     bigquery.SchemaField("longitude", "NUMERIC"),
#     bigquery.SchemaField("price", "INTEGER"),
#     bigquery.SchemaField("bathrooms", "FLOAT64"),
#     bigquery.SchemaField("bedrooms", "FLOAT64"),
#     bigquery.SchemaField("livingArea", "FLOAT64"),
#     bigquery.SchemaField("homeType", "STRING"),
#     bigquery.SchemaField("taxAssessedValue", "FLOAT64"),
#     bigquery.SchemaField("brokerName", "STRING")
# ]

In [43]:
# df_to_bq = bq_load(df, "zillow_houses", "zillow-houses-376821.zillow_houses.vancouverhouses", create_schema(field_list,type_list), client)

In [11]:
# Read reviseddataset.csv (path relative to the working directory) into a DataFrame.
df = pd.read_csv('reviseddataset.csv')

In [15]:
# Rename the 'newAddress' column to 'addressWithoutUnit' and rebind df to the result.
df = df.rename(columns = {'newAddress':'addressWithoutUnit'})

In [16]:
# Display the column labels to check the rename.
df.columns

Index(['zpid', 'streetAddress', 'zipcode', 'city', 'state', 'latitude',
       'longitude', 'price', 'bathrooms', 'bedrooms', 'livingArea', 'homeType',
       'taxAssessedValue', 'lotAreaValue', 'brokerName', 'neighbourhoodName',
       'addressWithoutUnit'],
      dtype='object')

In [18]:
# Split the flat dataset into three relations, each a column subset of df.
location_columns = [
    'zpid', 'streetAddress', 'zipcode', 'city', 'state',
    'latitude', 'longitude', 'addressWithoutUnit',
]
homeinfo_columns = [
    'streetAddress', 'price', 'bathrooms', 'bedrooms', 'livingArea',
    'homeType', 'taxAssessedValue', 'lotAreaValue', 'neighbourhoodName',
]
brokerageinfo_columns = ['zpid', 'brokerName']

location = df[location_columns]
homeinfo = df[homeinfo_columns]
brokerageinfo = df[brokerageinfo_columns]

In [19]:
# Gathering the types of each column and appending BigQuery datatype to list

def type_list_generator(df):
    """Map each DataFrame column's dtype to a BigQuery type name.

    Args:
        df: DataFrame whose columns will be loaded into BigQuery.

    Returns:
        A list of BigQuery type-name strings, one per column, in column order:
        "BOOLEAN" for boolean dtypes, "INTEGER" for integer dtypes,
        "FLOAT64" for floating-point dtypes of any width, "TIMESTAMP" for
        timezone-aware datetimes, "DATETIME" for timezone-naive datetimes,
        and "STRING" for everything else (including object columns).
    """
    dtype_checks = pd.api.types
    type_list = []

    # Iterate df.dtypes rather than df[column].dtype: with duplicate column
    # labels df[column] is a DataFrame, which has no single dtype.
    for dtype in df.dtypes:
        if dtype_checks.is_bool_dtype(dtype):
            type_list.append("BOOLEAN")
        elif dtype_checks.is_integer_dtype(dtype):
            type_list.append("INTEGER")
        elif dtype_checks.is_float_dtype(dtype):
            type_list.append("FLOAT64")
        elif isinstance(dtype, pd.DatetimeTZDtype):
            type_list.append("TIMESTAMP")
        elif dtype_checks.is_datetime64_any_dtype(dtype):
            type_list.append("DATETIME")
        else:
            # Text and any dtype not recognised above is described as STRING
            # rather than being mislabelled as an integer.
            type_list.append("STRING")
    return type_list

# Gathering every column name
def field_list_generator(df):
    """Return the DataFrame's column labels as a plain list, in column order."""
    return [label for label in df.columns]

In [22]:
# Load each relation into its own BigQuery table. The schema for each load is
# built from that frame's column names and dtypes.

location_schema = create_schema(field_list_generator(location), type_list_generator(location))
location_to_bq = bq_load(
    location,
    "zillow_houses",
    "zillow-houses-376821.zillow_houses.location",
    location_schema,
    client,
)

homeinfo_schema = create_schema(field_list_generator(homeinfo), type_list_generator(homeinfo))
homeinfo_to_bq = bq_load(
    homeinfo,
    "zillow_houses",
    "zillow-houses-376821.zillow_houses.homeinfo",
    homeinfo_schema,
    client,
)

brokerageinfo_schema = create_schema(field_list_generator(brokerageinfo), type_list_generator(brokerageinfo))
brokerageinfo_to_bq = bq_load(
    brokerageinfo,
    "zillow_houses",
    "zillow-houses-376821.zillow_houses.brokerageinfo",
    brokerageinfo_schema,
    client,
)

In [23]:
# Testing query: read ten rows back from the location table.

initial_query = """ 

SELECT * FROM `zillow-houses-376821.zillow_houses.location` LIMIT 10

"""

# Run the query and collect the rows into a DataFrame.
query_results = client.query(initial_query).to_dataframe()

print(query_results)

         zpid            streetAddress zipcode       city state   latitude  \
0   314396616           2973 McGill St  V5K1H8  Vancouver    BC  49.289010   
1   314397260           2565 Dundas St  V5K1P7  Vancouver    BC  49.285038   
2   314397306           2697 Dundas St  V5K1R1  Vancouver    BC  49.285027   
3   314398013         3466 Franklin St  V5K1Y3  Vancouver    BC  49.281788   
4   314398017         3467 Franklin St  V5K1Y4  Vancouver    BC  49.282303   
5   314398025         3495 Franklin St  V5K1Y4  Vancouver    BC  49.282300   
6  2059767952  2741 Hastings St E #303  V5K1Z8  Vancouver    BC  49.281414   
7  2059768478        3537 Georgia St E  V5K2J4  Vancouver    BC  49.278503   
8  2060865873     3523 Georgia St E #2  V5K2L9  Vancouver    BC  49.278477   
9   314399067           2447 Adanac St  V5K2M3  Vancouver    BC  49.277690   

    longitude   addressWithoutUnit  
0 -123.042404       2973 McGill St  
1 -123.052950       2565 Dundas St  
2 -123.049650       2697 Dunda

In [36]:
# Top 5 lowest $/sq ft per neighbourhood
# The first CTE computes price / livingArea (rounded to 2 decimals) for rows
# where both values are non-zero. The second CTE ranks those rows within each
# neighbourhoodName by that ratio, ascending. rank() gives tied rows the same
# rank, so a neighbourhood can return more than five rows.

top5lowestpricepersqft = """ 

WITH dollar_sqft_calculations AS (
    SELECT streetAddress, neighbourhoodName, 
    bathrooms, bedrooms,
    price, livingArea, homeType, round(price/livingArea,2) as dollarPerSqFt
    FROM `zillow-houses-376821.zillow_houses.homeinfo`
    WHERE livingArea != 0 and price != 0
),
ranking_dollarpersq AS (
    SELECT *, 
    rank() over (PARTITION BY neighbourhoodName ORDER BY dollarPerSqFt) as ranking
    FROM dollar_sqft_calculations
)

SELECT * FROM ranking_dollarpersq WHERE ranking <= 5 

"""

# Rebinds query_results, which previously held the test-query rows.
query_results = client.query(top5lowestpricepersqft).to_dataframe()



In [37]:
# Print the ranked lowest-$/sq-ft rows as plain text.
print(query_results)

                   streetAddress neighbourhoodName  bathrooms  bedrooms  \
0                 1055 Wolfe Ave       Shaughnessy        5.0       8.0   
1             1056 Richelieu Ave       Shaughnessy        3.0       5.0   
2    1011 King Edward Ave W #201       Shaughnessy        2.0       2.0   
3                  4549 Osler St       Shaughnessy        6.0       6.0   
4                  5290 Angus Dr       Shaughnessy        4.0       6.0   
..                           ...               ...        ...       ...   
104          3445 E 49th Ave #98            Sunset        2.0       4.0   
105              1322 E 62nd Ave            Sunset        6.0       6.0   
106              1045 E 63rd Ave            Sunset        4.0       6.0   
107               268 E 65th Ave            Sunset        3.0       5.0   
108               320 E 57th Ave            Sunset        3.0       5.0   

         price  livingArea       homeType  dollarPerSqFt  ranking  
0    4988000.0      5765.0  SIN

In [38]:
# Save the results under queries/. The directory is created first because
# DataFrame.to_csv does not create missing parent directories.
os.makedirs('queries', exist_ok = True)
query_results.to_csv('queries/Top5LowestDollarPerSqft.csv', index = False)

In [35]:
# Top 3 highest price per neighbourhood
# Ranks homeinfo rows within each neighbourhoodName by price, descending,
# keeping only rows where livingArea and price are both non-zero (livingArea
# is the divisor for dollarPerSqFt). rank() gives tied rows the same rank, so
# a neighbourhood can return more than three rows.

top3highestprice = """ 

WITH top_3_rank AS (
    SELECT streetAddress, neighbourhoodName, 
    bathrooms, bedrooms,
    price, livingArea, homeType, 
    round(price/livingArea,2) as dollarPerSqFt,
    rank() over (PARTITION BY neighbourhoodName ORDER BY price DESC) as ranking
    FROM `zillow-houses-376821.zillow_houses.homeinfo`
    WHERE livingArea != 0 and price != 0
)

SELECT * FROM top_3_rank WHERE ranking <= 3 

"""

# Run the query and collect the rows into a DataFrame.
query2_results = client.query(top3highestprice).to_dataframe()

In [39]:
# Print the highest-priced rows per neighbourhood as plain text.
print(query2_results)

            streetAddress         neighbourhoodName  bathrooms  bedrooms  \
0      3439 Point Grey Rd                 Kitsilano        4.0       4.0   
1      2529 Point Grey Rd                 Kitsilano        3.0       3.0   
2      2487 Point Grey Rd                 Kitsilano        5.0       4.0   
3        1351 Laurier Ave               Shaughnessy        8.0       5.0   
4           1738 Angus Dr               Shaughnessy        7.0       4.0   
..                    ...                       ...        ...       ...   
62        2026 E 32nd Ave  Kensington-Cedar Cottage        7.0       8.0   
63     5059 Sherbrooke St  Kensington-Cedar Cottage        5.0       5.0   
64         692 W 39th Ave              South Cambie        2.0       4.0   
65  846 King Edward Ave W              South Cambie        5.0       6.0   
66    5289 Cambie St #601              South Cambie        3.0       4.0   

         price  livingArea       homeType  dollarPerSqFt  ranking  
0   24988000.0     

In [40]:
# Save the results under queries/. The directory is created first because
# DataFrame.to_csv does not create missing parent directories.
os.makedirs('queries', exist_ok = True)
query2_results.to_csv('queries/Top3HighestPrice.csv', index = False)

In [41]:
# Top 3 lowest price per neighbourhood
# Ranks homeinfo rows within each neighbourhoodName by price, ascending,
# keeping only rows where livingArea and price are both non-zero (livingArea
# is the divisor for dollarPerSqFt). rank() gives tied rows the same rank, so
# a neighbourhood can return more than three rows.

top3lowestprice = """ 

WITH top_3_rank AS (
    SELECT streetAddress, neighbourhoodName, 
    bathrooms, bedrooms,
    price, livingArea, homeType, 
    round(price/livingArea,2) as dollarPerSqFt,
    rank() over (PARTITION BY neighbourhoodName ORDER BY price) as ranking
    FROM `zillow-houses-376821.zillow_houses.homeinfo`
    WHERE livingArea != 0 and price != 0
)

SELECT * FROM top_3_rank WHERE ranking <= 3 

"""

# Run the query and collect the rows into a DataFrame.
query3_results = client.query(top3lowestprice).to_dataframe()

In [42]:
# Print the lowest-priced rows per neighbourhood as plain text.
print(query3_results)

           streetAddress    neighbourhoodName  bathrooms  bedrooms      price  \
0   1055 Broadway E #204             Fairview        1.0       1.0   550000.0   
1   1551 W 11th Ave #208             Fairview        1.0       1.0   555000.0   
2   1216 W 11th Ave #206             Fairview        1.0       1.0   558000.0   
3      2224 Eton St #103   Grandview-Woodland        1.0       0.0   434000.0   
4    1533 E 8th Ave #101   Grandview-Woodland        1.0       1.0   649900.0   
..                   ...                  ...        ...       ...        ...   
62      8469 Portside Ct  Victoria-Fraserview        3.0       3.0  1088000.0   
63     1980 E Kent Ave S  Victoria-Fraserview        3.0       3.0  1089000.0   
64   2580 Tolmie St #504      West Point Grey        1.0       1.0   688000.0   
65   3755 W 8th Ave #213      West Point Grey        2.0       2.0  1458000.0   
66       2460 Sasamat St      West Point Grey        3.0       2.0  1650000.0   

    livingArea   homeType  

In [43]:
# Save the results under queries/. The directory is created first because
# DataFrame.to_csv does not create missing parent directories.
os.makedirs('queries', exist_ok = True)
query3_results.to_csv('queries/Top3LowestPrice.csv', index = False)