In [2]:
# If using the native Google BigQuery API module:
from google.cloud import bigquery
from google.cloud.exceptions import NotFound
import pandas as pd
import os
from google.oauth2 import service_account
from datetime import datetime

In [3]:
# If using a service account key file, save the path to that file in credentials.py and import credentials
path_to_service_account_key_file = "keys.json"
#!pip install credentials

311 dataset first

In [4]:
# # Set the name of the dimension
# dimension_name = 'location'

# # Set the name of the surrogate key
# surrogate_key = f"{dimension_name}_dim_id"

# # # Set the name of the business key
# # business_key = f'{dimension_name}_id'

# # Set the GCP Project, dataset and table name
# gcp_project = 'cis-4400-404715'
# bq_dataset = '311_illegal_parking'
# table_name = f"{dimension_name}_dimension"
# # Construct the full BigQuery path to the table
# dimension_table_path = f"{gcp_project}.{bq_dataset}.{table_name}"

# # Set the path to the source data files. Use double-slash for Windows paths C:\\myfolder
# # For Linux use forward slashes    /home/username/python_etl
# # For Mac use forward slashes      /users/username/python_etl
# # file_source_path = 'c:\\Python_ETL'
# # file_source_path = 'C:\\Users\\rholo\\OneDrive\\Documents\\classes\\4400\\311'
# file_source_path = 'C:\\Users\\andyy\\Desktop\\cis-4400\\data\\311_master.csv'

In [5]:
def transform_data(df: pd.DataFrame, column_list):
    """
    transform_data
    Accepts a data frame
    Performs any specific cleaning and transformation steps on the dataframe
    Returns the modified dataframe
    This function can be modified based on required changes
    """
    # Select the columns for this dimension
    df = df[column_list]
    # Remove duplicates
    df = df.drop_duplicates()
    return df

In [6]:
def create_bigquery_client():
    """
    create_bigquery_client
    Creates a BigQuery client using the path to the service account key file
    for credentials.
    Returns the BigQuery client object
    """
    try:
        # If authenticating using a service account key file, use the following code:
        bqclient = bigquery.Client.from_service_account_json(service_account_key)
        # Google Colab authentication already completed
        bqclient = bigquery.Client(gcp_project)
        return bqclient
    except Exception as err:
        print("error")
        # os._exit(-1)
    return bqclient

In [7]:
def upload_bigquery_table(bqclient, table_path, write_disposition, df):
    """
    upload_bigquery_table
    Accepts a path to a BigQuery table, the write disposition and a dataframe
    Loads the data into the BigQuery table from the dataframe.
    for credentials.
    The write disposition is either
    write_disposition="WRITE_TRUNCATE"  Erase the target data and load all new data.
    write_disposition="WRITE_APPEND"    Append to the existing table
    """
    try:
        # Set up a BigQuery job configuration with the write_disposition.
        job_config = bigquery.LoadJobConfig(write_disposition=write_disposition)
        
        # Submit the job
        print(type(bqclient))
        job = bqclient.load_table_from_dataframe(df, table_path, job_config=job_config)
        # Show the job results
    except Exception as err:
        print(err)
        #os._exit(-1)

In [8]:
def bigquery_table_exists(bqclient, table_path):
    """
    bigquery_table_exists
    Accepts a path to a BigQuery table
    Checks if the BigQuery table exists.
    Returns True or False
    """
    try:
        bqclient.get_table(table_path)  # Make an API request.
        return True
    except NotFound:
        return False

In [9]:
def query_bigquery_table(table_path, bqclient, surrogate_key):
    """
    query_bigquery_table
    Accepts a path to a BigQuery table and the name of the surrogate key
    Queries the BigQuery table but leaves out the update_timestamp and surrogate key columns
    Returns the dataframe
    """
    bq_df = pd.DataFrame
    sql_query = 'SELECT * EXCEPT ( update_timestamp, '+surrogate_key+') FROM `' + table_path + '`'
    try:
        bq_df = bqclient.query(sql_query).to_dataframe()
    except Exception as err:
        print("error")
    return bq_df

In [10]:
def add_surrogate_key(df, dimension_name='customers', offset=1):
    """
    add_surrogate_key
    Accepts a data frame and inserts an integer identifier as the first column
    Returns the modified dataframe
    """
    # Reset the index to count from 0
    df.reset_index(drop=True, inplace=True)
    # Add the new surrogate key starting from offset
    df.insert(0, dimension_name+'_dim_id', df.index+offset)
    return df

In [11]:
def build_new_table(bqclient, dimension_table_path, dimension_name, df):
    """
    build_new_table
    Accepts a path to a dimensional table, the dimension name and a data frame
    Add the surrogate key and a record timestamp to the data frame
    Inserts the contents of the dataframe to the dimensional table.
    """
    # Add a surrogate key
    df = add_surrogate_key(df, dimension_name, 1)
    # Add the update timestamp
    # Upload the dataframe to the BigQuery table
    upload_bigquery_table(bqclient, dimension_table_path, "WRITE_TRUNCATE", df)

In [12]:
def rename_column(df, bq_dataset, dimension_name):

    # Renaming for 311
    if bq_dataset == '311_illegal_parking':
        if dimension_name == 'complaint':
            df = df.rename(columns={'descriptor': 'complaint_description'})
        elif dimension_name == 'complaint_source':
            df = df.rename(columns={'open_data_channel_type': 'complaint_source_channel'})
        elif dimension_name == 'location':
            df = df.rename(columns={'city': 'incident_city', 'incident_zip': 'incident_zipcode'})

    # Renaming for Open Parking
    elif bq_dataset == 'open_parking':
        if dimension_name == 'agency':
            df = df.rename(columns={'issuing_agency': 'agency_name'})
        elif dimension_name == 'location':
            # NOTE: county is from API, and is not yet calculated.
            # make sure to calculate county BEFORE calling this function
            df = df.rename(columns={'precinct': 'precinct_num', 'county': 'borough'})
        elif dimension_name == 'violation':
            df = df.rename(columns={'violation': 'violation_description'})
        elif dimension_name == 'violator':
            df = df.rename(columns={'plate': 'violator_plate', 'state': 'violator_state'})
    return df

In [13]:
def calculate_location_attributes(df):
    precinct_to_zipcode = {
        "1": "10013",
        "5": "10013",
        "6": "10014",
        "7": "10002",
        "9": "10003",
        "10": "10011",
        "13": "10010",
        "Midtown South": "10001",
        "17": "10022",
        "Midtown North": "10019",
        "19": "10065",
        "20": "10024",
        "Central Park": "10024",
        "23": "10029",
        "24": "10025",
        "25": "10035",
        "26": "10027",
        "28": "10027",
        "30": "10031",
        "32": "10030",
        "33": "10032",
        "34": "10033",
        "40": "10454",
        "41": "10459",
        "42": "10451",
        "43": "10473",
        "44": "10452",
        "45": "10465",
        "46": "10457",
        "47": "10466",
        "48": "10457",
        "49": "10461",
        "50": "10463",
        "52": "10467",
        "60": "11224",
        "61": "11223",
        "62": "11214",
        "63": "11210",
        "66": "11204",
        "67": "11226",
        "68": "11220",
        "69": "11236",
        "70": "11230",
        "71": "11225",
        "72": "11232",
        "73": "11212",
        "75": "11208",
        "76": "11231",
        "77": "11213",
        "78": "11217",
        "79": "11216",
        "81": "11221",
        "83": "11237",
        "84": "11201",
        "88": "11205",
        "90": "11211",
        "94": "11222",
        "100": "11693",
        "101": "11691",
        "102": "11418",
        "103": "11432",
        "104": "11385",
        "105": "11428",
        "106": "11417",
        "107": "11365",
        "108": "11101",
        "109": "11354",
        "110": "11373",
        "111": "11361",
        "112": "11375",
        "113": "11434",
        "114": "11103",
        "115": "11372",
        "120": "10301",
        "121": "10314",
        "122": "10306",
        "123": "10307"
    }

    zipcode_to_borough = {
        "10013": "Manhattan",
        "10014": "Manhattan",
        "10002": "Manhattan",
        "10003": "Manhattan",
        "10011": "Manhattan",
        "10010": "Manhattan",
        "10001": "Manhattan",
        "10022": "Manhattan",
        "10019": "Manhattan",
        "10065": "Manhattan",
        "10024": "Manhattan",
        "10029": "Manhattan",
        "10025": "Manhattan",
        "10035": "Manhattan",
        "10027": "Manhattan",
        "10031": "Manhattan",
        "10030": "Manhattan",
        "10032": "Manhattan",
        "10033": "Manhattan",
        "10454": "Bronx",
        "10459": "Bronx",
        "10451": "Bronx",
        "10473": "Bronx",
        "10452": "Bronx",
        "10465": "Bronx",
        "10457": "Bronx",
        "10466": "Bronx",
        "10461": "Bronx",
        "10463": "Bronx",
        "10467": "Bronx",
        "11224": "Brooklyn",
        "11223": "Brooklyn",
        "11214": "Brooklyn",
        "11210": "Brooklyn",
        "11204": "Brooklyn",
        "11226": "Brooklyn",
        "11220": "Brooklyn",
        "11236": "Brooklyn",
        "11230": "Brooklyn",
        "11225": "Brooklyn",
        "11232": "Brooklyn",
        "11212": "Brooklyn",
        "11208": "Brooklyn",
        "11231": "Brooklyn",
        "11213": "Brooklyn",
        "11217": "Brooklyn",
        "11216": "Brooklyn",
        "11221": "Brooklyn",
        "11237": "Brooklyn",
        "11201": "Brooklyn",
        "11205": "Brooklyn",
        "11211": "Brooklyn",
        "11222": "Brooklyn",
        "11693": "Queens",
        "11691": "Queens",
        "11418": "Queens",
        "11432": "Queens",
        "11385": "Queens",
        "11428": "Queens",
        "11417": "Queens",
        "11365": "Queens",
        "11101": "Queens",
        "11354": "Queens",
        "11373": "Queens",
        "11361": "Queens",
        "11375": "Queens",
        "11434": "Queens",
        "11103": "Queens",
        "11372": "Queens",
        "10301": "Staten Island",
        "10314": "Staten Island",
        "10306": "Staten Island",
        "10307": "Staten Island"
    }

    df['zipcode'] = df['precinct'].map(precinct_to_zipcode)
    df['borough'] = df['zipcode'].map(zipcode_to_borough)
    return df

311

In [14]:
# 311 311 311 311 311 311 311 311 311 311 311 311 311 311
# Program main
# Load the CSV File into a dataframe
# Transform the Dataframe
# Create a BigQuery client
# See if the target dimension table exists
#    If not exists, load the data into a new table
#    If exists, insert new records into the table
if __name__ == "__main__":

    dim_dict = {
    'location': ['borough', 'city', 'incident_zip', 'incident_address', 'location_type',],
    'complaint': ['complaint_type', 'descriptor'],
    'complaint_source': ['open_data_channel_type'],
    'status': ['status'],
    'date': ['created_date'],
    }

    for key, value in dim_dict.items():
        dimension_name = key
        column = value

        # Set the name of the surrogate key
        surrogate_key = f"{dimension_name}_dim_id"

        # Set the GCP Project, dataset and table name
        gcp_project = 'cis-4400-404715'
        bq_dataset = '311_illegal_parking'
        table_name = f"{dimension_name}_dimension"
        # Construct the full BigQuery path to the table
        dimension_table_path = f"{gcp_project}.{bq_dataset}.{table_name}"

        # Set the path to the source data files. Use double-slash for Windows paths C:\\myfolder
        file_source_path = 'C:\\Users\\andyy\\Desktop\\cis-4400\\data\\311_master.csv'

        df = pd.DataFrame
        # Load in the data file
        with open(file_source_path, 'r') as data:
                df = pd.read_csv(data)
            # Set all of the column names to lower case letters
        df = df.rename(columns=str.lower)
            
            
        #df = load_csv_data_file(file_source_path, "my_311_data_WaterQuality.csv", df)
        # Transform the data
        df = transform_data(df, column)
    
        df = rename_column(df, bq_dataset, dimension_name)
    
        if dimension_name == 'date':
            df['created_date'] = pd.to_datetime(df['created_date'])
            df = pd.DataFrame({
                'full_date': df['created_date'].dt.date,
                'year': df['created_date'].dt.year,
                'month': df['created_date'].dt.month,
                'month_name': df['created_date'].dt.strftime('%B'),  # Month name
                'day': df['created_date'].dt.day,
                'weekday_name': df['created_date'].dt.strftime('%A'),  # Month name
            })
            
        # Create the BigQuery Client
        # setup enviroment parameters to connect to BQ project
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = path_to_service_account_key_file

        # Construct a BigQuery client object
        bqclient = bigquery.Client()

        # See if the target dimensional table exists
        target_table_exists = bigquery_table_exists(bqclient, dimension_table_path  )

        # If the target dimension table does not exist, load all of the data into a new table
        if not target_table_exists:
            build_new_table( bqclient, dimension_table_path, dimension_name, df)
        # If the target table exists, then perform an incremental load
    

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\andyy\\Desktop\\cis-4400\\data\\311_master.csv'

OPEN PARKING

In [None]:
# OPEN PARKING OPEN PARKING OPEN PARKING OPEN PARKING OPEN PARKING OPEN PARKING OPEN PARKING
# Program main
# Load the CSV File into a dataframe
# Transform the Dataframe
# Create a BigQuery client
# See if the target dimension table exists
#    If not exists, load the data into a new table
#    If exists, insert new records into the table
if __name__ == "__main__":

    dim_dict = {
      'location': ['precinct'],
      'agency': ['issuing_agency'],
      'violation': ['violation', 'violation_status'],
      'violator': ['plate', 'state', 'license_type'],
      'date': ['issue_date',]
    }

    for key, value in dim_dict.items():
        dimension_name = key
        column = value

        # Set the name of the surrogate key
        surrogate_key = f"{dimension_name}_dim_id"

        # Set the GCP Project, dataset and table name
        gcp_project = 'cis-4400-404715'
        bq_dataset = 'open_parking'
        table_name = f"{dimension_name}_dimension"
        # Construct the full BigQuery path to the table
        dimension_table_path = f"{gcp_project}.{bq_dataset}.{table_name}"

        # Set the path to the source data files. Use double-slash for Windows paths C:\\myfolder
        file_source_path = 'C:\\Users\\andyy\\Desktop\\cis-4400\\data\\open_parking_master.csv'

        df = pd.DataFrame
        # Load in the data file
        with open(file_source_path, 'r') as data:
                df = pd.read_csv(data)
            # Set all of the column names to lower case letters
        df = df.rename(columns=str.lower)
            
            
        #df = load_csv_data_file(file_source_path, "my_311_data_WaterQuality.csv", df)
        # Transform the data
        df = transform_data(df, column)

        # Call a function here that calculates the zipcode based on the precinct number
# --> --> -->
        if dimension_name == 'location':
            df = calculate_location_attributes(df)


        df = rename_column(df, bq_dataset, dimension_name)

        if dimension_name == 'date':
            # Convert 'issue_date' to datetime using the specified format
            df['issue_date'] = pd.to_datetime(df['issue_date'], format='%m/%d/%Y', errors='coerce')

            # Create a DataFrame with extracted date components
            df = pd.DataFrame({
                'full_date': df['issue_date'].dt.date,
                'year': df['issue_date'].dt.year,
                'month': df['issue_date'].dt.month,
                'month_name': df['issue_date'].dt.strftime('%B'),  # Month name
                'day': df['issue_date'].dt.day,
                'weekday_name': df['issue_date'].dt.strftime('%A'),  # Weekday name
            })



        # Create the BigQuery Client
        # setup enviroment parameters to connect to BQ project
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = path_to_service_account_key_file

        # Construct a BigQuery client object
        bqclient = bigquery.Client()

        # See if the target dimensional table exists
        target_table_exists = bigquery_table_exists(bqclient, dimension_table_path  )

        # If the target dimension table does not exist, load all of the data into a new table
        if not target_table_exists:
            build_new_table( bqclient, dimension_table_path, dimension_name, df)
        # If the target table exists, then perform an incremental load
    

<class 'google.cloud.bigquery.client.Client'>
