# Use ibis

Reference: Two Sigma article on using Ibis with BigQuery — https://www.twosigma.com/articles/ibis-and-bigquery-scalable-analytics-with-the-comfort-of-python/

In [None]:
import pandas as pd
import ibis
import ibis_bigquery

# Project ID handed to ibis_bigquery.connect() as `project_id` in the next cell.
PROJECT = "cal-itp-data-infra-staging"

In [None]:
# Connect to BigQuery, scoped to the `views` dataset of PROJECT.
conn = ibis_bigquery.connect(project_id=PROJECT, dataset_id="views")

# Table expression for `gtfs_agency_names`; the cells below filter,
# compile and execute it.
table = conn.table("gtfs_agency_names")

In [None]:
# Bare expression: the notebook renders the table expression as this cell's output.
table

In [None]:
# Run the query restricted to two agencies (matched on calitp_itp_id)
# and show the executed result as the cell output.
itp_ids = [182, 183]
table[table.calitp_itp_id.isin(itp_ids)].execute()

In [None]:
# IPython source lookup for `execute`, left disabled:
#table.execute??
# Compile the unfiltered table expression and show the result as the cell
# output (presumably the generated BigQuery SQL -- confirm against the backend).
table.compile()

In [None]:
# Print the compiled form of the same two-agency filter used above,
# without executing it.
filtered = table[table.calitp_itp_id.isin([182, 183])]
print(filtered.compile())

In [None]:
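# Use ibis. NOTE: `expr`, which the query cells below read from, is only
# created by the two commented-out Civis catalog lines that follow; it must
# be defined (by re-enabling them or otherwise) before those cells are run.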
#catalog = intake_civis.open_postgres_catalog()
#expr = catalog.public.import311.to_ibis()

# Request-type groupings. Each list is passed to `requesttype.isin(...)` to
# subset the service-request table, so every entry must match the stored
# value exactly (a misspelled entry silently matches nothing).
refuse_pickup = [
    'Bulky Items',
    'Electronic Waste',
    'Metal/Household Appliances',
    'Illegal Dumping Pickup',
    'Illegal Dumping in Progress',
    'Service Not Complete',
]

street_repair = [
    'Barricade Removal',
    'Bus Pad/Landing',
    'Curb Repair',
    'Flooding',
    'General Street Inspection',
    'Guard/Warning Rail Maintenance',
    'Gutter Repair',
    'Land/Mud Slide',
    'Pothole - Small Asphalt Repair',
    'Resurfacing',
    'Sidewalk Repair',
    'Street Sweeping',
]

trees_vegetation = [
    'Bees or Beehive',
    'Median Island Maintenance',
    'Overgrown Vegetation/Plants',
    'Palm Fronds Down',
    'Street Tree Inspection',
    'Street Tree Violations',
    'Tree Emergency',
    'Tree Obstruction',
    'Tree Permits',
    'Weed Abatement for Pvt Parcels',
]

investigation = [
    'News Rack Violation',
    'Obstructions',
    'Tables and Chairs Obstructing',
]

# Columns kept from the source table for every query.
cols = [
    'srnumber', 'createddate', 'requesttype', 'status', 'servicedate',
    'closeddate', 'latitude', 'longitude', 'cd', 'ncname',
]


def query_and_execute(query_expression, subset_request, cols):
    """Subset a table expression by request type and run the query.

    Args:
        query_expression: table expression exposing a `requesttype` column.
        subset_request: request-type values to keep (matched with `isin`).
        cols: column names to select.

    Returns:
        The result of executing the filtered, date-fixed query with no
        row limit.
    """
    wanted_rows = query_expression.requesttype.isin(subset_request)
    selection = query_expression[wanted_rows][cols]

    # Date filtering / string casting happens in the query, before execution.
    return fix_dates(selection).execute(limit=None)

def fix_dates(df_query):
    """Drop future-dated rows and cast the date columns to strings.

    Args:
        df_query: table expression containing the `createddate`,
            `closeddate` and `servicedate` columns.

    Returns:
        A new expression keeping only rows where all three dates are
        earlier than `ibis.now()`, with those three columns cast to string
        (shapefile can't handle datetime).
    """
    # `reduce` is not a builtin on Python 3 and is not imported at the top
    # of the notebook, so bring it into scope here.
    from functools import reduce

    datecols = ["createddate", "closeddate", "servicedate"]

    # AND the per-column "before now" conditions into a single predicate.
    # NOTE(review): rows where any of these dates is NULL presumably fail the
    # comparison and are dropped as well -- confirm that this is intended.
    not_in_future = reduce(
        lambda x, y: x & y,
        [df_query[c] < ibis.now() for c in datecols],
    )

    dates_as_text = {c: df_query[c].cast("string") for c in datecols}

    return df_query[not_in_future].mutate(**dates_as_text)

In [None]:
# NOTE(review): `expr` is not assigned anywhere in the visible cells (the
# lines that would create it are commented out above) -- it has to exist
# before this cell runs.
street, tree, investigate = (
    query_and_execute(expr, request_types, cols)
    for request_types in (street_repair, trees_vegetation, investigation)
)

# The refuse-pickup requests are fetched in two pieces, split on the
# created date at 2018-01-01.
bulky_query = fix_dates(expr[expr.requesttype.isin(refuse_pickup)][cols])
created_on = bulky_query.createddate.cast('date')

bulky1_query = bulky_query[created_on < '2018-01-01']
bulky2_query = bulky_query[created_on >= '2018-01-01']

bulky1 = bulky1_query.execute(limit=None)
bulky2 = bulky2_query.execute(limit=None)

print("Execute queries")

In [None]:
def light_cleaning(df, category_name):
    """Tidy a service-request dataframe for export.

    Adds a constant `Category` column and a `Year` column taken from the
    created date, reduces the created/service dates to plain dates
    (unparseable values are coerced to missing), renames the columns, then
    keeps a fixed column order sorted by `SRNumber`.

    Args:
        df: dataframe with the lower-case source columns (`srnumber`,
            `createddate`, `servicedate`, ...).
        category_name: value written to every row of `Category`.

    Returns:
        A new dataframe with a fresh 0..n-1 index. `ClosedDate` is renamed
        but is not part of the output column order, so it is not returned.
    """
    created = pd.to_datetime(df.createddate, errors="coerce")
    serviced = pd.to_datetime(df.servicedate, errors="coerce")

    renames = {
        'srnumber': 'SRNumber',
        'createddate': 'CreatedDate',
        'requesttype': 'RequestType',
        'status': 'Status',
        'servicedate': 'ServiceDate',
        'closeddate': 'ClosedDate',
        'cd': 'CD',
        'ncname': 'NCName',
    }

    cleaned = df.assign(
        Category=category_name,
        Year=created.dt.year,
        createddate=created.dt.date,
        servicedate=serviced.dt.date,
    ).rename(columns=renames)

    output_columns = [
        'SRNumber', 'Category', 'RequestType', 'Status',
        'Year', 'CreatedDate', 'ServiceDate',
        'CD', 'NCName', 'longitude', 'latitude',
    ]

    cleaned = cleaned.reindex(columns=output_columns)
    cleaned = cleaned.sort_values('SRNumber')
    return cleaned.reset_index(drop=True)


# Do some light cleaning
# `dataframes` maps an output-file key to the cleaned frame written out later.
dataframes = {}

# Label each frame with the category of the request group it was queried
# from: the street-repair frame is "street", not "bulky".
street = light_cleaning(street, "street")
tree = light_cleaning(tree, "tree")

# Street and tree requests are exported together. `DataFrame.append` was
# removed in pandas 2.0; `pd.concat` is the equivalent that works on both
# old and new pandas versions.
new_street = (pd.concat([street, tree], sort=False)
              .sort_values("SRNumber")
              .reset_index(drop=True)
              )

dataframes["street"] = new_street
dataframes["investigate"] = light_cleaning(investigate, "investigate")
# The two refuse-pickup pieces share the "bulky" category; the key only
# distinguishes the pre-2018 and 2018-onward query results.
dataframes["bulky2015"] = light_cleaning(bulky1, "bulky")
dataframes["bulky2018"] = light_cleaning(bulky2, "bulky")

print("Finish cleaning")

In [None]:
# Write every cleaned frame to its own parquet file under `s3_file_path`.
# NOTE(review): `s3_file_path` is not defined in the visible cells -- it must
# be set before this cell runs.
for key in dataframes:
    print(key)
    frame = dataframes[key]
    frame.to_parquet(
        f'{s3_file_path}service_requests_{key}.parquet',
        allow_truncated_timestamps=True,
    )
    print(f"Upload {key} to S3")