# New dim_shapes

In [1]:
import datetime
import geopandas as gpd
import os
import pandas as pd

os.environ["CALITP_BQ_MAX_BYTES"] = str(130_000_000_000)

from calitp.tables import tbl
from calitp import query_sql
from siuba import *

import utils
import shared_utils

E0325 17:50:08.710008616    1397 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies
E0325 17:50:11.162861127    1397 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies


## With string as date

In [2]:
SELECTED_DATE = "2022-1-1"

start = datetime.datetime.now()

df = (tbl.views.gtfs_schedule_dim_shapes()
      >> filter(_.calitp_extracted_at <= SELECTED_DATE, 
                _.calitp_deleted_at > SELECTED_DATE
               )
      >> filter(_.calitp_itp_id==17)
      >> select(_.calitp_itp_id, _.calitp_url_number, _.shape_id)
      >> inner_join(_, 
                    tbl.views.gtfs_schedule_dim_shapes_geo(),
                    ["calitp_itp_id", "calitp_url_number", "shape_id"])
      >> collect()
     )
end = datetime.datetime.now()
print(end - start)



0:00:03.704692


## With datetime as date

In [3]:
SELECTED_DATE = datetime.date(2022, 1, 1)

start = datetime.datetime.now()

df = (tbl.views.gtfs_schedule_dim_shapes()
      >> filter(_.calitp_extracted_at <= SELECTED_DATE, 
                _.calitp_deleted_at > SELECTED_DATE
               )
      >> filter(_.calitp_itp_id == 182)
      >> select(_.calitp_itp_id, _.calitp_url_number, _.shape_id)
      >> inner_join(_, 
                    tbl.views.gtfs_schedule_dim_shapes_geo(),
                    ["calitp_itp_id", "calitp_url_number", "shape_id"])
     )
end = datetime.datetime.now()
print(end - start)

0:00:00.402245




## Handle subsets of operators 

* By default, exclude `ITP_ID==200`, but what if we just want 5 operators?
* Instead of `collect` + `isin`, can we construct `or` statement?

In [4]:
def unpack_list_and_use_or_statement(col, my_list):
    new_or_statement = ""
    
    for i in range(0, len(my_list)):
        add_me = f"(_[{col}]=={my_list[i]})"
        
        if i==0:
            new_or_statement = add_me
        else:
            new_or_statement = new_or_statement + " or " + add_me

    return new_or_statement

In [5]:
INCLUDE_ITP_ID = [17, 100]

sentence = unpack_list_and_use_or_statement("calitp_itp_id", INCLUDE_ITP_ID)
sentence

'(_[calitp_itp_id]==17) or (_[calitp_itp_id]==100)'

In [6]:
from inspect import cleandoc

cleandoc(sentence)

'(_[calitp_itp_id]==17) or (_[calitp_itp_id]==100)'

In [7]:
# Without collect(), which takes awhile

start = datetime.datetime.now()

df = (tbl.views.gtfs_schedule_dim_shapes()
      >> filter(_.calitp_extracted_at <= SELECTED_DATE, 
                _.calitp_deleted_at > SELECTED_DATE
               )
      >> filter(cleandoc(
          unpack_list_and_use_or_statement(
              "calitp_itp_id", INCLUDE_ITP_ID
          )
      ))
      >> select(_.calitp_itp_id, _.calitp_url_number, _.shape_id)
      >> inner_join(_, 
                    tbl.views.gtfs_schedule_dim_shapes_geo(),
                    ["calitp_itp_id", "calitp_url_number", "shape_id"])
     )


end = datetime.datetime.now()
print(end - start) 

ArgumentError: Textual SQL expression '(_[calitp_itp_id]==17) or...' should be explicitly declared as text('(_[calitp_itp_id]==17) or...')

In [8]:
start = datetime.datetime.now()

df = (tbl.views.gtfs_schedule_dim_shapes()
      >> filter(_.calitp_extracted_at <= SELECTED_DATE, 
                _.calitp_deleted_at > SELECTED_DATE
               )
      >> filter(_.calitp_itp_id==17)
      >> select(_.calitp_itp_id, _.calitp_url_number, _.shape_id)
      >> inner_join(_, 
                    tbl.views.gtfs_schedule_dim_shapes_geo(),
                    ["calitp_itp_id", "calitp_url_number", "shape_id"])
     )

df2 = (tbl.views.gtfs_schedule_dim_shapes()
      >> filter(_.calitp_extracted_at <= SELECTED_DATE, 
                _.calitp_deleted_at > SELECTED_DATE
               )
      >> filter(_.calitp_itp_id==4)
      >> select(_.calitp_itp_id, _.calitp_url_number, _.shape_id)
      >> inner_join(_, 
                    tbl.views.gtfs_schedule_dim_shapes_geo(),
                    ["calitp_itp_id", "calitp_url_number", "shape_id"])
     )


end = datetime.datetime.now()
print(end - start)      

0:00:00.718464


In [9]:
query_sql('''
df 
UNION
df2
'''
)

DatabaseError: (google.cloud.bigquery.dbapi.exceptions.DatabaseError) 400 Syntax error: Expected end of input but got identifier "df" at [2:1]

Location: us-west2
Job ID: 5c2a61bb-dd3d-40da-bb71-6757087527dc

[SQL: 
df 
UNION
df2]
(Background on this error at: https://sqlalche.me/e/14/4xp6)

In [None]:
start = datetime.datetime.now()

df = (tbl.views.gtfs_schedule_dim_shapes()
      >> filter(_.calitp_extracted_at <= SELECTED_DATE, 
                _.calitp_deleted_at > SELECTED_DATE
               )
      >> select(_.calitp_itp_id, _.calitp_url_number, _.shape_id)
      >> collect()
      >> filter(_.calitp_itp_id.isin(INCLUDE_ITP_ID))
     )

end = datetime.datetime.now()
print(end - start)  

In [None]:
start = datetime.datetime.now()

def make_linestring(x):

    # shapely errors if the array contains only one point
    if len(x) > 1:
        # each point in the array is wkt
        # so convert them to shapely points via list comprehension
        as_wkt = [shapely.wkt.loads(i) for i in x]
        return shapely.geometry.LineString(as_wkt)

# apply the function
df['geometry'] = df.pt_array.apply(make_linestring)

# convert to geopandas; geometry column contains the linestring
gdf = gpd.GeoDataFrame(df, geometry = 'geometry', crs=WGS84)

end = datetime.datetime.now()
print(end - start)