## Filter multiple feeds
Evan: <i>I'm also seeing multiple feeds in the District Digest Map. I don't mind them, but it may be helpful to try to filter for just Public Currently Operating Fixed Route or Regional Subfeed.</i>

### Relevant Links
* https://github.com/cal-itp/data-analyses/issues/1240

In [1]:
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

In [2]:
# Notebook display settings: up to 100 columns, no row limit, untruncated
# cell contents, and floats formatted with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# Catalog entry for the operator profiles table; it is appended to RT_SCHED_GCS
# below to build the parquet path that gets read.
OPERATOR_FILE = GTFS_DATA_DICT.digest_tables.operator_profiles
# Catalog entry for the operator routes map table
# (not referenced again in the cells shown here).
OPERATOR_ROUTE = GTFS_DATA_DICT.digest_tables.operator_routes_map

In [4]:
# Service date used further down to keep only the rows recorded on this date.
analysis_date = "2024-11-13"

In [5]:
# Load the full operator profiles table.
operator_profiles_path = f"{RT_SCHED_GCS}{OPERATOR_FILE}.parquet"
operator_df = pd.read_parquet(operator_profiles_path)

# Keep one row per feed `name`, preferring the most recent service_date.
# Deduplicating on name (instead of schedule_gtfs_dataset_key) keeps only the
# latest LA Metro rows and avoids extraneous rows from when its keys changed.
operator_df = (
    operator_df.sort_values(by=["service_date", "name"], ascending=[False, True])
    .drop_duplicates(subset=["name"], keep="first")
    .reset_index(drop=True)
)

In [6]:
OPERATOR_FILE

'digest/operator_profiles'

In [7]:
operator_df.columns

Index(['schedule_gtfs_dataset_key', 'vp_per_min_agency',
       'spatial_accuracy_agency', 'service_date', 'operator_n_routes',
       'operator_n_trips', 'operator_n_shapes', 'operator_n_stops',
       'operator_n_arrivals', 'operator_route_length_miles',
       'operator_arrivals_per_stop', 'n_downtown_local_routes',
       'n_local_routes', 'n_coverage_routes', 'n_rapid_routes',
       'n_express_routes', 'n_rail_routes', 'name',
       'organization_source_record_id', 'organization_name',
       'caltrans_district', 'counties_served', 'service_area_sq_miles',
       'hq_city', 'uza_name', 'service_area_pop', 'organization_type',
       'primary_uza', 'reporter_type'],
      dtype='object')

In [8]:
len(operator_df)

173

In [9]:
operator_df.name.nunique()

172

In [10]:
operator_df.organization_name.nunique()

157

### View examples of organizations that map to two or more names: 15 cases.


In [11]:
# Count the distinct feed names attached to each organization, per district.
orgs_agg = (
    operator_df.groupby(["caltrans_district", "organization_name"])["name"]
    .nunique()
    .reset_index()
)

In [12]:
# Keep organizations that map to more than one feed name.
multi_name_mask = orgs_agg["name"] > 1
orgs_agg2 = orgs_agg[multi_name_mask]

In [13]:
len(orgs_agg2)

15

In [14]:
# Plain Python list of the organization names that have more than one feed name.
orgs_with_2_names = orgs_agg2["organization_name"].unique().tolist()

In [15]:
operator_df.columns

Index(['schedule_gtfs_dataset_key', 'vp_per_min_agency',
       'spatial_accuracy_agency', 'service_date', 'operator_n_routes',
       'operator_n_trips', 'operator_n_shapes', 'operator_n_stops',
       'operator_n_arrivals', 'operator_route_length_miles',
       'operator_arrivals_per_stop', 'n_downtown_local_routes',
       'n_local_routes', 'n_coverage_routes', 'n_rapid_routes',
       'n_express_routes', 'n_rail_routes', 'name',
       'organization_source_record_id', 'organization_name',
       'caltrans_district', 'counties_served', 'service_area_sq_miles',
       'hq_city', 'uza_name', 'service_area_pop', 'organization_type',
       'primary_uza', 'reporter_type'],
      dtype='object')

### Filter out any orgs with 2+ names from the main df

In [16]:
type(orgs_with_2_names)

list

In [17]:
orgs_with_2_names

['Redwood Coast Transit Authority',
 'City of Roseville',
 'Tahoe Transportation District',
 'Mission Bay Transportation Management Agency',
 'Presidio Trust',
 'City of San Luis Obispo',
 'City of Downey',
 'City of Lawndale',
 'Los Angeles County Metropolitan Transportation Authority',
 'Basin Transit',
 'City of Beaumont',
 'Mountain Area Regional Transit Authority',
 'Palo Verde Valley Transit Agency',
 'Victor Valley Transit Authority',
 'Transit Joint Powers Authority for Merced County']

In [18]:
# Organizations with a single feed name need no further handling: keep them aside.
has_multiple_names = operator_df["organization_name"].isin(orgs_with_2_names)
operator_df2 = operator_df[~has_multiple_names].reset_index(drop=True)

### Filter out any names that are "out of date" for organizations with more than one name.

In [19]:
# Rows belonging to organizations that have more than one feed name.
in_multi_name_org = operator_df["organization_name"].isin(orgs_with_2_names)
orgs_with_2_names_df = operator_df[in_multi_name_org].reset_index(drop=True)

In [20]:
orgs_with_2_names_df.shape

(30, 29)

In [21]:
orgs_with_2_names_df.organization_name.nunique()

15

In [22]:
# Order by organization, newest service_date first within each organization.
orgs_with_2_names_df2 = orgs_with_2_names_df.sort_values(
    ["organization_name", "service_date"],
    ascending=[True, False],
)

In [23]:
# Cutoff: three months before the most recent service_date among the multi-name organizations.
comparison_date = orgs_with_2_names_df2["service_date"].max() - pd.DateOffset(months=3)

In [24]:
comparison_date

Timestamp('2024-08-13 00:00:00')

In [25]:
# Drop feed names whose latest service_date is older than the cutoff.
is_recent = orgs_with_2_names_df2["service_date"] >= comparison_date
orgs_with_2_names_df3 = orgs_with_2_names_df2.loc[is_recent]

In [26]:
orgs_with_2_names_df3[["organization_name", "name", "service_date"]]

Unnamed: 0,organization_name,name,service_date
0,Basin Transit,Basin Transit GMV Schedule,2024-11-13
12,Basin Transit,Morongo Basin Schedule,2024-11-13
3,City of Beaumont,Beaumont Transit Schedule,2024-11-13
24,City of Beaumont,Beaumont Pass Schedule,2024-10-16
26,City of Downey,DowneyLINK GMV Schedule,2024-09-18
8,City of Lawndale,Lawndale Beat GMV Schedule,2024-11-13
9,City of Lawndale,Lawndale Schedule,2024-11-13
16,City of Roseville,Roseville Schedule,2024-11-13
17,City of Roseville,Roseville Transit GMV Schedule,2024-11-13
18,City of San Luis Obispo,SLO Peak Transit Schedule,2024-11-13


### Keep only rows where `vp_per_min_agency` and `spatial_accuracy_agency` are both greater than 0 and the organization still has 2+ names

In [27]:
# Calculate how many names each organization still has after the recency filter.
# Note: this rebinds `orgs_with_2_names` from a list of names to a DataFrame
# with columns `organization_name` and `n_names`.
orgs_with_2_names = (
    orgs_with_2_names_df3.groupby(["organization_name"])["name"]
    .nunique()
    .rename("n_names")
    .reset_index()
)

In [28]:
orgs_with_2_names

Unnamed: 0,organization_name,n_names
0,Basin Transit,2
1,City of Beaumont,2
2,City of Downey,1
3,City of Lawndale,2
4,City of Roseville,2
5,City of San Luis Obispo,2
6,Los Angeles County Metropolitan Transportation Authority,2
7,Mission Bay Transportation Management Agency,1
8,Mountain Area Regional Transit Authority,2
9,Palo Verde Valley Transit Agency,2


In [29]:
# Attach the per-organization name count (`n_names`) to every row.
# This cell overwrites `orgs_with_2_names_df3`, so re-running it would merge onto
# a frame that already has `n_names`, producing `n_names_x` / `n_names_y` and
# breaking the `.n_names` filter in the next cell. Dropping any existing `n_names`
# first makes the cell safe to re-run; on the first run the drop is a no-op.
orgs_with_2_names_df3 = pd.merge(
    orgs_with_2_names_df3.drop(columns=["n_names"], errors="ignore"),
    orgs_with_2_names,
    on="organization_name",
    how="left",
)

In [30]:
# Keep rows where both metrics are positive and the organization still has
# more than one name after the recency filter.
positive_vp = orgs_with_2_names_df3["vp_per_min_agency"] > 0
positive_accuracy = orgs_with_2_names_df3["spatial_accuracy_agency"] > 0
still_multi_name = orgs_with_2_names_df3["n_names"] > 1
orgs_with_2_names_df4 = orgs_with_2_names_df3[
    positive_vp & positive_accuracy & still_multi_name
].reset_index(drop=True)

#### Filter out any rows whose `service_date` is not equal to `analysis_date`

In [31]:
#analysis_date = rt_dates.DATES.popitem()

In [32]:
#analysis_date = analysis_date[-1]

In [33]:
# Keep only the rows recorded on the analysis date.
on_analysis_date = orgs_with_2_names_df4["service_date"] == analysis_date
orgs_with_2_names_df5 = orgs_with_2_names_df4[on_analysis_date]

In [34]:
orgs_with_2_names_df5.organization_name.nunique()

9

In [35]:
orgs_with_2_names_df5.shape

(10, 30)

In [36]:
# For every organization left on the analysis date, show its feeds side by side:
# one column per feed, one row per attribute.
# The organization names are taken from `orgs_with_2_names_df5`, the same frame
# whose rows are displayed, so no organization is printed with an empty table.
for organization in orgs_with_2_names_df5.organization_name.unique():
    print(organization)
    filtered = orgs_with_2_names_df5.loc[
        orgs_with_2_names_df5.organization_name == organization
    ].T
    filtered = filtered.fillna(0)
    # Only an organization with exactly two feeds can be compared column against
    # column. Checking the shape explicitly replaces a bare `except:` that was used
    # as control flow and would also have hidden any unrelated error.
    if filtered.shape[1] == 2:
        filtered.columns = ["name1", "name2"]
        filtered["same"] = filtered.name1 == filtered.name2
        display(filtered.same.value_counts())
    display(filtered)

Basin Transit


Unnamed: 0,0
schedule_gtfs_dataset_key,a7f5522d7690161fc2be75857d7e2f79
vp_per_min_agency,1.35
spatial_accuracy_agency,90.85
service_date,2024-11-13 00:00:00
operator_n_routes,7.00
operator_n_trips,86.00
operator_n_shapes,16.00
operator_n_stops,170.00
operator_n_arrivals,2471.00
operator_route_length_miles,184.36


City of Beaumont


Unnamed: 0,1
schedule_gtfs_dataset_key,7f6e96ebfc0675bfda4f68874aa1466d
vp_per_min_agency,2.96
spatial_accuracy_agency,90.21
service_date,2024-11-13 00:00:00
operator_n_routes,8.00
operator_n_trips,125.00
operator_n_shapes,30.00
operator_n_stops,126.00
operator_n_arrivals,1530.00
operator_route_length_miles,126.03


City of Lawndale


Unnamed: 0,3
schedule_gtfs_dataset_key,09a703757d1ed14ca9580b1385e39315
vp_per_min_agency,2.65
spatial_accuracy_agency,100.00
service_date,2024-11-13 00:00:00
operator_n_routes,2.00
operator_n_trips,30.00
operator_n_shapes,2.00
operator_n_stops,43.00
operator_n_arrivals,698.00
operator_route_length_miles,17.92


City of San Luis Obispo


Unnamed: 0,4
schedule_gtfs_dataset_key,f4c3ea214214ee0d96f7646b3e9d69dc
vp_per_min_agency,1.99
spatial_accuracy_agency,99.02
service_date,2024-11-13 00:00:00
operator_n_routes,9.00
operator_n_trips,147.00
operator_n_shapes,10.00
operator_n_stops,157.00
operator_n_arrivals,3916.00
operator_route_length_miles,84.95


Los Angeles County Metropolitan Transportation Authority


False    17
True     13
Name: same, dtype: int64

Unnamed: 0,name1,name2,same
schedule_gtfs_dataset_key,0666caf3ec1ecc96b74f4477ee4bc939,2a0571758141f412b6a546fd70a65bf3,False
vp_per_min_agency,2.06,1.31,False
spatial_accuracy_agency,91.81,94.91,False
service_date,2024-11-13 00:00:00,2024-11-13 00:00:00,True
operator_n_routes,111.00,6.00,False
operator_n_trips,13658.00,1485.00,False
operator_n_shapes,602.00,12.00,False
operator_n_stops,12008.00,106.00,False
operator_n_arrivals,860096.00,25331.00,False
operator_route_length_miles,1955.29,121.40,False


Palo Verde Valley Transit Agency


Unnamed: 0,7
schedule_gtfs_dataset_key,4383eb1cca04093020f1583f57f32d9b
vp_per_min_agency,2.93
spatial_accuracy_agency,84.83
service_date,2024-11-13 00:00:00
operator_n_routes,5.00
operator_n_trips,54.00
operator_n_shapes,9.00
operator_n_stops,39.00
operator_n_arrivals,472.00
operator_route_length_miles,220.10


Presidio Trust


Unnamed: 0,8
schedule_gtfs_dataset_key,85a39da903f4beb0b66a4ba6f16a35af
vp_per_min_agency,3.00
spatial_accuracy_agency,45.74
service_date,2024-11-13 00:00:00
operator_n_routes,2.00
operator_n_trips,56.00
operator_n_shapes,2.00
operator_n_stops,31.00
operator_n_arrivals,890.00
operator_route_length_miles,15.64


Tahoe Transportation District


Unnamed: 0,10
schedule_gtfs_dataset_key,c3499b856c717e5706299664fb1c5261
vp_per_min_agency,2.89
spatial_accuracy_agency,93.59
service_date,2024-11-13 00:00:00
operator_n_routes,4.00
operator_n_trips,121.00
operator_n_shapes,9.00
operator_n_stops,117.00
operator_n_arrivals,2544.00
operator_route_length_miles,64.68


Victor Valley Transit Authority


Unnamed: 0,11
schedule_gtfs_dataset_key,8eecb796518dafd3c1b971a99f8b8252
vp_per_min_agency,2.85
spatial_accuracy_agency,94.57
service_date,2024-11-13 00:00:00
operator_n_routes,33.00
operator_n_trips,859.00
operator_n_shapes,75.00
operator_n_stops,1062.00
operator_n_arrivals,20968.00
operator_route_length_miles,820.10


### Add column into crosswalk: `gtfs_funnel/crosswalk_gtfs_dataset_key_to_organization`

In [37]:
import sys

# Add ../gtfs_funnel to the module search path so its modules can be imported.
# NOTE(review): no import follows this in the cells shown here, and imports placed
# mid-notebook are easy to miss -- consider moving this to the top import cell
# or removing it if nothing depends on it.
sys.path.append("../gtfs_funnel")

In [38]:
# Service date passed to the scheduled-trips import in the next cell.
may_date = "2024-05-22"

In [39]:
# Pull the feed key, feed name and regional feed type for `may_date`.
trip_columns = ["gtfs_dataset_key", "name", "regional_feed_type"]
df = helpers.import_scheduled_trips(
    may_date,
    columns=trip_columns,
    get_pandas=True,
)
# Make sure the key column is called `gtfs_dataset_key`.
# NOTE(review): presumably the helper returns it as `schedule_gtfs_dataset_key` -- confirm.
df = df.rename(columns={"schedule_gtfs_dataset_key": "gtfs_dataset_key"})

In [40]:
df.shape

(168, 3)

#### Lots of missing values for `regional_feed_type` so this isn't very helpful.

## Try to deduplicate all in a notebook.

In [41]:
# Caltrans district used to filter the operator profiles when they are re-read below.
district = "08 - San Bernardino"

In [42]:
# Load the operator profiles for the selected district only.
district_filter = [[("caltrans_district", "==", district)]]
operator_df = pd.read_parquet(
    f"{RT_SCHED_GCS}{OPERATOR_FILE}.parquet",
    filters=district_filter,
)

# Keep one row per feed `name`, preferring the most recent service_date.
# Deduplicating on name (instead of schedule_gtfs_dataset_key) keeps only the
# latest LA Metro rows and avoids extraneous rows from when its keys changed.
operator_df = (
    operator_df.sort_values(by=["service_date", "name"], ascending=[False, True])
    .drop_duplicates(subset=["name"], keep="first")
    .reset_index(drop=True)
)

In [43]:
# First find any organizations that map to more than one feed name.
orgs_agg = (
    operator_df.groupby(["caltrans_district", "organization_name"])["name"]
    .nunique()
    .reset_index()
)
orgs_agg2 = orgs_agg[orgs_agg["name"] > 1]
orgs_with_2_names = orgs_agg2["organization_name"].unique().tolist()

In [44]:
# Set the multi-name organizations aside: keep only single-name organizations here
# so the others can be handled separately.
has_multiple_names = operator_df["organization_name"].isin(orgs_with_2_names)
operator_df2 = operator_df[~has_multiple_names].reset_index(drop=True)

In [45]:
# Put the organizations with more than one feed name in their own df.
in_multi_name_org = operator_df["organization_name"].isin(orgs_with_2_names)
orgs_with_2_names_df = operator_df[in_multi_name_org].reset_index(drop=True)

In [46]:
# Keep only records that are from the past 3 months.
# The reference date is taken from this section's own `orgs_with_2_names_df`.
# It previously read `orgs_with_2_names_df2`, which at this point is still the frame
# left over from the statewide section above (it is not rebuilt for this section
# until a later cell), so the cutoff depended on stale notebook state.
three_month_reference = orgs_with_2_names_df["service_date"].max() - pd.DateOffset(
    months=3
)

In [47]:
# Drop feed names whose latest service_date is older than the three-month cutoff.
is_recent = orgs_with_2_names_df["service_date"] >= three_month_reference
orgs_with_2_names_df = orgs_with_2_names_df.loc[is_recent]

In [49]:
# Count how many names each organization still has after the recency filter.
# The following cells use `n_names` together with `vp_per_min_agency` and
# `spatial_accuracy_agency` to filter the rows.
orgs_agg = (
    orgs_with_2_names_df.groupby(["organization_name"])["name"]
    .nunique()
    .rename("n_names")
    .reset_index()
)

In [50]:
# Attach the per-organization name count (`n_names`) to every row.
# This cell overwrites `orgs_with_2_names_df`, so re-running it would merge onto a
# frame that already has `n_names`, producing `n_names_x` / `n_names_y` and breaking
# the `.n_names` filter in the next cell. Dropping any existing `n_names` first
# makes the cell safe to re-run; on the first run the drop is a no-op.
orgs_with_2_names_df = pd.merge(
    orgs_with_2_names_df.drop(columns=["n_names"], errors="ignore"),
    orgs_agg,
    on="organization_name",
    how="left",
)

In [51]:
# Keep rows where both metrics are positive and the organization still has
# more than one name after the recency filter.
positive_vp = orgs_with_2_names_df["vp_per_min_agency"] > 0
positive_accuracy = orgs_with_2_names_df["spatial_accuracy_agency"] > 0
still_multi_name = orgs_with_2_names_df["n_names"] > 1
orgs_with_2_names_df2 = orgs_with_2_names_df[
    positive_vp & positive_accuracy & still_multi_name
].reset_index(drop=True)

In [82]:
# Most recent service date in this district's data.
# Taken from `operator_df` rather than `operator_df2`: `operator_df2` holds only the
# single-name organizations and is reassigned by the final concat cell, so the date
# would depend on which organizations happen to have one name and on whether that
# cell had already been run.
service_date = operator_df.service_date.max()

In [83]:
# Keep only the rows recorded on the most recent service date.
on_latest_date = orgs_with_2_names_df2["service_date"] == service_date
orgs_with_2_names_df3 = orgs_with_2_names_df2[on_latest_date]

In [85]:
# Organizations that still have at least one row after all the filters above.
final_names = orgs_with_2_names_df3["organization_name"].unique().tolist()

In [86]:
# Keep only the organizations that were not resolved by the filters above; the
# resolved ones are represented by `orgs_with_2_names_df3`.
is_resolved = orgs_with_2_names_df["organization_name"].isin(final_names)
orgs_with_2_names_df = orgs_with_2_names_df[~is_resolved]

In [88]:
# Stack the rows of the unresolved organizations with the rows kept for the resolved ones.
final = pd.concat([orgs_with_2_names_df, orgs_with_2_names_df3])

In [91]:
final.groupby(['organization_name']).agg({"name":"nunique"})

Unnamed: 0_level_0,name
organization_name,Unnamed: 1_level_1
Basin Transit,1
City of Beaumont,1
Mountain Area Regional Transit Authority,2
Palo Verde Valley Transit Agency,1
Victor Valley Transit Authority,1


In [92]:
# Add the processed multi-name organizations back to the single-name organizations.
# NOTE(review): this overwrites `operator_df2`, so re-running the cell appends `final`
# again; `final` also carries the helper column `n_names`, and the original row
# indexes are kept because `ignore_index` is not set.
operator_df2 = pd.concat([operator_df2, final])