# Investigate primary key issues in GTFS schedule data

October 26, 2022 / Laurie 

See [this Google Doc](https://docs.google.com/document/d/12OtxayJRUMWO9HrJS11c6iiuw4OHPCgJv01YtnioZxY/edit#) for more info.

In [63]:
import pandas as pd
import pandas_gbq
import base64
import datetime

# trips

First, ran `poetry run dbt run -s +dim_trips` to refresh data in my namespace. 

TLDR:

* Victor Valley GMV and Merced GMV are the only current feeds in the v2 pipeline that have duplicate `trip_id` values within individual feeds.
* For Merced GMV:
   * They are all cases where the key is duplicated but the data is actually different (i.e., worst case scenario).
   * 2/412 (0.5%) trips in their feed are affected. Same `trip_id`, different `route_id`, `shape_id`, and `block_id` values. 
* For Victor Valley GMV: 
   * At the start of v2 data availability (9/15/22), there was a mixture of cases where the key was duplicated with identical and different data (i.e., there are trip IDs that appear at least 3 times: 2 identical duplicates + another row with the same key but different data.) At that time, 46/1307 (3.5%) trips were duplicates of some kind, with most being full duplicates where the entire row was replicated exactly.
   * In the most recent feed (10/25/22), there are **only** full duplicates where the entire row is identical (this is the best case scenario, because these are safe to deduplicate). Incidence has decreased to 26/1838 (1.4%). 

In [26]:
# Re-run the query behind the failing dbt uniqueness test, then join the offending
# (base64_url, ts, trip_id) keys back onto the staging table so that every row sharing
# a duplicated key comes back with its full trip details.
# pandas_gbq.read_gbq (already imported at the top of the notebook) is called directly:
# pd.read_gbq merely forwards to it and is deprecated as of pandas 2.2.
raw_dup_trips  = pandas_gbq.read_gbq(
    """
    with validation_errors as (

        select
            base64_url, ts, trip_id
        from `cal-itp-data-infra-staging`.`laurie_staging`.`stg_gtfs_schedule__trips`
        group by base64_url, ts, trip_id
        having count(*) > 1

    ),
    
    erroring_trips as (
    
        SELECT 
          t2.*
        FROM validation_errors t1 
        LEFT JOIN `cal-itp-data-infra-staging`.`laurie_staging`.`stg_gtfs_schedule__trips` t2
        USING(trip_id, base64_url, ts)
    
    )

    select *
    from erroring_trips
    """, project_id = 'cal-itp-data-infra-staging')

In [114]:
# work on an independent (deep) copy so the raw query result stays untouched
dup_trips = raw_dup_trips.copy(deep = True)

In [115]:
# number of rows whose (base64_url, ts, trip_id) key is duplicated
dup_trips.shape[0]

1852

In [116]:
# columns that are empty in every row mess up the joins later, so discard them
dup_trips = dup_trips.dropna(axis = "columns", how = "all")

In [141]:
# distinct feeds (by base64-encoded URL) that contain duplicated trip keys
dup_trips['base64_url'].unique()

array(['aHR0cHM6Ly9vbnRpbWUudnZ0YS5vcmcvZ3Rmcw==',
       'aHR0cHM6Ly90aGVidXNsaXZlLmNvbS9ndGZz'], dtype=object)

In [118]:
# decode the URL-safe base64 feed key into a human-readable URL string
dup_trips['url'] = dup_trips['base64_url'].map(
    lambda encoded_url: base64.urlsafe_b64decode(encoded_url).decode('utf-8')
)

In [119]:
# the affected feeds again, this time as readable URLs
dup_trips['url'].unique()

array(['https://ontime.vvta.org/gtfs', 'https://thebuslive.com/gtfs'],
      dtype=object)

In [120]:
# To find rows that are identical in ALL values, remember every column present right now;
# these become the grouping / join keys in the cells below.
orig_trip_cols = dup_trips.columns.tolist()

In [121]:
# constant 1 per row, so summing it within a group gives the group's row count
dup_trips['dummy'] = 1
# notebooks get run out of order: make sure the counter never ends up among the grouping keys
orig_trip_cols = [col for col in orig_trip_cols if col != 'dummy']
# group on every original column (NaN kept as a key value); ct counts rows that are identical throughout
rows_per_identical_trip = dup_trips.groupby(by = orig_trip_cols, dropna = False)['dummy'].sum()
summarize_full_trip_dups = rows_per_identical_trip.reset_index(name = 'ct')

In [122]:
# keep only the value combinations that occur more than once verbatim, and flag them
is_repeated_row = summarize_full_trip_dups['ct'] > 1
entire_trip_dups = summarize_full_trip_dups[is_repeated_row].assign(full_dup = True)

In [126]:
# Left-join the full-dup flag onto every dup row, matching on all original columns;
# rows without an identical twin come back with NaN in `ct` and `full_dup`.
check_trip_entire_dups = pd.merge(
    dup_trips,
    entire_trip_dups,
    how = 'left',
    on = orig_trip_cols,
    suffixes = ['', '_ent_dup'],
)

In [128]:
# Turn the merged flag into a strict boolean: True where the row matched a full-row dup,
# False where the left join found no match (NaN).
# Comparing against True always produces a bool-dtype column. fillna(False) on the object
# column only became bool through pandas' implicit downcasting (deprecated in pandas 2.2),
# and the `~` / boolean-mask filters in the following cells need a real boolean column.
check_trip_entire_dups['full_dup'] = check_trip_entire_dups['full_dup'].eq(True)

In [129]:
# feeds that have dup rows which are NOT identical across every column
not_full_dup = ~check_trip_entire_dups['full_dup']
check_trip_entire_dups.loc[not_full_dup, 'url'].unique()

array(['https://ontime.vvta.org/gtfs', 'https://thebuslive.com/gtfs'],
      dtype=object)

In [130]:
# feeds that have dup rows which ARE identical across every column
is_full_dup = check_trip_entire_dups['full_dup']
check_trip_entire_dups.loc[is_full_dup, 'url'].unique()

array(['https://ontime.vvta.org/gtfs'], dtype=object)

In [132]:
# trip-id level view: one row per (url, trip_id, ts, full_dup) combination present in the data
# (the size lands in an unnamed column 0)
dup_flag_groups = check_trip_entire_dups.groupby(['url', 'trip_id', 'ts', 'full_dup'])
trip_id_entire_dups = dup_flag_groups.size().reset_index()

In [136]:
# per trip_id within a feed snapshot: is ANY instance a full dup, and are ALL of them?
trip_id_entire_dups = (
    trip_id_entire_dups
    .groupby(['url', 'trip_id', 'ts'])['full_dup']
    .agg(['any', 'all'])
    .reset_index()
    .rename(columns = {'any': 'any_full_dups', 'all': 'all_full_dups'})
)

In [137]:
# "any" and "all" disagree exactly when a trip_id has both full dups and differing rows
trip_id_entire_dups['mixed_cases'] = trip_id_entire_dups['any_full_dups'].ne(trip_id_entire_dups['all_full_dups'])

In [138]:
# which feeds contain trip_ids with a mix of full and partial dups?
has_mixed_cases = trip_id_entire_dups['mixed_cases']
trip_id_entire_dups.loc[has_mixed_cases, 'url'].unique()

array(['https://ontime.vvta.org/gtfs'], dtype=object)

In [176]:
# attach the trip-id level flags (any / all / mixed) to every individual dup row
check_trip_entire_dups = pd.merge(
    check_trip_entire_dups,
    trip_id_entire_dups,
    how = 'left',
    on = ['url', 'trip_id', 'ts'],
)

In [178]:
# typology of dups: count dup rows per feed snapshot and combination of dup-type flags
dup_type_keys = ['url', 'ts', 'mixed_cases', 'any_full_dups', 'all_full_dups']
entire_dup_trips_pivot = check_trip_entire_dups.groupby(dup_type_keys)['trip_id'].count().reset_index()

In [144]:
# Read in ALL trips (not just the duplicated ones) for the two affected feeds, to serve as
# the denominator when computing the share of duplicated trips.
# pandas_gbq.read_gbq (already imported at the top of the notebook) is called directly:
# pd.read_gbq merely forwards to it and is deprecated as of pandas 2.2.
all_vv_merced_trips_raw = pandas_gbq.read_gbq(
    """
    select *
    from `cal-itp-data-infra-staging`.`laurie_staging`.`stg_gtfs_schedule__trips`
    where base64_url = 'aHR0cHM6Ly9vbnRpbWUudnZ0YS5vcmcvZ3Rmcw==' or base64_url = 'aHR0cHM6Ly90aGVidXNsaXZlLmNvbS9ndGZz'
    """, project_id = 'cal-itp-data-infra-staging')

In [145]:
# work on an independent (deep) copy so the raw query result stays untouched
all_vv_merced_trips = all_vv_merced_trips_raw.copy(deep = True)

In [146]:
# decode the URL-safe base64 feed key into a human-readable URL string
all_vv_merced_trips['url'] = all_vv_merced_trips['base64_url'].map(
    lambda encoded_url: base64.urlsafe_b64decode(encoded_url).decode('utf-8')
)

In [149]:
# number of trip rows per feed snapshot (url, ts)
trip_rows_per_snapshot = all_vv_merced_trips.groupby(['url', 'ts'])['trip_id'].count()
num_trips = trip_rows_per_snapshot.reset_index(name = 'num_trips')

In [179]:
# Spread the dup-type counts into one column per (mixed_cases, any_full_dups, all_full_dups)
# combination, with one row per feed snapshot (url, ts).
# The three possible combinations are listed explicitly and the pivot is reindexed to them,
# so the result always has the same three count columns in the same order -- even when one
# dup type does not occur in the data. The positional column rename in the next cell
# depends on that layout and would otherwise fail with a length mismatch.
dup_type_flag_combos = pd.MultiIndex.from_tuples(
    [
        (False, False, False),  # no instance of the trip_id is a full-row dup
        (False, True, True),    # every instance of the trip_id is a full-row dup
        (True, True, False),    # mix of full-row dups and rows with differing data
    ],
    names = ['mixed_cases', 'any_full_dups', 'all_full_dups'],
)
entire_dup_trips_pivot = (
    entire_dup_trips_pivot
    .pivot(index = ['url', 'ts'], columns = ['mixed_cases', 'any_full_dups', 'all_full_dups'], values = 'trip_id')
    .reindex(columns = dup_type_flag_combos)
    .reset_index()
)

In [182]:
# Flatten the pivoted header into plain names. The assignment is positional:
# the two key columns first, then the three count columns in the order the pivot produced them.
key_col_names = ['url', 'ts']
count_col_names = ['ct_no_full_dups', 'ct_all_full_dups', 'ct_mix_full_and_partial_dups']
entire_dup_trips_pivot.columns = key_col_names + count_col_names

In [183]:
# total dup rows per snapshot; a missing count (NaN) for a dup type contributes 0
dup_count_cols = ['ct_no_full_dups', 'ct_all_full_dups', 'ct_mix_full_and_partial_dups']
entire_dup_trips_pivot['tot_dups'] = entire_dup_trips_pivot[dup_count_cols].sum(axis = 1)

In [184]:
# outer join so snapshots present on only one side are kept alongside their total trip counts
dup_trips_summary = pd.merge(
    entire_dup_trips_pivot,
    num_trips,
    how = 'outer',
    on = ['url', 'ts'],
)

In [185]:
# share of trip rows in each snapshot that are duplicates, rounded to 4 decimal places
dup_share = dup_trips_summary['tot_dups'] / dup_trips_summary['num_trips']
dup_trips_summary['dup_prop'] = dup_share.round(4)

In [190]:
# inspect column dtypes (the date filter in a later cell uses the `.dt` accessor on `ts`)
dup_trips_summary.dtypes

url                                          object
ts                              datetime64[ns, UTC]
ct_no_full_dups                             float64
ct_all_full_dups                            float64
ct_mix_full_and_partial_dups                float64
tot_dups                                    float64
num_trips                                     int64
dup_prop                                    float64
dtype: object

In [197]:
# show the summary for the first (9/15/22) and the most recent (10/25/22) snapshot dates
snapshot_dates = [datetime.date(2022, 9, 15), datetime.date(2022, 10, 25)]
dup_trips_summary[dup_trips_summary['ts'].dt.date.isin(snapshot_dates)]

Unnamed: 0,url,ts,ct_no_full_dups,ct_all_full_dups,ct_mix_full_and_partial_dups,tot_dups,num_trips,dup_prop
0,https://ontime.vvta.org/gtfs,2022-09-15 14:59:54.274779+00:00,4.0,36.0,6.0,46.0,1307,0.0352
47,https://ontime.vvta.org/gtfs,2022-10-25 03:00:15.900936+00:00,,26.0,,26.0,1838,0.0141
48,https://thebuslive.com/gtfs,2022-09-15 14:59:54.274779+00:00,2.0,,,2.0,412,0.0049
95,https://thebuslive.com/gtfs,2022-10-25 03:00:15.900936+00:00,2.0,,,2.0,412,0.0049


In [199]:
# look at the actual duplicated rows for the Merced feed in the 10/25/22 snapshot
is_merced_feed = dup_trips['url'] == 'https://thebuslive.com/gtfs'
is_latest_snapshot = dup_trips['ts'].dt.date == datetime.date(2022, 10, 25)
dup_trips[is_merced_feed & is_latest_snapshot]

Unnamed: 0,base64_url,ts,route_id,service_id,trip_id,shape_id,trip_headsign,direction_id,block_id,url,dummy
1009,aHR0cHM6Ly90aGVidXNsaXZlLmNvbS9ndGZz,2022-10-25 03:00:15.900936+00:00,953,2,388,4388,Loop,0,316189,https://thebuslive.com/gtfs,1
1010,aHR0cHM6Ly90aGVidXNsaXZlLmNvbS9ndGZz,2022-10-25 03:00:15.900936+00:00,957,2,388,6249,Loop,0,316188,https://thebuslive.com/gtfs,1
