# Merge v1 and v2 to see which trip_ids are present

* Start with Oct, then do the same for Nov (the schedule data doesn't appear to change much between these two dates).

In [1]:
#import os
#os.environ["CALITP_BQ_MAX_BYTES"] = str(1_000_000_000_000)

import dask.dataframe as dd
import pandas as pd

#from calitp.tables import tbls
#from siuba import *

#from shared_utils import rt_utils, gtfs_utils
from shared_utils import rt_dates, geography_utils

# Local folders holding the cached v1 / v2 parquet exports read below.
# (Plain strings: there is nothing to interpolate, so no f-prefix is needed.)
V1_PATH = "./v1_data/"
V2_PATH = "./v2_data/"

# Service date used for every October comparison in this notebook.
oct_date = rt_dates.DATES["oct2022"]



In [None]:
# NOTE: disabled on purpose. Everything below is wrapped in a string literal, so
# nothing here runs. It is the one-off export that wrote the v1 trips parquets
# into V1_PATH. Re-enabling it requires the commented-out imports at the top
# (`tbls`, siuba, `gtfs_utils`) and a `nov_date` variable, which is not defined
# in the cells shown in this notebook.
'''
ALL_ITP_IDS = (tbls.gtfs_schedule.agency()
               >> distinct(_.calitp_itp_id)
               >> filter(_.calitp_itp_id != 200)
               >> collect()
              ).calitp_itp_id.tolist()

trips_oct = gtfs_utils.get_trips(
    selected_date = oct_date,
    itp_id_list = ALL_ITP_IDS,
    get_df = True
)

trips_oct.to_parquet(f"{V1_PATH}trips_{oct_date}.parquet")

trips_nov = gtfs_utils.get_trips(
    selected_date = nov_date,
    itp_id_list = ALL_ITP_IDS,
    get_df = True
)

trips_nov.to_parquet(f"{V1_PATH}trips_{nov_date}.parquet")
'''

In [2]:
# Load the v2 daily_feeds_orgs table for the October service date.
daily_feeds_file = f"{V2_PATH}daily_feeds_orgs_{oct_date}.parquet"
daily_feeds = pd.read_parquet(daily_feeds_file)

In [3]:
# Columns that identify a feed in each warehouse version.
v1_id = ["calitp_itp_id", "calitp_url_number"]
v2_id = ["feed_key", "name"]

def import_v1_v2_trips(date: str) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """
    Load the cached v1 and v2 trips tables for one service date.

    Parameters
    ----------
    date : str
        Date string embedded in the parquet file names
        (`trips_{date}.parquet`, `daily_feeds_orgs_{date}.parquet`).

    Returns
    -------
    (df1, df2)
        df1: v1 trips with `v1_id` plus trip_id, route_id, shape_id,
        service_hours.
        df2: v2 trips with feed_key plus the same trip columns, joined to the
        feed table to add `name` and `regional_feed_type`.

    Notes
    -----
    The feed table is filtered (regional precursor / combined regional feeds
    and rows with `is_future` set are removed) *before* an OUTER join. As a
    result, trips whose feed was filtered out are still returned, but with
    `name` and `regional_feed_type` missing; feeds that have no trips show up
    as rows with missing trip columns.
    """
    keep_cols = [
        "trip_id", "route_id", "shape_id",
        "service_hours"
    ]

    df1 = pd.read_parquet(f"{V1_PATH}trips_{date}.parquet")
    df1 = df1[v1_id + keep_cols]

    df2 = pd.read_parquet(f"{V2_PATH}trips_{date}.parquet")
    feed_to_name = pd.read_parquet(
        f"{V2_PATH}daily_feeds_orgs_{date}.parquet")

    # The v2 export carries `_x` suffixes on these columns; normalise them so
    # both versions share the same column names.
    df2 = (df2.rename(columns = {
            "feed_key_x": "feed_key",
            "trip_id_x": "trip_id",
            "route_id_x": "route_id"})
           [["feed_key"] + keep_cols]
          )

    exclude_types = ["Regional Precursor Feed", "Combined Regional Feed"]
    keep_feed = (
        (~feed_to_name.regional_feed_type.isin(exclude_types)) &
        (feed_to_name.is_future == False)
    )

    # Select the join columns first, then de-duplicate: rows that differ only
    # in columns we do not keep would otherwise survive drop_duplicates() and
    # make the m:1 validation below fail.
    feed_to_name2 = (
        feed_to_name.loc[keep_feed, v2_id + ["regional_feed_type"]]
        .drop_duplicates()
    )

    df2 = pd.merge(
        df2,
        feed_to_name2,
        on = "feed_key",
        how = "outer",
        validate = "m:1"
    )

    return df1, df2

In [4]:
# oct1: v1 trips, oct2: v2 trips (with feed name attached) for the October date.
oct1, oct2 = import_v1_v2_trips(date=oct_date)

In [5]:
# Total service hours in v1 vs v2 before any de-duplication.
(oct1["service_hours"].sum(), oct2["service_hours"].sum())

(113752.53305555556, 124056.98027777777)

In [6]:
trip_cols = ["trip_id", "route_id", "shape_id"]

# Each side is reduced to one row per (trip_id, route_id, shape_id) so the 1:1
# validation holds. Note that service_hours (a float) is part of the join key,
# so a trip only lands in "both" when the hours agree exactly in v1 and v2.
oct_merge = pd.merge(
    oct1.drop_duplicates(subset=trip_cols),
    oct2.drop_duplicates(subset=trip_cols),
    on = trip_cols + ["service_hours"],
    how = "outer",
    validate = "1:1",
    indicator = True
)

## Found in v1 and v2

Majority is in both, good!

Need to figure out which feeds to keep in v2 to get at the very small number of missing trips in v1.

In [7]:
# How many trips matched in both versions vs. only one of them.
oct_merge["_merge"].value_counts()

both          112736
right_only     28880
left_only       4252
Name: _merge, dtype: int64

In [8]:
# v1 service hours counting each (itp_id, url_number, trip_id) once.
oct1.drop_duplicates(subset=[*v1_id, "trip_id"])["service_hours"].sum()

113752.53305555556

In [9]:
# url_number is not kept downstream, so the hours we actually want count each
# (calitp_itp_id, trip_id) pair once.
operator_trip_key = ["calitp_itp_id", "trip_id"]
oct1.drop_duplicates(subset=operator_trip_key)["service_hours"].sum()

94786.39361111113

In [None]:
# Beaumont?

In [10]:
# Service hours split by merge outcome (left_only = v1 only, right_only = v2 only).
hours_by_merge = oct_merge.groupby("_merge")[["service_hours"]].sum()
hours_by_merge.reset_index()

Unnamed: 0,_merge,service_hours
0,left_only,3097.896667
1,right_only,21332.336111
2,both,87973.435278


## Issue 1: Metrolink

Once we find Metrolink, there's the same number of observations in both v1 and v2.

As long as the Metrolink fix can be applied in dbt or through `gtfs_utils_v2`, we'll be good.

In [11]:
# calitp_itp_id 323 is the v1 identifier this notebook treats as Metrolink.
metrolink1 = oct1[oct1.calitp_itp_id == 323]

# `name` is missing for some v2 rows (oct2 comes from an outer merge).
# na=False makes those rows evaluate to False directly, instead of producing
# an object-dtype mask with NaN that then has to be patched with `& notna()`.
metrolink2 = oct2[oct2["name"].str.contains("Metrolink", na=False)]

In [12]:
# Show which v2 feed the Metrolink rows belong to (first row is representative).
metrolink_feed_key = metrolink2["feed_key"].iloc[0]
metrolink_name = metrolink2["name"].iloc[0]

print(f"feed key: {metrolink_feed_key}")
print(f"name: {metrolink_name}")

feed key: 55dfff37f64595b6b8ca3fd3e5499d0d
name: Metrolink Schedule


In [13]:
# Metrolink service hours: v1 vs v2.
(metrolink1["service_hours"].sum(), metrolink2["service_hours"].sum())

(225.58333333333334, 225.58333333333334)

## Issue 2: Non-Metrolink

In [14]:
# Unmatched trips, leaving out Metrolink on either side (v1 by itp_id, v2 by name).
is_unmatched = oct_merge["_merge"] != "both"
is_metrolink = (
    (oct_merge["calitp_itp_id"] == 323)
    | (oct_merge["name"] == "Metrolink Schedule")
)

oct_issues = oct_merge[is_unmatched & ~is_metrolink]

In [15]:
# Remaining unmatched trips once Metrolink is set aside.
oct_issues["_merge"].value_counts()

right_only    28742
left_only      4114
both              0
Name: _merge, dtype: int64

In [16]:
def separate_v1_v2(merged_df):
    """
    Split a merged v1/v2 frame back into its one-sided rows.

    `merged_df` must carry the `_merge` indicator column. Returns a pair
    `(v1, v2)`: rows flagged "left_only" with the v1 feed identifiers, and
    rows flagged "right_only" with the v2 feed identifiers. Both keep
    trip_id, route_id, shape_id and service_hours.
    """
    trip_fields = ["trip_id", "route_id", "shape_id", "service_hours"]

    only_in_v1 = merged_df._merge == "left_only"
    only_in_v2 = merged_df._merge == "right_only"

    v1 = merged_df.loc[only_in_v1, v1_id + trip_fields]
    v2 = merged_df.loc[only_in_v2, v2_id + trip_fields]

    return v1, v2

In [17]:
# v1: trips found only in v1; v2: trips found only in v2 (Metrolink excluded).
v1, v2 = separate_v1_v2(merged_df=oct_issues)

In [21]:
# Inner join, so every surviving row is tagged "both": the count is simply the
# number of v1/v2 row pairs that share route_id and service_hours.
route_hours_pairs = v1.merge(
    v2,
    how = "inner",
    on = ["route_id", "service_hours"],
    indicator = True,
)
route_hours_pairs["_merge"].value_counts()

both          4
left_only     0
right_only    0
Name: _merge, dtype: int64

In [22]:
# Inner join, so every surviving row is tagged "both": the count is the number
# of v1/v2 row pairs that share a route_id.
route_pairs = v1.merge(
    v2,
    how = "inner",
    on = ["route_id"],
    indicator = True,
)
route_pairs["_merge"].value_counts()

both          30
left_only      0
right_only     0
Name: _merge, dtype: int64

In [23]:
# Inner join, so every surviving row is tagged "both": the count is the number
# of v1/v2 row pairs that share shape_id and route_id.
shape_route_pairs = v1.merge(
    v2,
    how = "inner",
    on = ["shape_id", "route_id"],
    indicator = True,
)
shape_route_pairs["_merge"].value_counts()

both          8
left_only     0
right_only    0
Name: _merge, dtype: int64

In [24]:
# Per-v1-feed summary of the v1-only trips: distinct shapes/routes/trips and
# total service hours, ordered from the smallest feeds up.
v1_by_feed = geography_utils.aggregate_by_geography(
    v1,
    group_cols = v1_id,
    nunique_cols = ["shape_id", "route_id", "trip_id"],
    sum_cols = ["service_hours"],
    rename_cols = True,
)

v1_agg = (
    v1_by_feed
    .sort_values(
        by = ["trip_id_nunique", "route_id_nunique", "service_hours_sum"],
        ascending = True,
    )
    .reset_index(drop = True)
)

In [33]:
# Per-v2-feed summary (feed_key + name) of the v2-only trips: distinct
# shapes/routes/trips and total service hours, ordered from the smallest up.
v2_by_feed_name = geography_utils.aggregate_by_geography(
    v2,
    group_cols = v2_id,
    nunique_cols = ["shape_id", "route_id", "trip_id"],
    sum_cols = ["service_hours"],
    rename_cols = True,
)

v2_agg = (
    v2_by_feed_name
    .sort_values(
        by = ["trip_id_nunique", "route_id_nunique", "service_hours_sum"],
        ascending = True,
    )
    .reset_index(drop = True)
)

In [34]:
# Service hours carried by the v1-only trips.
v1_agg["service_hours_sum"].sum()

2872.3133333333335

In [35]:
# Service hours carried by the v2-only trips, grouped by feed_key + name.
v2_agg["service_hours_sum"].sum()

3.75

In [37]:
# Same v2-only aggregation, but grouped on feed_key alone (no `name`),
# to compare the total hours against the feed_key + name grouping.
v2_by_feed_key = geography_utils.aggregate_by_geography(
    v2,
    group_cols = ["feed_key"],
    nunique_cols = ["shape_id", "route_id", "trip_id"],
    sum_cols = ["service_hours"],
    rename_cols = True,
)

(
    v2_by_feed_key
    .sort_values(
        by = ["trip_id_nunique", "route_id_nunique", "service_hours_sum"],
        ascending = True,
    )
    .reset_index(drop = True)
    ["service_hours_sum"]
    .sum()
)

21106.75277777778

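Something happens once `name` is included in the grouping.

The ~21k extra hours drop down to under 4 (3.75)??

Possible cause (to confirm): `oct2` is built with an outer merge against a *filtered* feed table, so trips whose feed was filtered out have a missing `name`. If `aggregate_by_geography` uses a plain pandas `groupby`, rows with a missing group key are dropped by default, which would remove those hours.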

In [28]:
# Every (itp_id, url_number, merge outcome) combination present in the full merge.
v1_unique_feeds = oct_merge.loc[:, v1_id + ["_merge"]].drop_duplicates()

In [29]:
# Attach the v1-only aggregates to the list of v1 feeds; `unmerged` == "both"
# marks feeds that have at least one trip missing from v2.
v1_unmerged = v1_unique_feeds.merge(
    v1_agg,
    how = "outer",
    on = v1_id,
    validate = "m:1",
    indicator = "unmerged",
)

v1_unmerged["unmerged"].value_counts()

left_only     197
both           14
right_only      0
Name: unmerged, dtype: int64

In [30]:
# v1 feeds that have trips missing from v2.
v1_unmerged.loc[v1_unmerged["unmerged"] == "both"]

Unnamed: 0,calitp_itp_id,calitp_url_number,_merge,service_hours_sum,route_id_nunique,shape_id_nunique,trip_id_nunique,unmerged
8,247.0,1.0,left_only,36.597222,8.0,23.0,157.0,both
9,247.0,1.0,both,36.597222,8.0,23.0,157.0,both
19,34.0,0.0,left_only,4.416667,1.0,2.0,4.0,both
20,34.0,0.0,both,4.416667,1.0,2.0,4.0,both
50,367.0,0.0,left_only,45.004722,3.0,4.0,65.0,both
55,346.0,0.0,left_only,147.333333,10.0,39.0,137.0,both
87,280.0,0.0,left_only,83.166667,6.0,33.0,141.0,both
108,127.0,1.0,left_only,293.483333,10.0,21.0,206.0,both
126,127.0,2.0,left_only,33.0,4.0,13.0,61.0,both
133,117.0,0.0,left_only,39.416667,7.0,15.0,40.0,both


Try another approach:

In v1 we always narrow down to operator-trips (one row per `calitp_itp_id` + `trip_id`).

Need to figure out how to do the same in v2.

In [38]:
# v1 hours at operator-trip grain: one row per (calitp_itp_id, trip_id).
v1_operator_trips = oct1.drop_duplicates(subset=["calitp_itp_id", "trip_id"])
v1_operator_trips["service_hours"].sum()

94786.39361111113

In [39]:
# v2 attempt at the same grain, using feed `name` in place of the itp_id.
v2_name_trips = oct2.drop_duplicates(subset=["name", "trip_id"])
v2_name_trips["service_hours"].sum()

123796.93027777778