# Testing merges 

In [None]:
import numpy as np
import pandas as pd
import math
from siuba import *
from calitp import *
import intake
import data_prep

import shared_utils
from plotnine import *
import altair as alt
import altair_saver
from shared_utils import altair_utils 

# Notebook display settings: render floats with two decimals and let pandas
# print up to 250 rows before truncating a frame.
pd.options.display.float_format = "{:.2f}".format
pd.options.display.max_rows = 250

In [None]:
# Load the three input frames through the project's data_prep helpers.
# df_5311 is the Black Cat side of the merges further down; vehicles and
# organizations are joined to each other on `ntd_id`.
# NOTE(review): presumably vehicles is the NTD vehicle inventory and
# organizations carries the GTFS status -- confirm in data_prep.
df_5311 = data_prep.load_grantprojects()
vehicles = data_prep.load_cleaned_vehiclesdata()
organizations = data_prep.load_cleaned_organizations_data()

# Merge NTD with GTFS  --> vehicle_gtfs
<b> NOTE</b>:
I had to manually add NTD ID to the following agencies in the original CSV file called "cleaned organizations.csv"

* Butte County Association of Governments	90208
* City of Dixon	9402-91041
* City of Fairfield	90092
* City of Ridgecrest	9R02-91006
* City of Wasco	9R02-99426
* Glenn County	9R02-91088
* Mariposa County	9R02-91082
* Modoc Transportation Authority 	9R02-91008
* Palo Verde Valley Transit Agency	9R02-99454
* San Benito County Local Transportation Authority	9R02-91009
* San Joaquin Regional Transit District	90012
* Tuolumne County Transit Agency	9402-035


In [None]:
#trying the .loc way (kept for reference; the NTD IDs were added to the CSV by hand instead)
#organizations.loc[(organizations['name'] == 'Butte County Association of Governments'), "ntd_id"] = "90208"
#vehicles.loc[(vehicles['agency'] == 'Butte County Association of Governments'), "ntd_id"] = "90208"

#organizations.loc[(organizations['name'] == 'City of Fairfield'), "ntd_id"] = "90092"
#vehicles.loc[(vehicles['agency'] == 'City of Fairfield, California, dba: Fairfield and Suisun Transit'), "ntd_id"] = "90092"

In [None]:
#merging the 2 datasets together
vehicles_gtfs = pd.merge(vehicles, organizations,  how='left', on=['ntd_id'], indicator=True)

In [None]:
vehicle_agencies = vehicles_gtfs['agency'].drop_duplicates().tolist()

In [None]:
vehicles_gtfs['_merge'].value_counts()

In [None]:
vehicles_gtfs = vehicles_gtfs.drop(columns = ['_merge'])

# Merging -> Vehicle & GTFS with Black Cat.

In [None]:
#left merge, Black Cat on the left
Test1 = pd.merge(df_5311, vehicles_gtfs,  how='left', left_on=['organization_name'], right_on=['name'], indicator=True)

In [None]:
len(Test1)

In [None]:
# Counting left only and both
Test1['_merge'].value_counts()

In [None]:
#finding the agencies that are in left only.
Left_only = Test1[(Test1._merge.str.contains("left_only", case= False))] 
Left_orgs = Left_only['organization_name'].drop_duplicates().tolist()

In [None]:
# Filter out left only matches
m2 = Test1[~Test1.organization_name.isin(Left_orgs)]

In [None]:
#check that filter worked ok
m2.shape

In [None]:
m2.sample(3)

### Failed Matches, crosswalk manually

In [None]:
#making a data frame with only failed merges out out of original Black Cat
fail = df_5311[df_5311.organization_name.isin(Left_orgs)]

In [None]:
fail.shape

In [None]:
# Crosswalk dictionary for replacing organization names.
# Keys are organization_name values as they are spelled in Black Cat; values are
# the spellings to merge on in the second pass (against the `agency` column).
# Several keys end in a trailing space: that whitespace is part of the key and
# must be kept, since the replacement only fires on an exact match.
# NOTE(review): a few entries map a name to itself (e.g. 'Kern Regional
# Transit'); they change nothing and look like placeholders -- confirm.
crosswalk = {'City of Chowchilla ': 'City of Chowchilla, dba: Chowchilla Area Transit ',
 'City of Dinuba ':  'City of Dinuba',
 'Modoc Transportation Agency': 'Modoc Transportation Agency',
 'Butte County Association of Governments/ Butte Regional Transit': 'Butte County Association of Governments',
 'Calaveras County Public Works':  'Calaveras Transit Agency',
 'City of Escalon ':  'City of Escalon, dba: eTrans',
 'County of Mariposa':  'Mariposa County Transit, dba: Mari-Go',
 'County of Shasta Department of Public Works':  'County of Shasta Department of Public Works',
 'County of Siskiyou': 'County of Siskiyou, dba: Siskiyou County Transit',
 'County of Tulare': 'Tulare County Area Transit',
 'Eureka Transit Service':  'City of Eureka, dba: Eureka Transit Service',
 'Kern Regional Transit':  'Kern Regional Transit',
 'Livermore Amador Valley Transit Authority':  'Livermore / Amador Valley Transit Authority',
 'Placer County Public Works (TART & PCT)': 'County of Placer, dba: Placer County Department of Public Works',
 'Plumas County Transportation Commission': 'Plumas County Transportation Commission',
 'San Luis Obispo Regional Transit Authority':  'San Luis Obispo Regional Transit Authority',
 'Sonoma County Transit':  'County of Sonoma, dba: Sonoma County Transit',
 'Sunline Transit Agency':  'SunLine Transit Agency',
 'Tehama County Transit Agency': 'Tehama County',
 'Trinity County Department of Transportation ':  'Trinity County',
 'Tuolumne County Transit Agency (TCTA)':  'Tuolumne County Transit',
 'Amador Transit':  'Amador Regional Transit System',
 'City of Corcoran - Corcoran Area Transit':  'City of Corcoran, dba: Corcoran Area Transit',
 'Yosemite Area Regional Transportation System ':  'Yosemite Area Regional Transportation System',
 'County Connection (Central Contra Costa Transit Authority)': 'Central Contra Costa Transit Authority, dba: COUNTY CONNECTION',
 'Calaveras Transit Agency ': 'Calaveras Transit Agency'}

In [None]:
# Rename the failed-merge organizations using the crosswalk.
# `fail` is a boolean-filtered slice of df_5311, so calling
# `.replace(..., inplace=True)` on a column pulled out of it is a chained
# assignment: pandas raises SettingWithCopyWarning, and with copy-on-write
# enabled the change never reaches `fail`. Building a new frame with the
# replaced column is unambiguous and leaves df_5311 untouched.
fail = fail.assign(organization_name=fail["organization_name"].replace(crosswalk))

In [None]:
#Merging the failed organizations to vehicles 
Test2 = pd.merge(fail, vehicles_gtfs,  how='left', left_on=['organization_name'], right_on=['agency'], indicator=True)

In [None]:
Test2['_merge'].value_counts()

In [None]:
#finding the agencies that are in left only...make sure these are the ones we aren't sure have any matches.
Left_only = Test2[(Test2._merge.str.contains("left_only", case= False))]
#find failed agencies 

Left_only_orgs= Left_only['organization_name'].drop_duplicates().tolist()
Left_only_orgs

In [None]:
# Hand-written list of the organizations that are still unmatched, with Klamath
# left off because it has GTFS. Original intent as stated by the author: keep
# only agencies that do not appear in GTFS & NTD.
# NOTE(review): presumably copied from the Left_only_orgs output above; the
# strings must match organization_name exactly -- confirm after data refreshes.
L_only_2_orgs = ['County of Los Angeles - Department of Public Works',
 'County of Nevada Public Works, Transit Services Division',
 'County of Sacramento Department of Transportation',
 'Glenn County Transportation Commission',
 'Stanislaus County Public Works - Transit Division',
 'Alpine County Community Development',
 'Fresno Council of Governments',
 'Greyhound Lines, Inc.']

In [None]:
# Stack the first-pass matches (m2) on top of the second-pass result (Test2),
# renumbering the index from 0.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent and produces the same frame.
BC_GTFS_NTD = pd.concat([m2, Test2], ignore_index=True)

In [None]:
#checking that rows match up
BC_GTFS_NTD.shape

In [None]:
#checking that organizations are here
BC_GTFS_NTD['organization_name'].nunique()

In [None]:
#drop  merge
BC_GTFS_NTD = BC_GTFS_NTD.drop(columns=['_merge'])

### Flag agencies that appear in Black Cat Only

In [None]:
def BC_only(row):
    """Flag whether a row's organization appears in Black Cat only.

    Membership is tested against ``L_only_2_orgs``, the hand-curated list of
    unmatched organizations from which Klamath was removed because it has
    GTFS. The raw ``Left_only_orgs`` list is not used here: it is every
    organization left unmatched by the second merge, so it would also flag
    agencies that were deliberately taken off the curated list.

    Args:
        row: A row of the merged frame (anything exposing
            ``organization_name``).

    Returns:
        The string ``'1'`` when the organization is on the curated list,
        otherwise the string ``'0'``.
    """
    return '1' if row.organization_name in L_only_2_orgs else '0'

In [None]:
BC_GTFS_NTD["Is_Agency_In_BC_Only_1_means_Yes"] = BC_GTFS_NTD.apply(lambda x: BC_only(x), axis=1)

In [None]:
#Checking again 
BC_GTFS_NTD[(BC_GTFS_NTD.organization_name.str.contains("greyhound", case= False))].head(1)

### Replacing Klamath
* Klamath has GTFS but no NTD records

In [None]:
BC_GTFS_NTD.loc[(BC_GTFS_NTD['organization_name'] == 'Klamath Trinity Non-Emergency Transportation\u200b'), "itp_id"] = "436"
BC_GTFS_NTD.loc[(BC_GTFS_NTD['organization_name'] == 'Klamath Trinity Non-Emergency Transportation\u200b'), "gtfs_schedule_status"] = "needed"

In [None]:
#Checking again 
BC_GTFS_NTD[(BC_GTFS_NTD.organization_name.str.contains("Klamath", case= False))].head(1)

### Getting ok/needed/research out of original GTFS data frame

In [None]:
#look at original GTFS status for all the different varities...
BC_GTFS_NTD['gtfs_schedule_status'].unique()

In [None]:
#create a new column
temp = BC_GTFS_NTD.gtfs_schedule_status.fillna("None")
BC_GTFS_NTD['GTFS_schedule_status_use'] = np.where(temp.str.contains("None"),"None",
                   np.where(temp.str.contains("ok"), "Ok",
                   np.where(temp.str.contains("long"), "Long-term solution needed",
                   np.where(temp.str.contains("research"), "Research", "Needed"))))


### Rating Fleet Size
* Using the aggregated dataframe built further below (`Aggregate1`), I ran `Aggregate1['total_vehicles'].describe()` and used the percentiles to choose the cutoffs for the Small, Medium, Large, and No Info buckets.

In [None]:
#Function
def fleet_size(row):
    """Rate an agency's fleet by its total vehicle count.

    Buckets:
        * ``"Small"``  -- more than 0 and fewer than 26 vehicles
        * ``"Medium"`` -- 26 up to (but not including) 82 vehicles
        * ``"Large"``  -- 82 up to (but not including) 1200 vehicles
        * ``"No Info"`` -- anything else (0, negative, missing/NaN, or 1200+)

    The Large bucket starts at exactly 82. Previously the Medium test was
    ``< 82`` and the Large test was ``> 82``, so a fleet of exactly 82
    matched neither and was reported as "No Info".

    Args:
        row: A row exposing a numeric ``total_vehicles`` attribute.

    Returns:
        One of ``"Small"``, ``"Medium"``, ``"Large"`` or ``"No Info"``.
    """
    vehicle_count = row.total_vehicles
    # Comparisons against NaN are all False, so missing counts fall through
    # to "No Info".
    if 0 < vehicle_count < 26:
        return "Small"
    if 26 <= vehicle_count < 82:
        return "Medium"
    # NOTE(review): fleets of 1200 or more are still reported as "No Info",
    # as before -- confirm whether that upper cutoff is intentional.
    if 82 <= vehicle_count < 1200:
        return "Large"
    return "No Info"

In [None]:
BC_GTFS_NTD["fleet_size"] = BC_GTFS_NTD.apply(lambda x: fleet_size(x), axis=1)

In [None]:
#Checking again 
BC_GTFS_NTD[(BC_GTFS_NTD.fleet_size.str.contains("Info", case= False))]

### Final Cleaning

In [None]:
#delete old columns
BC_GTFS_NTD2 = BC_GTFS_NTD.drop(columns=['gtfs_schedule_status','name','agency'])

#rename
BC_GTFS_NTD2 = BC_GTFS_NTD2.rename(columns = {'GTFS_schedule_status_use':'GTFS'})

In [None]:
#get agencies without any data to show up 
show_up = ['reporter_type']
for i in show_up:
    BC_GTFS_NTD2[i] = BC_GTFS_NTD2[i].fillna('None')

In [None]:
# Make itp_id a float column, with missing IDs recorded as 0.
# Any ID stored as text (the Klamath patch may have written the string "436")
# is converted to a number by pd.to_numeric. This replaces a one-off fix
# whose row mask was computed on BC_GTFS_NTD -- a different frame from the one
# being updated -- and which never actually cast the column to float.
# pd.to_numeric raises if an ID cannot be parsed, which is preferable to
# writing a mixed-type column to parquet.
BC_GTFS_NTD2['itp_id'] = pd.to_numeric(BC_GTFS_NTD2['itp_id'].fillna(0)).astype(float)

### Parquet

In [None]:
# Write the cleaned frame to a parquet file in the current working directory.
BC_GTFS_NTD2.to_parquet("BC_GTFS_NTD.parquet")
# Optional copies to GCS (just in case) -- left disabled.
#BC_GTFS_NTD2.to_parquet("gs://calitp-analytics-data/data-analyses/5311 /BC_GTFS_NTD.parquet")
#BC_GTFS_NTD2.to_csv("gs://calitp-analytics-data/data-analyses/5311 /BC_GTFS_NTD.csv")

# Taking a quick look


(2) % and # of older vehicles (over 10 and over 15 yrs) and average age, by fleet size bucket

(3) # doors (which means # validators) for vehicles over 10 yrs by GTFS status and by fleet size

In [None]:
BC_GTFS_NTD2.columns

In [None]:
#First grabbing only one row for each agency.
Aggregate1 = BC_GTFS_NTD2.groupby(['organization_name','reporter_type','GTFS', 'fleet_size', 'Is_Agency_In_BC_Only_1_means_Yes']).agg({'total_vehicles':'max',
       'average_age_of_fleet__in_years_':'max','doors_sum':'max','_0_9':'max', '_10_12':'max', '_13_15':'max', '_16_20':'max', '_21_25':'max',
       '_26_30':'max', '_31_60':'max', '_60plus':'max',
})

In [None]:
#reset index
Aggregate1 = Aggregate1.reset_index()

### (1) % and # vehicles over 10 yrs by GTFS status bucket. Another version of over 15 yrs

In [None]:
#duplicate df into vehicles
vehicles_gtfs_ten = Aggregate1.copy()

In [None]:
#Adding up by age group
vehicles_gtfs_ten['vehicles_older_than_9']= vehicles_gtfs_ten['_10_12'] + vehicles_gtfs_ten['_13_15'] + vehicles_gtfs_ten['_16_20'] + vehicles_gtfs_ten['_21_25'] + vehicles_gtfs_ten['_26_30'] + vehicles_gtfs_ten['_31_60'] + vehicles_gtfs_ten['_60plus']
vehicles_gtfs_ten['vehicles_older_than_16']= vehicles_gtfs_ten['_16_20'] + vehicles_gtfs_ten['_21_25'] + vehicles_gtfs_ten['_26_30'] + vehicles_gtfs_ten['_31_60'] + vehicles_gtfs_ten['_60plus']

#dividing by age group
vehicles_gtfs_ten['vehicles_percent_older_than_9'] = (vehicles_gtfs_ten['vehicles_older_than_9']/vehicles_gtfs_ten['total_vehicles'])*100
vehicles_gtfs_ten['vehicles_percent_older_than_16'] = (vehicles_gtfs_ten['vehicles_older_than_16']/vehicles_gtfs_ten['total_vehicles'])*100
vehicles_gtfs_ten['vehicles_percent_under_10'] = (vehicles_gtfs_ten['_0_9']/vehicles_gtfs_ten['total_vehicles'])*100

In [None]:
vehicles_gtfs_ten.columns

In [None]:
#ONLY for vehicles over 9 
GTFS_ten = vehicles_gtfs_ten.groupby(['GTFS']).agg({'vehicles_older_than_9':'sum','vehicles_percent_older_than_9':'median' })
GTFS_ten = GTFS_ten.reset_index()
GTFS_ten

In [None]:
data_prep.basic_bar_chart(GTFS_ten,'GTFS','vehicles_percent_older_than_9') 

### (2) % and # of older vehicles (over 10 and over 15 yrs) and average age, by fleet size bucket

In [None]:
# Per fleet-size bucket: total number of vehicles older than 9 years and the
# median share of such vehicles across agencies.
fleet_ten = (
    vehicles_gtfs_ten.groupby('fleet_size')
    .agg({'vehicles_older_than_9': 'sum', 'vehicles_percent_older_than_9': 'median'})
    .reset_index()
)
fleet_ten

### Looking at organizations & GTFS

In [None]:
Looking_at_GTFS = Aggregate1.groupby(['GTFS']).agg({'organization_name':'nunique'})
Looking_at_GTFS = Looking_at_GTFS.reset_index()
Looking_at_GTFS = Looking_at_GTFS.rename(columns = {'organization_name':'Count of Agencies by GTFS'})
Looking_at_GTFS

In [None]:
data_prep.basic_bar_chart(Looking_at_GTFS,'GTFS','Count of Agencies by GTFS') 