# Testing merges 

In [None]:
import numpy as np
import pandas as pd
import math
from siuba import *
from calitp import *
import intake
import data_prep
import fuzzymatcher
pd.options.display.max_rows = 250

In [None]:
# Load the three input tables through the project-local data_prep helpers.
# NOTE(review): judging by the helper names these are the 5311 grant projects (Black Cat),
# the cleaned vehicle inventory and the cleaned organizations table -- confirm in data_prep.
df_5311 = data_prep.load_grantprojects()
vehicles = data_prep.load_cleaned_vehiclesdata()
organizations = data_prep.load_cleaned_organizations_data()

In [None]:
# Load the GTFS table via the project-local data_prep.GTFS() helper.
GTFS = data_prep.GTFS()

# Merge NTD with GTFS  --> vehicle_gtfs
<b> NOTE</b>:
I had to manually add NTD ID to the following agencies in the original CSV file called "cleaned organizations.csv"

* Butte County Association of Governments	90208
* City of Dixon	9402-91041
* City of Fairfield	90092
* City of Ridgecrest	9R02-91006
* City of Wasco	9R02-99426
* Glenn County	9R02-91088
* Mariposa County	9R02-91082
* Modoc Transportation Authority 	9R02-91008
* Palo Verde Valley Transit Agency	9R02-99454
* San Benito County Local Transportation Authority	9R02-91009
* San Joaquin Regional Transit District	90012
* Tuolumne County Transit Agency	9402-035


In [None]:
# Patch in NTD IDs by hand, using boolean masks with .loc.
# Each entry: (name in `organizations`, agency in `vehicles`, NTD ID to assign to both).
ntd_id_fixes = [
    ('Butte County Association of Governments',
     'Butte County Association of Governments',
     "90208"),
    ('City of Fairfield',
     'City of Fairfield, California, dba: Fairfield and Suisun Transit',
     "90092"),
]

for org_name, vehicle_agency, fixed_ntd_id in ntd_id_fixes:
    organizations.loc[organizations['name'] == org_name, "ntd_id"] = fixed_ntd_id
    vehicles.loc[vehicles['agency'] == vehicle_agency, "ntd_id"] = fixed_ntd_id

In [None]:
# Left-join the organizations table onto the vehicle inventory by NTD ID.
# indicator=True adds a `_merge` column recording where each row found a match.
vehicles_gtfs = vehicles.merge(organizations, how='left', on='ntd_id', indicator=True)

In [None]:
# Distinct agency names present after the merge, in order of first appearance.
vehicle_agencies = vehicles_gtfs['agency'].unique().tolist()

In [None]:
vehicles_gtfs['_merge'].value_counts()

In [None]:
# The indicator column has been inspected; remove it so later merges can add their own.
vehicles_gtfs = vehicles_gtfs.drop(columns='_merge')

# Merging - the "traditional" way without fuzzy matcher (Use this!!)

In [None]:
# Left merge with Black Cat (df_5311) on the left: match its organization_name
# against the `name` column of vehicles_gtfs and record the outcome in `_merge`.
Test1 = df_5311.merge(
    vehicles_gtfs,
    how='left',
    left_on='organization_name',
    right_on='name',
    indicator=True,
)

In [None]:
# Counting left only and both
Test1['_merge'].value_counts()

In [None]:
# Rows flagged left_only are Black Cat records with no counterpart in vehicles_gtfs.
Left_only = Test1[Test1['_merge'] == 'left_only']
# Distinct organization names among those unmatched rows.
Left_orgs = Left_only['organization_name'].unique().tolist()

In [None]:
# Keep only the rows whose organization is not in the left-only list.
matched_mask = ~Test1['organization_name'].isin(Left_orgs)
m2 = Test1.loc[matched_mask]

In [None]:
#check that filter worked ok
m2.shape

### Failed Matches, crosswalk manually

In [None]:
# Data frame of the original Black Cat rows whose organization failed the first merge.
failed_mask = df_5311['organization_name'].isin(Left_orgs)
fail = df_5311.loc[failed_mask]

In [None]:
fail.shape

In [None]:
# Crosswalk dictionary for replacing organization names.
# Key: the organization_name value to replace; value: the name it is replaced with
# before the failed rows are merged again on the `agency` column.
# Keys are matched as exact strings, so the trailing spaces in some keys are significant.
# Entries whose key and value are identical leave that name unchanged.
crosswalk = {'City of Chowchilla ': 'City of Chowchilla, dba: Chowchilla Area Transit ',
 'City of Dinuba ':  'City of Dinuba',
 'Modoc Transportation Agency': 'Modoc Transportation Agency',
 'Butte County Association of Governments/ Butte Regional Transit': 'Butte County Association of Governments',
 'Calaveras County Public Works':  'Calaveras Transit Agency',
 'City of Escalon ':  'City of Escalon, dba: eTrans',
 'County of Mariposa':  'Mariposa County Transit, dba: Mari-Go',
 'County of Shasta Department of Public Works':  'County of Shasta Department of Public Works',
 'County of Siskiyou': 'County of Siskiyou, dba: Siskiyou County Transit',
 'County of Tulare': 'Tulare County Area Transit',
 'Eureka Transit Service':  'City of Eureka, dba: Eureka Transit Service',
 'Kern Regional Transit':  'Kern Regional Transit',
 'Livermore Amador Valley Transit Authority':  'Livermore / Amador Valley Transit Authority',
 'Placer County Public Works (TART & PCT)': 'County of Placer, dba: Placer County Department of Public Works',
 'Plumas County Transportation Commission': 'Plumas County Transportation Commission',
 'San Luis Obispo Regional Transit Authority':  'San Luis Obispo Regional Transit Authority',
 'Sonoma County Transit':  'County of Sonoma, dba: Sonoma County Transit',
 'Sunline Transit Agency':  'SunLine Transit Agency',
 'Tehama County Transit Agency': 'Tehama County',
 'Trinity County Department of Transportation ':  'Trinity County',
 'Tuolumne County Transit Agency (TCTA)':  'Tuolumne County Transit',
 'Amador Transit':  'Amador Regional Transit System',
 'City of Corcoran - Corcoran Area Transit':  'City of Corcoran, dba: Corcoran Area Transit',
 'Yosemite Area Regional Transportation System ':  'Yosemite Area Regional Transportation System',
 'County Connection (Central Contra Costa Transit Authority)': 'Central Contra Costa Transit Authority, dba: COUNTY CONNECTION',
 'Calaveras Transit Agency ': 'Calaveras Transit Agency'}

In [None]:
# Apply the crosswalk to the organization names of the failed merges.
# `fail` is a filtered slice of df_5311, and an inplace replace on a column selected
# from it is chained assignment: it raises SettingWithCopyWarning and leaves the frame
# unchanged under copy-on-write. Take an explicit copy and assign the result back.
fail = fail.copy()
fail['organization_name'] = fail['organization_name'].replace(crosswalk)

In [None]:
# Second attempt for the failed organizations: after the crosswalk, match their
# organization_name against the `agency` column of vehicles_gtfs.
Test2 = fail.merge(
    vehicles_gtfs,
    how='left',
    left_on='organization_name',
    right_on='agency',
    indicator=True,
)

In [None]:
Test2['_merge'].value_counts()

In [None]:
#Checking again 
Test2[(Test2.organization_name.str.contains("calaveras", case= False))]

In [None]:
# Rows that are still left_only after the crosswalk; these should be the agencies
# we aren't sure have any matches.
L_only_2 = Test2[Test2['_merge'] == 'left_only']
# Distinct names of the agencies that failed again.
L_only_2_orgs = L_only_2['organization_name'].unique().tolist()

In [None]:
# Hard-coded replacement for the computed L_only_2_orgs list: Klamath is deliberately left
# out because it has GTFS; we only want agencies that do not appear in both GTFS & NTD.
# NOTE: this assignment overwrites whatever the previous cell computed, so the list has to
# be maintained by hand if the input data changes.
L_only_2_orgs = ['County of Los Angeles - Department of Public Works',
 'County of Nevada Public Works, Transit Services Division',
 'County of Sacramento Department of Transportation',
 'Glenn County Transportation Commission',
 'Stanislaus County Public Works - Transit Division',
 'Alpine County Community Development',
 'Fresno Council of Governments',
 'Greyhound Lines, Inc.']

In [None]:
# Stack the second-pass results (Test2) under the first-pass matches (m2) and renumber
# the index. DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
BC_GTFS_NTD = pd.concat([m2, Test2], ignore_index=True)

In [None]:
#checking that rows match up
BC_GTFS_NTD.shape

In [None]:
#checking that organizations are here
BC_GTFS_NTD['organization_name'].nunique()

### Flag for Black Cat Only

In [None]:
def BC_only(row):
    """Return '1' if the row's organization_name is listed in L_only_2_orgs, else '0'.

    `row` is any object exposing an `organization_name` attribute (for example a
    DataFrame row passed by `apply(..., axis=1)`). The flag is returned as a string.
    """
    return '1' if row.organization_name in L_only_2_orgs else '0'

In [None]:
# Row-wise flag: '1' when the organization only appears in Black Cat, otherwise '0'.
BC_GTFS_NTD["Is_Agency_In_BC_Only_1_means_Yes"] = BC_GTFS_NTD.apply(BC_only, axis=1)

In [None]:
#Checking again 
BC_GTFS_NTD[(BC_GTFS_NTD.organization_name.str.contains("greyhound", case= False))].head(1)

### Replacing Klamath
* Klamath has GTFS but no NTD records

In [None]:
# Klamath's organization_name carries a trailing zero-width space (\u200b) in the data,
# so the comparison string has to include it.
klamath_rows = BC_GTFS_NTD['organization_name'] == 'Klamath Trinity Non-Emergency Transportation\u200b'
BC_GTFS_NTD.loc[klamath_rows, "itp_id"] = "436"
BC_GTFS_NTD.loc[klamath_rows, "gtfs_schedule_status"] = "needed"

In [None]:
#Checking again 
BC_GTFS_NTD[(BC_GTFS_NTD.organization_name.str.contains("Klamath", case= False))].head(1)

In [None]:
# Remove the merge indicator so the next merge can add a fresh one.
BC_GTFS_NTD = BC_GTFS_NTD.drop(columns='_merge')

### Replacing GTFS with either ok or need

In [None]:
# Drop the columns that give fewer "both" matches, then rename the status column
# (the rename is already in the script; delete it here later).
GTFS = (
    GTFS
    .drop(columns=['name', 'operator'])
    .rename(columns={'gtfs_schedule_status': 'simple_GTFS_status'})
)

In [None]:
#in script, delete later
# Discard GTFS rows that have no simple_GTFS_status value.
GTFS = GTFS.dropna(subset=['simple_GTFS_status'])

In [None]:
# Strip literal double-quote characters from the provider names.
GTFS['provider'] = GTFS['provider'].str.replace('"', "", regex=False)

In [None]:
#lots of agencies have duplicates due to service type
GTFS['provider'].value_counts()

In [None]:
# One row per provider: keep the first occurrence and discard the rest.
GTFS = GTFS.drop_duplicates(subset=['provider'], keep='first')

In [None]:
GTFS.shape

In [None]:
# Attach the simplified GTFS status by matching organization_name to provider.
BC_GTFS_NTD2 = BC_GTFS_NTD.merge(
    GTFS,
    how='left',
    left_on='organization_name',
    right_on='provider',
    indicator=True,
)

In [None]:
BC_GTFS_NTD2['_merge'].value_counts()

In [None]:
# Prefer the simplified GTFS status and fall back to gtfs_schedule_status wherever the
# simplified value is missing. fillna with a Series aligns on the index, so this is the
# vectorised form of the row-by-row choice; unlike comparing str(value) to 'nan' it also
# treats None as missing and works on an empty frame.
BC_GTFS_NTD2['gtfs_status_final'] = BC_GTFS_NTD2['simple_GTFS_status'].fillna(
    BC_GTFS_NTD2['gtfs_schedule_status']
)

In [None]:
#Checking again 
BC_GTFS_NTD2[(BC_GTFS_NTD2.organization_name.str.contains("Greyhound", case= False))].head(1)

In [None]:
BC_GTFS_NTD2.shape

In [None]:
# Columns that were only needed to build gtfs_status_final or to run the merges.
obsolete_columns = [
    'simple_GTFS_status',
    '_merge',
    'legacy_ntd_id',
    'years_old:',
    'gtfs_schedule_status',
    'provider',
    'name',
]
BC_GTFS_NTD3 = BC_GTFS_NTD2.drop(columns=obsolete_columns)

In [None]:
# Replace missing values with the string 'None' in the columns later used as group keys,
# so agencies without any data still show up (groupby drops NaN keys by default).
# The original filled these columns in a loop and then repeated the same three fills
# line by line; a single assignment does the same thing once.
show_up = ['vehicle_groups','reporter_type','gtfs_status_final']
BC_GTFS_NTD3[show_up] = BC_GTFS_NTD3[show_up].fillna('None')

In [None]:
#Adding to GCS
#BC_GTFS_NTD3.to_csv("gs://calitp-analytics-data/data-analyses/5311 /BC_GTFS_NTD.csv", index= False)

In [None]:
# One row per organization / reporter type / GTFS status / Black-Cat-only flag / vehicle
# group, taking the maximum of every vehicle age bucket column.
group_keys = [
    'organization_name',
    'reporter_type',
    'gtfs_status_final',
    'Is_Agency_In_BC_Only_1_means_Yes',
    'vehicle_groups',
]
age_buckets = ['_0_9', '_10_12', '_13_15', '_16_20', '_21_25', '_26_30', '_31_60', '_60+']
Aggregate1 = BC_GTFS_NTD3.groupby(group_keys).agg({bucket: 'max' for bucket in age_buckets})

In [None]:
Aggregate1

# Don't Use This: Merging Black Cat with vehicles_gtfs

In [None]:
# Join keys for the fuzzy match: `agency` on the vehicles_gtfs (right) side and
# `organization_name` on the Black Cat (left) side.
right_on = ["agency"]
left_on = ["organization_name"]

In [None]:
# Fuzzy left join of Black Cat (df_5311) against vehicles_gtfs using the key lists
# left_on / right_on. Later cells read a `best_match_score` column from the result.
black_cat1 = fuzzymatcher.fuzzy_left_join(df_5311, vehicles_gtfs, left_on, right_on)

In [None]:
# Distinct (score, Black Cat name, matched agency) combinations for eyeballing the matches.
score_columns = ['best_match_score', 'organization_name', 'agency']
unique_agencies = black_cat1[score_columns].drop_duplicates()

In [None]:
unique_agencies.sort_values('best_match_score')

In [None]:
# Organizations to exclude by name before the score cutoff is applied
# (they are removed from black_cat1 regardless of their best_match_score).
bad_matches_before_cutoff = ['County of Los Angeles - Department of Public Works', 'County of Sacramento Department of Transportation']

In [None]:
# Remove the organizations listed in bad_matches_before_cutoff.
excluded_mask = black_cat1['organization_name'].isin(bad_matches_before_cutoff)
black_cat2 = black_cat1.loc[~excluded_mask]

In [None]:
# Keep only matches scoring strictly above 0.19.
black_cat2 = black_cat2[black_cat2['best_match_score'] > 0.19]

In [None]:
#total agencies that matched over with decent scores
black_cat2['organization_name'].nunique()

## Walking the agencies that were cut back in, via a separate dataframe

In [None]:
# Agencies with bad scores: every row that does NOT clear the `> 0.19` cutoff.
# Using the negated cutoff instead of `< 0.19` closes the gap between the two filters,
# so rows scoring exactly 0.19 and rows with a missing score are kept here rather than
# dropping out of both the good and the bad set.
bad_fuzzy_matches = black_cat1.loc[~(black_cat1['best_match_score'] > 0.19)]

In [None]:
# Only the identifying columns are needed for the manual crosswalk.
crosswalk_columns = ['organization_name', 'agency', 'ntd_id']
bad_fuzzy_matches = bad_fuzzy_matches.loc[:, crosswalk_columns]

In [None]:
# Collapse repeated (organization_name, agency, ntd_id) rows to one each.
bad_fuzzy_matches = bad_fuzzy_matches.drop_duplicates()

In [None]:
#making a manual crosswalk
#bad_fuzzy_matches.to_csv("./bad_fuzzy_matches.csv", index = False) 

In [None]:
# Read crosswalk 3, the manually corrected version of the bad fuzzy matches.
# The path is relative, so the workbook must sit in the notebook's working directory.
# The following cells merge on its organization_name, agency and ntd_id columns.
crosswalk3 = pd.read_excel('corrected_bad_fuzzy_match.xlsx')

In [None]:
# Crosswalk on the left so the result contains only the poorly matched agencies,
# each with its Black Cat rows attached.
unmatched = pd.merge(crosswalk3, df_5311, how='left', on='organization_name')

In [None]:
# Bring in the vehicles and GTFS columns, matching on both agency and NTD ID.
unmatched2 = pd.merge(unmatched, vehicles_gtfs, how='left', on=['agency', 'ntd_id'])

## Row-bind (append) the badly matched rows back

In [None]:
# Stack the manually crosswalked rows under the well-matched fuzzy rows and renumber
# the index. DataFrame.append was removed in pandas 2.0; pd.concat replaces it, and
# unmatched2 is already a DataFrame so it needs no pd.DataFrame(...) wrapper.
GTFS_Vehicles_BC = pd.concat([black_cat2, unmatched2], ignore_index=True)

In [None]:
GTFS_Vehicles_BC.shape

In [None]:

GTFS_Vehicles_BC[(GTFS_Vehicles_BC.organization_name.str.contains("livermore", case= False))]

# Final Clean Up

## One last check to make sure all the names are there
<b> Still having issues with </b>
* Butte
* Fairfield
* Tuolumne 

<b> We aren't sure whether these agencies correspond to any GTFS & NTD records</b> 

* Alpine County
* Fresno Council of Gov
* Greyhound
* County of LA
* City of Tehachapi
* Calaveras County Public Works
* County of Nevada Public Works
* County of Shasta Department of Public Works
* Glenn County Transportation Commission
* Tehama County Transit Agency

<b> Has GTFS but no NTD </b>
* Klamath Trinity Non-Emergency Transportation (the name in the data ends with an invisible zero-width space, U+200B)

<b> Has NTD no GTFS </b>
* Calaveras Transit Agency

In [None]:
# Distinct combinations of the identifying columns, used to check that all names are there.
name_columns = ['organization_name', 'agency', 'ntd_id', 'itp_id', 'gtfs_schedule_status']
Names = GTFS_Vehicles_BC[name_columns].drop_duplicates()
# To browse alphabetically: Names.sort_values('organization_name')

In [None]:
Names.isna().sum()

In [None]:
Names['organization_name'].nunique()

## Fill in Missing Values - Loops
* https://stackoverflow.com/questions/51097981/fill-column-of-a-dataframe-from-another-dataframe
* https://www.kite.com/python/answers/how-to-create-a-pandas-dataframe-from-columns-in-other-dataframes-in-python
* https://www.kite.com/python/answers/how-to-fill-a-pandas-dataframe-row-by-row-in-python
* https://www.geeksforgeeks.org/replace-values-of-a-dataframe-with-the-value-of-another-dataframe-in-pandas/

In [None]:
# Only the columns that belong to the frame where GTFS and vehicles were merged.
# NOTE(review): v_gtfs_subset is not referenced again in the visible notebook; the next
# cell subsets vehicles_gtfs directly -- confirm whether this selection is still needed.
v_gtfs_subset = vehicles_gtfs[['agency','ntd_id',
       'organization_type', 'reporter_type', 'primary_uza_population',
       'agency_voms', 'vehicle_type', 'years_old:', '_13_15', '_16_20',
       '_21_25', '_26_30', '_31_60', '_60+', 'total_vehicles',
       'average_age_of_fleet__in_years_', 'average_lifetime_miles_per_vehicle',
       '_0_9', '_10_12', 'vehicle_groups', 'name', 'itp_id',
       'gtfs_schedule_status']]

In [None]:
# vehicles_gtfs rows for the three agencies that are not matching.
subset_agencies = [
    'Butte County Association of Governments',
    'City of Fairfield, California, dba: Fairfield and Suisun Transit',
    'Tuolumne County Transit',
]
subset1 = vehicles_gtfs.loc[vehicles_gtfs['agency'].isin(subset_agencies)]

In [None]:
# Black Cat organization_name values for the organizations that aren't matching
# (the counterparts of the agency names in subset_agencies).
subset_orgs = ['Butte County Association of Governments/ Butte Regional Transit','City of Fairfield','Tuolumne County Transit Agency (TCTA)']

In [None]:
# Rows for the non-matching organizations, restricted to the fuzzy-match bookkeeping
# columns, the Black Cat columns and `agency`.
blackcat_columns = [
    'best_match_score', '__id_left', '__id_right', 'grant_fiscal_year',
    'funding_program', 'grant_number', 'project_year', 'organization_name',
    'upin', 'description', 'ali', 'contract_number', 'allocationamount',
    'encumbered_amount', 'expendedamount', 'activebalance',
    'closedoutbalance', 'project_status', 'agency',
]
subset2 = GTFS_Vehicles_BC.loc[
    GTFS_Vehicles_BC['organization_name'].isin(subset_orgs), blackcat_columns
]

In [None]:
# Tuolumne has no agency name; every row still missing `agency` is labelled with it.
filled_agency = subset2['agency'].fillna('Tuolumne County Transit')
subset2 = subset2.assign(agency=filled_agency)

In [None]:
# Pair the vehicle/GTFS rows with the Black Cat rows of the same agency (inner join).
subset3 = subset1.merge(subset2, how='inner', on='agency')

In [None]:
# Remove the original Fairfield, Tuolumne and Butte rows.
stale_rows = GTFS_Vehicles_BC['organization_name'].isin(subset_orgs)
GTFS_Vehicles_BC = GTFS_Vehicles_BC.loc[~stale_rows]

In [None]:
# Add the rebuilt rows (subset3) to the main frame and renumber the index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
GTFS_Vehicles_BC = pd.concat([GTFS_Vehicles_BC, subset3], ignore_index=True)

#### Klamath has ITP data but no NTD data. Have to add it manually

In [None]:
# Klamath's organization_name carries a trailing zero-width space (\u200b) in the data,
# so the comparison string has to include it.
klamath_rows = GTFS_Vehicles_BC['organization_name'] == 'Klamath Trinity Non-Emergency Transportation\u200b'
GTFS_Vehicles_BC.loc[klamath_rows, "itp_id"] = "436"
GTFS_Vehicles_BC.loc[klamath_rows, "gtfs_schedule_status"] = "needed"

In [None]:
GTFS_Vehicles_BC.shape

## Generate a column that flags the rows that only appear in Black Cat

In [None]:
# Rows of Names in which at least one column is missing.
has_missing = Names.isna().any(axis='columns')
missing_list = Names.loc[has_missing]

In [None]:
# Organization names of the rows with missing values, as a flat list of strings.
# Selecting with [['organization_name']] and calling .values.tolist() produced a list of
# one-element lists, which an `organization_name in BC_only` membership test can never
# match; taking the column as a Series gives the names themselves.
BC_only = missing_list['organization_name'].tolist()

In [None]:
# Hand-maintained list of the organizations that appear in Black Cat only.
# It replaces the BC_only value computed in the previous cell.
# NOTE: the name BC_only is also used for a function defined earlier in this notebook;
# after this cell runs, BC_only refers to this list, so the earlier cell that applies
# BC_only row-wise cannot be re-run without redefining the function first.
BC_only = ['County of Los Angeles - Department of Public Works','Greyhound Lines, Inc.','Alpine County Community Development',
'Fresno Council of Governments','Tehama County Transit Agency','Glenn County Transportation Commission',
'Stanislaus County Public Works - Transit Division','County Connection (Central Contra Costa Transit Authority)','County of Nevada Public Works, Transit Services Division',
'City of Tehachapi','Calaveras County Public Works']

In [None]:
def flag_for_BC(row):
    """Return '1' if the row's organization_name is listed in BC_only, else '0'.

    `row` is any object exposing an `organization_name` attribute (for example a
    DataFrame row passed by `apply(..., axis=1)`). The flag is returned as a string.
    """
    return '1' if row.organization_name in BC_only else '0'

In [None]:
# Row-wise flag: '1' when the organization only appears in Black Cat, otherwise '0'.
GTFS_Vehicles_BC["Is_Agency_In_BC_Only_1_means_Yes"] = GTFS_Vehicles_BC.apply(flag_for_BC, axis=1)

## Replacing GTFS with more simple data - Help how come this gives us such a crazy number of new rows?
* This data frame didn't have NTD IDS, but they do have a more definitive GTFS status like "ok" or "needed."
* Using provider, as that gives us the most matches for "both"
* How come this gives us such a crazy number of new rows?


In [None]:
# Attach the simplified GTFS status by `name` without multiplying rows.
# A left merge repeats a left row once per right row with the same key, and pandas also
# matches missing keys against each other, so every left row without a name pairs with
# every GTFS row without one. Build a lookup with exactly one row per non-missing name;
# validate='m:1' makes pandas raise if the right-hand keys are still not unique.
gtfs_by_name = GTFS.dropna(subset=['name']).drop_duplicates(subset=['name'], keep='first')
GTFS_Vehicles_BC2 = pd.merge(
    GTFS_Vehicles_BC,
    gtfs_by_name,
    how='left',
    on=['name'],
    indicator=True,
    validate='m:1',
)

In [None]:
GTFS_Vehicles_BC2['_merge'].value_counts()

In [None]:
# Prefer the simplified GTFS status and fall back to gtfs_schedule_status wherever the
# simplified value is missing. fillna with a Series aligns on the index, so this is the
# vectorised form of the row-by-row choice; unlike comparing str(value) to 'nan' it also
# treats None as missing and works on an empty frame.
GTFS_Vehicles_BC2['gtfs_status_final'] = GTFS_Vehicles_BC2['simple_GTFS_status'].fillna(
    GTFS_Vehicles_BC2['gtfs_schedule_status']
)

In [None]:
GTFS_Vehicles_BC2.sample(3)

## Final preview
* Deleted a few things: agency, state, legacy_id, years old


In [None]:
# Fuzzy-match bookkeeping, superseded status columns and redundant identifiers.
final_drop = [
    'best_match_score',
    '__id_left',
    '__id_right',
    'state',
    'legacy_ntd_id',
    'years_old:',
    'simple_GTFS_status',
    'gtfs_schedule_status',
    'agency',
    'name',
]
GTFS_Vehicles_BC2 = GTFS_Vehicles_BC2.drop(columns=final_drop)

In [None]:
GTFS_Vehicles_BC2.head(1)

In [None]:
GTFS_Vehicles_BC2.duplicated().sum()

In [None]:
# Remove rows that are exact duplicates across all remaining columns.
GTFS_Vehicles_BC2 =GTFS_Vehicles_BC2.drop_duplicates()

In [None]:
GTFS_Vehicles_BC2.shape

In [None]:
GTFS_Vehicles_BC2['organization_name'].nunique()

In [None]:
GTFS_Vehicles_BC2.isna().sum()

In [None]:
GTFS_Vehicles_BC2.organization_name.value_counts()

In [None]:
#writing to GCS bucket
#GTFS_Vehicles_BC2.to_csv("gs://calitp-analytics-data/data-analyses/5311 /GTFS_Vehicles_BC.csv", index= False)