# Testing merges before turning them into functions

In [None]:
import numpy as np
import pandas as pd
from siuba import *
from calitp import *
import intake
import data_prep
import fuzzymatcher
# Let notebook output show up to 250 rows before pandas truncates the display.
pd.set_option("display.max_rows", 250)

In [None]:
# Load the three input tables through the project's data_prep helpers.
# df_5311 is the table that is later fuzzy-matched on `organization_name`
# (the "Black Cat" side of the merge).
df_5311 = data_prep.load_grantprojects()
# vehicles carries `agency` and `ntd_id`; presumably the NTD vehicle data -- TODO confirm.
vehicles = data_prep.load_vehiclesdata2()
# organizations carries `name` and `ntd_id`; presumably the cleaned Cal-ITP
# organizations CSV mentioned in the note further down -- TODO confirm.
organizations = data_prep.load_cleaned_organizations_data()

# Merge NTD with GTFS --> vehicles_gtfs
<b> NOTE</b>:
I had to MANUALLY add NTD ID to the following agencies in the original CSV file called "cleaned organizations.csv"

* Butte County Association of Governments	90208
* City of Dixon	9402-91041
* City of Fairfield	90092
* City of Ridgecrest	9R02-91006
* City of Wasco	9R02-99426
* Glenn County	9R02-91088
* Mariposa County	9R02-91082
* Modoc Transportation Authority 	9R02-91008
* Palo Verde Valley Transit Agency	9R02-99454
* San Benito County Local Transportation Authority	9R02-91009
* San Joaquin Regional Transit District	90012
* Tuolumne County Transit Agency	9402-035


In [None]:
#drop records without NTD ID.
#organizations = organizations.dropna(subset=['ntd_id'])

In [None]:
#put in script later
#vehicles.ntd_id = vehicles.ntd_id.astype(str)
#put in script later
#organizations.ntd_id = organizations.ntd_id.astype(str)

In [None]:
# Force the same NTD ID onto both tables for agencies whose IDs had to be
# supplied by hand, so that the NTD-ID merge further down can pair them up.

# Butte County Association of Governments -> 90208
organizations.loc[organizations["name"] == "Butte County Association of Governments", "ntd_id"] = "90208"
vehicles.loc[vehicles["agency"] == "Butte County Association of Governments", "ntd_id"] = "90208"

# City of Fairfield -> 90092.
# The organizations row used to be given Butte's ID (90208), which left the two
# Fairfield rows with different IDs and made a match in the merge impossible.
organizations.loc[organizations["name"] == "City of Fairfield", "ntd_id"] = "90092"
vehicles.loc[
    vehicles["agency"] == "City of Fairfield, California, dba: Fairfield and Suisun Transit",
    "ntd_id",
] = "90092"

In [None]:
#checking to make sure the NTD IDS are the same
organizations[organizations.name.str.contains("Butte County Association")]["ntd_id"].iloc[0]

In [None]:
vehicles[vehicles.agency.str.contains("Butte")]["ntd_id"].iloc[0]

In [None]:
#merging the 2 datasets together
vehicles_gtfs = pd.merge(vehicles, organizations,  how='left', on=['ntd_id'], indicator=True)

In [None]:
vehicles_gtfs['_merge'].value_counts()

In [None]:
vehicles_gtfs = vehicles_gtfs.drop(columns = ['_merge'])

In [None]:
vehicles_gtfs.head(2)

# Merging Black Cat with vehicles_gtfs

In [None]:
right_on = ["agency"]
left_on = ["organization_name"]

In [None]:
black_cat1 = fuzzymatcher.fuzzy_left_join(df_5311, vehicles_gtfs, left_on, right_on)

In [None]:
unique_agencies = black_cat1[['best_match_score','organization_name','agency']]
unique_agencies = unique_agencies.drop_duplicates()

In [None]:
unique_agencies.sort_values('best_match_score')

In [None]:
#stuff to filter out before cutting off at scores...
subset3 = ['County of Los Angeles - Department of Public Works', 'County of Sacramento Department of Transportation']

In [None]:
#filter out a bunch the agencies above
black_cat2 = black_cat1[~black_cat1.organization_name.isin(subset3)]

In [None]:
#filter out for any scores below 0.19
black_cat2 = black_cat2.loc[black_cat2['best_match_score'] > 0.19 ]

In [None]:
#total agencies that matched over with decent scores
black_cat2['organization_name'].nunique()

# Walking back in the other 21 agencies that were cut into a separate dataframe

In [None]:
#getting a list of agencies with bad scores
bad_fuzzy_matches = black_cat1.loc[black_cat1['best_match_score'] < 0.19 ]

In [None]:
bad_fuzzy_matches = bad_fuzzy_matches[['organization_name','agency', 'ntd_id']]

In [None]:
bad_fuzzy_matches = bad_fuzzy_matches.drop_duplicates()

In [None]:
#making a manual crosswalk
#bad_fuzzy_matches.to_csv("./bad_fuzzy_matches.csv", index = False) 

In [None]:
#reading crosswalk 3, what I manually inputted
crosswalk3 = pd.read_excel('corrected_bad_fuzzy_match.xlsx')

In [None]:
#crosswalk on the left, I only want a dataframe with the poorly matched agencies
unmatched = crosswalk3.merge(df_5311, on=['organization_name'],  how='left')

In [None]:
#now grabbing vehicles and GTFS info on the left
unmatched2 = unmatched.merge(vehicles_gtfs, on=['agency','ntd_id'],  how='left')

In [None]:
unmatched2['organization_name'].nunique()

# R Bind

In [None]:
# Stack the acceptable fuzzy matches on top of the manually crosswalked
# agencies and renumber the index from 0.
# pd.concat is used instead of DataFrame.append, which is deprecated and was
# removed in pandas 2.0; the resulting rows and columns are the same.
GTFS_Vehicles_BC = pd.concat([black_cat2, unmatched2], ignore_index=True)

In [None]:
GTFS_Vehicles_BC = GTFS_Vehicles_BC.drop(columns=['best_match_score','__id_left','__id_right','vehicle_type'])

In [None]:
GTFS_Vehicles_BC.shape

## One last check to make sure all the names are there
<b> Still having issues - even though their NTD ID has been manually changed in the Cal ITP data set, still not carrying over any GTFS info</b>
* Butte
* City of Fairfield

<b> Unsure whether these agencies correspond to any GTFS records at all </b> 
* Calaveras Transit Agency
* Tuolumne
* Alpine County
* Fresno Council of Gov
* Klamath Trinity Non-Emergency Transportation
* Greyhound
* County of LA
* City of Tehachapi
* Calaveras County Work
* County of Nevada Public Works
* County Connection (Central Contra Costa Transit)
* County of Shasta Department of Public Works
* Glenn County Transportation Commission
* Tehama County Transit Agency

In [None]:
Names = GTFS_Vehicles_BC[['organization_name','agency','ntd_id','itp_id','gtfs_schedule_status']]
Names = Names.drop_duplicates()
Names.sort_values('itp_id')

In [None]:
Names.isna().sum()

In [None]:
Names.shape

## Generate a column that flags the rows that don't have GTFS/NTD ID/Cal ITP info because we weren't able to match these agencies ('true'); everything else is 'false'.

In [None]:
missing_list = Names[Names.isna().any(axis=1)]

In [None]:
missing_agencies = missing_list[['organization_name']]

In [None]:
missing_agencies=missing_agencies.values.tolist()

In [None]:
missing_agencies

In [None]:
# Add No_GTFS_NTD_Matches: 1 for rows whose organization is listed in
# missing_agencies (some GTFS / NTD ID / Cal-ITP field is missing), else 0.
#
# Two problems made the previous version produce 0 for every row:
#   * the lambda returned 0 on both branches of the conditional;
#   * missing_agencies is a list of one-element lists ([[name], ...]) because
#     it comes from `.values.tolist()` on a one-column DataFrame, so testing
#     a plain name for membership in it could never succeed.
# The list is flattened into a set here, and the flag is computed with a
# vectorized isin() instead of a row-wise apply.
orgs_missing_any = {name for row in missing_agencies for name in row}
GTFS_Vehicles_BC = GTFS_Vehicles_BC.assign(
    No_GTFS_NTD_Matches=GTFS_Vehicles_BC['organization_name'].isin(orgs_missing_any).astype(int)
)

In [None]:
#df_combined2 = df_combined2.assign( Allocated_Before_July_31_2020_1_is_yes = df_combined2.apply(lambda x: 0 if x.Allocation_Date > pd.Timestamp(2020, 7, 31, 0) else 1, axis=1))

## Generate a column that flags the rows that don't have GTFS info because we weren't able to match these agencies ('true'); everything else is 'false'.

In [None]:
# Organization names from Names that have no GTFS schedule status, as a list
# of one-element rows: [[name], [name], ...]
no_gtfs_mask = Names['gtfs_schedule_status'].isna()
missing_gtfs = Names.loc[no_gtfs_mask, ['organization_name']].to_numpy().tolist()

In [None]:
# Add No_GTFS_Matches: 1 for rows whose organization is listed in missing_gtfs
# (no GTFS schedule status), else 0.
#
# The previous version tested against missing_agencies (the wrong list) and
# returned 0 on both branches of the conditional, so the flag was always 0.
# missing_gtfs is a list of one-element lists ([[name], ...]) because it comes
# from `.values.tolist()` on a one-column DataFrame, so it is flattened into a
# set before the membership test.
orgs_missing_gtfs = {name for row in missing_gtfs for name in row}
GTFS_Vehicles_BC = GTFS_Vehicles_BC.assign(
    No_GTFS_Matches=GTFS_Vehicles_BC['organization_name'].isin(orgs_missing_gtfs).astype(int)
)

## Final preview
* Deleted a few things: agency, state, legacy_id, years old
* LOTS of rows that look the same but the column "upin" makes them different. Do we need every year/contract that the agency has produced?

In [None]:
GTFS_Vehicles_BC.isna().sum()

In [None]:
GTFS_Vehicles_BC.columns

In [None]:
#dropping columns. I don't want Project closed by...project closed time
GTFS_Vehicles_BC = GTFS_Vehicles_BC.drop(columns=['state', 'legacy_ntd_id', 'years_old:','agency'])

In [None]:
GTFS_Vehicles_BC.shape

In [None]:
# Write the final table to the GCS bucket as CSV, without the row index.
# NOTE(review): the folder name contains a trailing space ("5311 /") -- confirm
# this matches the actual bucket layout.
output_path = "gs://calitp-analytics-data/data-analyses/5311 /GTFS_Vehicles_BC.csv"
GTFS_Vehicles_BC.to_csv(output_path, index=False)