# Update Program Code Descriptions

In [1]:
import numpy as np
import pandas as pd
from siuba import *

from shared_utils import geography_utils
from dla_utils import _dla_utils

from calitp import to_snakecase


import utils



In [2]:
# Display settings: show up to 100 columns and never truncate cell contents.
pd.options.display.max_columns = 100
pd.options.display.max_colwidth = None

## Read in Data and function development

In [3]:
# Read all of the data, then let utils.identify_agency add the known agency
# name to the frame based on the recipient-defined text field.
df = utils.identify_agency(
    utils.read_data_all(),
    'summary_recipient_defined_text_field_1_value',
)

In [4]:
df.sample(3)

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,improvement_type,improvement_type_description,obligations_amount,summary_recipient_defined_text_field_1_value,implementing_agency_locode,implementing_agency,district,county_name,rtpa_name,mpo_name
1226,2022-09-01,YS30,Highway Safety Improvement Program (HSIP),2152252,0819000045S,IN RIVERSIDE COUNTY NEAR MORENO VALLEY ON INTERSTATE 215 FROM 0.2 MILE NORTH OF ALESSANDRO BOULEVARD OVERCROSSING TO 0.2 MILE SOUTH OF EUCALYPTUS AVE,65.0,Cong Dist 41,17.0,Construction Engineering,1079600.0,S SCAG,,California,8.0,Riverside County,,
794,2022-09-20,Y001,National Highway Performance Program (NHPP),P070145,0315000082S,"IN AND NEAR MARYSVILLE, FROM SOUTH OF 14TH STREET TO NORTH OF CEMETERY ROAD. ROADWAY REHABILITATION AND OPERATIONAL IMPROVEMENTS INCLUDING TURN POCKE",115.0,Cong Dist 3,16.0,Right of Way,2000000.0,S SACOG,,California,3.0,Yuba County,,
847,2022-07-06,Y001,National Highway Performance Program (NHPP),P118068,0716000083S,"IN THE CITY OF LOS ANGELES, FROM TOPANGA CANYON BOULEVARD TO SEPULVEDA BOULEVARD. CONSTRUCT AND INSTALL STORMWATER TREATMENT BEST MANAGEMENT PRACTIC",37.0,Cong Dist 30,42.0,Training,488.23,S SCAG,,California,7.0,Los Angeles County,,


In [5]:
# Check that summary_recipient_defined_text_field_1_value (and implementing_agency)
# is unique per project: count distinct values per project_number, largest first.
(
    df
    >> group_by(_.project_number)
    >> summarize(n=_.summary_recipient_defined_text_field_1_value.nunique())
    >> arrange(-_.n)
)

Unnamed: 0,project_number,n
0,000C480,1
1,000C524,1
2,000C529,1
3,000C530,1
4,000C531,1
...,...,...
547,X075048,1
548,X077028,1
549,X081034,1
550,X093010,1


### Condense Projects into One

In [6]:
# First pass at condensing: cast every column to str and pipe-join all of a
# project's values per column (repeated values are not de-duplicated here).
df_agg = (
    df.astype(str)
    .groupby('project_number')
    .agg(' | '.join)
    .reset_index()
)

In [7]:
df_agg.sample()

Unnamed: 0,project_number,fmis_transaction_date,program_code,program_code_description,recipient_project_number,project_title,county_code,congressional_district,improvement_type,improvement_type_description,obligations_amount,summary_recipient_defined_text_field_1_value,implementing_agency_locode,implementing_agency,district,county_name,rtpa_name,mpo_name
348,5942273,2022-08-01 | 2022-08-01,Y230 | Y230,Surface Transportation Block Grant (STBG) Program | Surface Transportation Block Grant (STBG) Program,0616000166L | 0616000166L,MILLERTON ROAD FROM FRIANT ROAD TO MARINA DRIVE WIDEN ROADWAY FROM 2LU TO 4LD | MILLERTON ROAD FROM FRIANT ROAD TO MARINA DRIVE WIDEN ROADWAY FROM 2LU TO 4LD,19.0 | 19.0,Cong Dist 22 | Cong Dist 4,15.0 | 15.0,Preliminary Engineering | Preliminary Engineering,17624.04 | 129242.96,L5942COFCG | L5942COFCG,5942.0 | 5942.0,Fresno County | Fresno County,6.0 | 6.0,Fresno County | Fresno County,Council of Fresno County Governments | Council of Fresno County Governments,Council Of Fresno County Goverments | Council Of Fresno County Goverments


In [8]:
(list(df.columns))

['fmis_transaction_date',
 'program_code',
 'program_code_description',
 'project_number',
 'recipient_project_number',
 'project_title',
 'county_code',
 'congressional_district',
 'improvement_type',
 'improvement_type_description',
 'obligations_amount',
 'summary_recipient_defined_text_field_1_value',
 'implementing_agency_locode',
 'implementing_agency',
 'district',
 'county_name',
 'rtpa_name',
 'mpo_name']

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1241 entries, 0 to 1211
Data columns (total 18 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   fmis_transaction_date                         1241 non-null   datetime64[ns]
 1   program_code                                  1241 non-null   object        
 2   program_code_description                      1240 non-null   object        
 3   project_number                                1241 non-null   object        
 4   recipient_project_number                      1240 non-null   object        
 5   project_title                                 1241 non-null   object        
 6   county_code                                   1241 non-null   float64       
 7   congressional_district                        1241 non-null   object        
 8   improvement_type                              1241 non-null   float6

In [10]:
df2 = df.copy()

In [11]:
df_agg = utils.condense_df(df2)

In [12]:
df_agg.sample(5)

Unnamed: 0,fmis_transaction_date,project_number,implementing_agency,summary_recipient_defined_text_field_1_value,program_code,program_code_description,recipient_project_number,improvement_type,improvement_type_description,project_title,obligations_amount,congressional_district,district,county_code,county_name,implementing_agency_locode,rtpa_name,mpo_name
101,2022-04-29,P051031,California,S SACOG,Y001,National Highway Performance Program (NHPP),0316000113S,15.0,Preliminary Engineering,SR 51 FROM J STREET TO ARDEN WAY IN SACRAMENTO SR 51 CORRIDOR IMPROVEMENTS,3007542.84,Cong Dist 51,11.0,73.0,San Diego County,,,
121,2022-05-05,5444019,Orinda,L5444MTC,Y001,National Highway Performance Program (NHPP),0419000097L,14.0 | 17.0,Bridge Rehabilitation - No Added Capacity | Construction Engineering,MINER ROAD BRIDGE OVER SAN PABLO CREEK (BR #28C0330) SEISMIC RETROFIT,281136.18,Cong Dist 11,4.0,13.0,Contra Costa County,5444.0,Metropolitan Transportation Commission,Metropolitan Transportation Commission
412,2022-09-01,X037225,California,S SCAG,YS30,Highway Safety Improvement Program (HSIP),0716000020S,17.0 | 21.0,Construction Engineering | Safety,"IN THE CITIES OF COMPTON, PARAMOUNT AND BELLFLOWER; ALSO ON ROUTE 105 (PM R0.50/R18.14), ROUTE 110 (PM 1.23/R11.89) AND ROUTE 405 (PM 0.27/28.51). C",2941400.0,Cong Dist 38 | Cong Dist 43 | Cong Dist 44,7.0,37.0,Los Angeles County,,,
300,2022-07-14,5954135,San Bernardino County,L5954SCAG,Y001,National Highway Performance Program (NHPP),0814000092L,16.0,Right of Way,"ROCK SPRINGS ROAD OVER THE MOJAVE RIVER, BR. NO. 54C-0670 BRIDGE REPLACEMENT",442650.0,Cong Dist 8,8.0,71.0,San Bernardino County,5954.0,San Bernardino Associated Governments,Southern California Association Of Governments
463,2022-09-09,Q101349,California,S SBCAG,Y001,National Highway Performance Program (NHPP),0518000131S,15.0,Preliminary Engineering,"SANTA BARBARA COUNTY, IN AND NEAR THE CITIES OF MONTECITO AND SANTA BARBARA, FROM SHELFFIELD DRIVE TO SYCAMORE CREEK. CONSTRUCT HOV LANES",642000.0,Cong Dist 24,5.0,83.0,Santa Barbara County,,,


In [13]:
#should return 552
len(df_agg)

552

In [14]:
# def condense_df(df):
    
#     # make sure columns are in string format
#     df[['county_code', 'improvement_type',
#      'implementing_agency_locode', 'district',
#      'program_code_description', 'recipient_project_number']] = df[['county_code', 'improvement_type',
#                                                                      'implementing_agency_locode', 'district',
#                                                                      'program_code_description', 'recipient_project_number']].astype(str)
#     # aggregate df using .agg function and join the unique values into one row
#     df_agg = (df
#            .assign(count=1)
#            .groupby(['fmis_transaction_date','project_number', 'implementing_agency', 'summary_recipient_defined_text_field_1_value'])
#            .agg({'program_code':lambda x:' | '.join(x.unique()), # get unique values to concatenate
#                  'program_code_description':lambda x:' | '.join(x.unique()), # get unique values to concatenate
#                  'recipient_project_number':lambda x:' | '.join(x.unique()), #'first',
#                  'improvement_type':lambda x:' | '.join(x.unique()), # get unique values to concatenate
#                  'improvement_type_description':lambda x:' | '.join(x.unique()),  # get unique values to concatenate
#                  'project_title':'first', #should be the same                 
#                  'obligations_amount':'sum', #sum of the obligations amount
#                  'congressional_district':lambda x:' | '.join(x.unique()), # get unique values to concatenate
#                  'district':lambda x:' | '.join(x.unique()), # get unique values to concatenate
#                  'county_code':lambda x:' | '.join(x.unique()), # get unique values to concatenate
#                  'county_name':lambda x:' | '.join(x.unique()), # get unique values to concatenate
#                  'implementing_agency_locode':lambda x:' | '.join(x.unique()), # get unique values to concatenate
#                  'rtpa_name':'first', #should be the same
#                  'mpo_name':'first',  #should be the same
#                 }).reset_index())
    
#     return df_agg

In [15]:
# df2[['county_code'
#      , 'improvement_type',
#      'implementing_agency_locode',
#      'district',
#      'program_code_description',
#      'recipient_project_number']] = df2[['county_code',
#                                          'improvement_type',
#                                          'implementing_agency_locode',
#                                          'district', 'program_code_description',
#                                          'recipient_project_number']].astype(str)

In [16]:
# df_agg = (df2
#            .assign(count=1)
#            .groupby(['fmis_transaction_date','project_number', 'implementing_agency', 'summary_recipient_defined_text_field_1_value'])
#            .agg({'program_code':lambda x:' | '.join(x.unique()), # get unique values to concatenate
#                  'program_code_description':lambda x:' | '.join(x.unique()), # get unique values to concatenate
#                  'recipient_project_number':lambda x:' | '.join(x.unique()), #'first',
#                  'improvement_type':lambda x:' | '.join(x.unique()), # get unique values to concatenate
#                  'improvement_type_description':lambda x:' | '.join(x.unique()),  # get unique values to concatenate
#                  'project_title':'first', #should be the same                 
#                  'obligations_amount':'sum', #sum of the obligations amount
#                  'congressional_district':lambda x:' | '.join(x.unique()), # get unique values to concatenate
#                  'district':lambda x:' | '.join(x.unique()), # get unique values to concatenate
#                  'county_code':lambda x:' | '.join(x.unique()), # get unique values to concatenate
#                  'county_name':lambda x:' | '.join(x.unique()), # get unique values to concatenate
#                  'implementing_agency_locode':lambda x:' | '.join(x.unique()), # get unique values to concatenate
#                  'rtpa_name':'first', #should be the same
#                  'mpo_name':'first',  #should be the same
#                 }).reset_index())

In [17]:
# Count condensed rows that carry more than one improvement type.
# regex=False makes ' | ' a literal separator; as a regex it means
# "space OR space" and would match any value that contains a space.
len(df_agg >> filter(_.improvement_type.str.contains(' | ', regex=False)))

320

In [18]:
full_df_agg = utils.get_new_desc_title(df_agg)

In [19]:
full_df_agg

Unnamed: 0,fmis_transaction_date,project_number,implementing_agency,summary_recipient_defined_text_field_1_value,program_code,program_code_description,recipient_project_number,improvement_type,improvement_type_description,project_title,obligations_amount,congressional_district,district,county_code,county_name,implementing_agency_locode,rtpa_name,mpo_name,project_title_new
0,2022-02-04,5064095,Pasadena,L5064SCAG,YS30,Highway Safety Improvement Program (HSIP),0721000154L,17.0 | 21.0,Construction Engineering | Safety,VARIOUS LOCATIONS THROUGHOUT THE WEST SIDE OF THE CITY. REPAIR EXISTING DAMAGED GUARDRAI,289000.00,Cong Dist 27,7.0,37.0,Los Angeles County,5064.0,Los Angeles County Metropolitan Transportation Auth.,Southern California Association Of Governments,Repair Guardrails in Pasadena
1,2022-02-08,5002199,Sacramento,L5002SACOG,YS30,Highway Safety Improvement Program (HSIP),0317000138L,21.0 | 17.0,Safety | Construction Engineering,"INTERSECTIONS OF 65TH ST EXPWY AT 21ST AVE, 65TH ST EXPWY AT FRUITRIDGE RD, ARDEN WAY AT HERITAGE LN, ARDEN WAY AT CHALLENGE WAY, HOWE AVE. AT SWARTH",1441500.00,Cong Dist 3 | Cong Dist 6,3.0,67.0,Sacramento County,5002.0,Sacramento Area Council of Governments,Sacramento Area Council Of Governments,Safety Improvements in Sacramento
2,2022-02-08,5002209,Sacramento,L5002SACOG,YS30,Highway Safety Improvement Program (HSIP),0319000101L,17.0 | 21.0,Construction Engineering | Safety,DUCKHORN DRIVE FROM ARENA BOULEVARD TO FAR NIENTE WAY INSTALL CURVE WARNING SIGNS AND INSTALL RAISED MEDIANS.,622170.00,Cong Dist 6,3.0,67.0,Sacramento County,5002.0,Sacramento Area Council of Governments,Sacramento Area Council Of Governments,Install Signage in Sacramento
3,2022-02-09,5117016,Hercules,L5117MTC,Y230,Surface Transportation Block Grant (STBG) Program,0420000346L,5.0,4R - Maintenance Resurfacing,HERCULES: SYCAMORE AVE FROM CIVIC DR TO WILLOW/PALM AVE PAVEMENT REHABILITATION,492000.00,Cong Dist 5,4.0,13.0,Contra Costa County,5117.0,Metropolitan Transportation Commission,Metropolitan Transportation Commission,Pavement Rehabilitation in Hercules
4,2022-02-10,5005154,San Jose,L5005MTC,Y230,Surface Transportation Block Grant (STBG) Program,0419000079L,4.0 | 17.0,4R - No Added Capacity | Construction Engineering,"ON MCKEE ROAD BETWEEN ROUTE 101 AND TOYON AVENUE IMPLEMENT SAFETY IMPROVEMENTS INCLUDING MEDIAN ISLANDS, ADA CURB RAMP, SPEED RADAR SIGN, ENHANCED CR",6994933.00,Cong Dist 19,4.0,85.0,Santa Clara County,5005.0,Metropolitan Transportation Commission,Metropolitan Transportation Commission,Improve Signage in San Jose
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547,2022-09-22,6507035,San Bernardino County Transportation Authority,L6507SCAG,Y230,Surface Transportation Block Grant (STBG) Program,0822000183L,15.0,Preliminary Engineering,"I-10 CORRIDOR CONTRACT 2: THE PROJECT WILL PROVIDE ONE EXPRESS LANES IN EACH DIRECTION FROM JUST EAST OF I-15 TOPEPPER AVENUE IN COLTON, CONNECTING T",14000000.00,Cong Dist 35,8.0,71.0,San Bernardino County,6507.0,NON-RTPA,Southern California Association Of Governments,Express Lanes in San Bernardino County Transportation Authority
548,2022-09-22,P057073,Los Angeles County Metropolitan Transportation Authority,L6065SCAG,Y460,National Highway Freight Program (NHFP),0715000076S,11.0 | 17.0 | 42.0,Bridge Replacement - No Added Capacity | Construction Engineering | Training,IN LOS ANGELES COUNTY. ROUTE 57/60 CONFLUENCE CHOKEPOINT RELIEF PROGRAM. RECONSTRUCT GRAND AVENUE OVERCROSSING. RECONSTRUCT NORTHBOUND SR-57 CONNECTO,74000000.00,Cong Dist 39,7.0,37.0,Los Angeles County,6065.0,Los Angeles County Metropolitan Transportation Auth.,Southern California Association Of Governments,Bridge Replacement in Los Angeles County Metropolitan Transportation Authority
549,2022-09-23,0010206,California,S AMBAG,Y410 | Y450 | Y550,Safe and Accessible Transportation Options - Metropolitan Planning | Metropolitan Planning Program | State Planning and Research (SPR),0022000351S,18.0,Planning,OWP 22/23 MONTEREY BAY REGION CONSOLIDATED PLANNING GRANT (CPG) PROJECT FOR ASSOCIATION OF MONTEREY BAY AREA GOVERNMENTS (AMBAG) OVERALL WORK PROGRAM,1826820.02,Cong Dist 0,,999.0,Statewide,,,,Planning and Research Statewide
550,2022-09-23,2023000,California,,YR10,State RTP Administration,,44.0,Other,2023 Rec Trails Project: State Parks Incurred Administrative Expenses,241760.00,Cong Dist 0,,999.0,Statewide,,,,Recreational Trails Project Statewide


### New issue: `project_title_new` taking groupings from `improvement_type_description`


In [20]:
# Run utils.add_description on the project titles, then keep only the rows
# whose project_type is the generic 'Project' flag.
test_df = (
    utils.add_description(full_df_agg, 'project_title')
    >> filter(_.project_type == 'Project')
)
# test_df = (test_df>>select(_.implementing_agency, _.program_code_description, _.improvement_type, _.improvement_type_description, _.project_title,
#                           _.county_name, _.project_title_new, _.project_type))

In [21]:
len(test_df)

188

In [22]:
# regex=False: match the literal ' | ' separator instead of the regex "space OR space".
print((f"There are {len(test_df>>filter(_.improvement_type.str.contains(' | ', regex=False)))} "
           f"entires out of {len(full_df_agg)} that have concatenated entired for improvement_type_description"))

There are 122 entires out of 552 that have concatenated entired for improvement_type_description


In [23]:
#entries with concatenated improvement types descriptions
# regex=False: match the literal ' | ' separator instead of the regex "space OR space".
(test_df>>filter(_.improvement_type.str.contains(' | ', regex=False))).improvement_type_description.value_counts()

Construction Engineering | Safety                                                                                      27
4R - Restoration & Rehabilitation | Construction Engineering | Training                                                15
4R - Restoration & Rehabilitation | Construction Engineering                                                           15
Construction Engineering | Facilities for Pedestrians and Bicycles                                                      9
Construction Engineering | Safety | Training                                                                            9
Bridge Rehabilitation - No Added Capacity | Construction Engineering                                                    4
Bridge Replacement - No Added Capacity | Construction Engineering                                                       4
4R - Maintenance  Resurfacing | Construction Engineering                                                                3
4R - Maintenance  Resurf

In [24]:
## filter down to just the projects with multiple improvement type descriptions
# regex=False: match the literal ' | ' separator instead of the regex "space OR space".
test_df = (test_df>>filter(_.improvement_type.str.contains(' | ', regex=False)))

In [25]:
# #function to rework DOES NOT WORK
# def update_no_matched(df, flag_col, desc_col, program_code_desc_col): 
#     """
#     function to iterate over projects that did not match the first time
#     using an existing project's short description of project type. 
#     """
    
#     def return_project_type(df):
        
#         if (df[flag_col] == "Project") & df[desc_col].str.contains("Bridge Rehabilitation") | df[desc_col].str.contains("Bridge Rehabilitation - No Added Capacity") | df[desc_col].str.contains("Bridge Rehabilitation - Added Capacity"):
#             return ("Bridge Rehabilitation")
        
#         elif (df[flag_col] == "Project") & df[desc_col].str.contains("Facilities for Pedestrians and Bicycles"):
#             return (df[desc_col])
        
#         elif (df[flag_col] == "Project") & df[desc_col].str.contains("Safety"):
#             return (df[desc_col] + " Improvements")
            
#         elif (df[flag_col] == "Project") & df[desc_col].str.contains("Planning "):
#             return "Project Planning" 
            
#         elif (df[flag_col] == "Project") & df[desc_col].str.contains("Preliminary Engineering"):
#             return (df[desc_col] + " Projects ")
        
#         elif (df[flag_col] == "Project") & df[desc_col].str.contains("Construction Engineering"):
#             return (df[desc_col] + " Projects")
        
#         elif (df[flag_col] == "Project") & df[desc_col].str.contains("4R - Restoration & Rehabilitation"):
#             return ("Road Restoration & Rehabilitation")
        
#         elif (df[flag_col] == "Project") & df[desc_col].str.contains("4R - Maintenance  Resurfacing"):
#             return ("Maintenance Resurfacing")
        
#         elif (df[flag_col] == "Project") & df[desc_col].str.contains("Bridge Replacement - Added Capacity")| df[desc_col].str.contains("Bridge Replacement - No Added Capacity") | df[desc_col].str.contains("Bridge New Construction")| df[desc_col].str.contains("Special Bridge"):
#             return ("Bridge Replacement")
        
#         elif (df[flag_col] == "Project") & df[desc_col].str.contains("Mitigation of Water Pollution due to Highway Runoff"):
#             return (df[desc_col])
        
#         elif (df[flag_col] == "Project") & df[desc_col].str.contains("4R - Added Capacity"):
#             return ("Added Roadway Capacity")
        
#         elif (df[flag_col] == "Project") & df[desc_col].str.contains("4R - No Added Capacity"):
#             return ("Road Construction")
        
#         elif (df[flag_col] == "Project") & df[desc_col].str.contains("New  Construction Roadway"):
#             return ("New Construction Roadway")
        
#         elif (df[flag_col] == "Project") & df[desc_col].str.contains("Traffic Management/Engineering - HOV"):
#             return ("Traffic Management Project")
        
#         elif (df[flag_col] == "Project") & df[desc_col].str.contains("Right of Way"):
#             return (df[desc_col] + " Project")
        
#         # elif (df[flag_col] == "Project") & (df[program_code_desc_col]== "National Highway Performance Program (NHPP)"): 
#         #     return ("National Highway Performance Program Support") 
        
#         # elif (df[flag_col] == "Project") & (df[desc_col] != "Other"):
#         #     return (df[desc_col])
    
#         else:
#             return df[flag_col] 

#         return df

#     df['project_type2'] = df.apply(return_project_type, axis = 1)

#     return df

In [26]:
#test_df2 = update_no_matched(test_df, 'project_type', 'improvement_type_description', 'program_code_description')

In [27]:
#test_df2.sample(5)

In [28]:
#trying another version
#update: worked-- moving to utils
def add_description_4_no_match(df, desc_col):
    """
    Add a `project_type2` column derived from keywords found in `desc_col`.

    Each row receives the label of the first rule (in the order listed below)
    whose keyword occurs in the row's `desc_col` text. Rows that match no rule,
    and rows where `desc_col` is missing, receive the generic label "Project".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding `desc_col`. It is modified in place.
    desc_col : str
        Name of the text column to search.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with `project_type2` added (or overwritten).
    """
    # (keyword, label) pairs in priority order: the first keyword found wins,
    # so a bridge keyword beats "Construction Engineering" on the same row.
    # Keywords are matched literally and case-sensitively. Their whitespace
    # (double spaces, the trailing space in "Planning ") is kept exactly as
    # originally written.
    rules = [
        ("Bridge Rehabilitation", "Bridge Rehabilitation"),
        ("Bridge Replacement - Added Capacity", "Bridge Replacement"),
        ("Bridge Replacement - No Added Capacity", "Bridge Replacement"),
        ("Bridge New Construction", "Bridge Replacement"),
        ("Special Bridge", "Bridge Replacement"),
        ("Facilities for Pedestrians and Bicycles", "Facilities for Pedestrians and Bicycles"),
        ("Mitigation of Water Pollution due to Highway Runoff", "Mitigation of Water Pollution due to Highway Runoff"),
        ("Traffic Management/Engineering - HOV", "Traffic Management Project"),
        ("Planning ", "Project Planning"),
        ("4R - Restoration & Rehabilitation", "Road Restoration & Rehabilitation"),
        ("4R - Maintenance  Resurfacing", "Maintenance Resurfacing"),
        ("4R - Added Capacity", "Added Roadway Capacity"),
        ("4R - No Added Capacity", "Road Construction"),
        ("Safety", "Safety Improvements"),
        ("New  Construction Roadway", "New Construction Roadway"),
        ("Preliminary Engineering", "Preliminary Engineering Projects"),
        ("Construction Engineering", "Construction Engineering Projects"),
        ("Right of Way", "Right of Way Project"),
    ]

    descriptions = df[desc_col]

    # na=False turns a missing description into "no match"; without it the NaN
    # mask entries are truthy and the row would pick up the first label.
    conditions = [
        descriptions.str.contains(keyword, regex=False, na=False).to_numpy(dtype=bool)
        for keyword, _label in rules
    ]
    labels = [label for _keyword, label in rules]

    # np.select takes the label of the first True condition per row.
    df['project_type2'] = np.select(conditions, labels, default="Project")

    return df

In [29]:
test_df3 = add_description_4_no_match(test_df, 'improvement_type_description')

In [30]:
test_df3.project_type2.value_counts()

Safety Improvements                                    40
Road Restoration & Rehabilitation                      33
Bridge Replacement                                     12
Facilities for Pedestrians and Bicycles                 9
Bridge Rehabilitation                                   7
Maintenance Resurfacing                                 6
Road Construction                                       3
Mitigation of Water Pollution due to Highway Runoff     3
Preliminary Engineering Projects                        3
Traffic Management Project                              2
Construction Engineering Projects                       2
New Construction Roadway                                1
Added Roadway Capacity                                  1
Name: project_type2, dtype: int64

In [31]:
test_df3.loc[0:60]

Unnamed: 0,fmis_transaction_date,project_number,implementing_agency,summary_recipient_defined_text_field_1_value,program_code,program_code_description,recipient_project_number,improvement_type,improvement_type_description,project_title,obligations_amount,congressional_district,district,county_code,county_name,implementing_agency_locode,rtpa_name,mpo_name,project_title_new,project_method,project_type,project_type2
1,2022-02-08,5002199,Sacramento,L5002SACOG,YS30,Highway Safety Improvement Program (HSIP),0317000138L,21.0 | 17.0,Safety | Construction Engineering,"INTERSECTIONS OF 65TH ST EXPWY AT 21ST AVE, 65TH ST EXPWY AT FRUITRIDGE RD, ARDEN WAY AT HERITAGE LN, ARDEN WAY AT CHALLENGE WAY, HOWE AVE. AT SWARTH",1441500.0,Cong Dist 3 | Cong Dist 6,3.0,67.0,Sacramento County,5002.0,Sacramento Area Council of Governments,Sacramento Area Council Of Governments,Safety Improvements in Sacramento,,Project,Safety Improvements
13,2022-02-15,5152022,Morgan Hill,L5152MTC,Y230,Surface Transportation Block Grant (STBG) Program,0422000001L,6.0 | 17.0,4R - Restoration & Rehabilitation | Construction Engineering,EAST DUNNE AVENUE BETWEEN LOWER THOMAS GRADE AND THE EASTERN CITY LIMIT. FEDERAL PARTICIPATING LOCATION SEGMENTS: 1) LOWER THOMAS GRADE TO FLAMING OA,857000.0,Cong Dist 19,4.0,85.0,Santa Clara County,5152.0,Metropolitan Transportation Commission,Metropolitan Transportation Commission,Road Restoration & Rehabilitation in Morgan Hill,,Project,Road Restoration & Rehabilitation
14,2022-02-15,5213039,Sunnyvale,L5213MTC,Y001,National Highway Performance Program (NHPP),04925752L,14.0 | 17.0,Bridge Rehabilitation - No Added Capacity | Construction Engineering,"FAIR OAKS AVE OVER CALTRAIN & UPRR, BRIDGE # 37C0765\r THIS REQUEST IS TO CONVERT AC FUNDS UNDER CONSTRUCTION AND R/W TO REGULAR FEDERAL FUNDS. BRIDGE",5051908.0,Cong Dist 17,4.0,85.0,Santa Clara County,5213.0,Metropolitan Transportation Commission,Metropolitan Transportation Commission,Bridge Rehabilitation in Sunnyvale,Construct,Project,Bridge Rehabilitation
15,2022-02-15,5213060,Sunnyvale,L5213MTC,Y003,Projects to Reduce PM 2.5 Emissions,0418000456L,6.0 | 17.0,4R - Restoration & Rehabilitation | Construction Engineering,"IN EAST SUNNYVALE SENSE OF PLACE PLAN AREA: ON DUANE AVENUE FROM FAIR OAKS AVENUE TO LAWRENCE EXPRESSWAY, STEWART DRIVE FROM WOLFE ROAD TO DUANE AVEN",1741000.0,Cong Dist 17,4.0,85.0,Santa Clara County,5213.0,Metropolitan Transportation Commission,Metropolitan Transportation Commission,Road Restoration & Rehabilitation in Sunnyvale,,Project,Road Restoration & Rehabilitation
17,2022-02-15,5928128,Contra Costa County,L5928MTC,Y001,National Highway Performance Program (NHPP),0416000343L,11.0 | 17.0,Bridge Replacement - No Added Capacity | Construction Engineering,MARSH DRIVE BRIDGE NO. 28C-0442 OVER THE WALNUT CREEK CHANNEL REPLACE EXISTING TWO-LANE BRIDGE WITH A NEW TWO-LANE BRIDGE,8482104.0,Cong Dist 11,4.0,13.0,Contra Costa County,5928.0,Metropolitan Transportation Commission,Metropolitan Transportation Commission,Bridge Replacement in Contra Costa County,Replace,Project,Bridge Replacement
18,2022-02-16,5006839,Los Angeles,L5006SCAG,Y001,National Highway Performance Program (NHPP),0716000279L,11.0 | 17.0,Bridge Replacement - No Added Capacity | Construction Engineering,"SIXTH STREET VIADUCT OVER LA RIVER, US 101, AND UPRR, BNSF, AMTRACK, LACMTA AND METROLINK TRACKS BRIDGE # 53C-1880 AND STATE BRIDGE # 53-0595THIS PRO",21400000.0,Cong Dist 34,7.0,37.0,Los Angeles County,5006.0,Los Angeles County Metropolitan Transportation Auth.,Southern California Association Of Governments,Bridge Replacement in Los Angeles,,Project,Bridge Replacement
24,2022-02-16,5919114,Placer County,L5919SACOG,Y233,Surface Transportation Block Grant (STBG) Program,0313000225L,11.0 | 17.0,Bridge Replacement - No Added Capacity | Construction Engineering,CROSBY HAROLD ROAD OVER DOTY CREEK 0.9 MILES NORTH OF WISE ROAD. BR.# 19C0111 REPLACE 1-LANE BRIDGE W/ A 2-LANE BRIDGE (TC),3987897.0,Cong Dist 1,3.0,61.0,Placer County,5919.0,Placer County Transportation Planning Agency,Sacramento Area Council Of Governments,Bridge Replacement in Placer County,Replace,Project,Bridge Replacement
47,2022-03-07,5102051,San Mateo,L5102MTC,Y230,Surface Transportation Block Grant (STBG) Program,0420000363L,5.0 | 17.0,4R - Maintenance Resurfacing | Construction Engineering,"HILLSDALE BLVD, PACIFIC BLVD, PALM AVE, AND BERMUDA DR RESURFACING OF PACIFIC BLVD AND BERMUDA DR, SLURRY SEAL APPLICATION ON PALM AVE AND HILLSDALE",1593000.0,Cong Dist 14,4.0,81.0,San Mateo County,5102.0,Metropolitan Transportation Commission,Metropolitan Transportation Commission,Maintenance Resurfacing in San Mateo,,Project,Maintenance Resurfacing
56,2022-03-14,5012157,Oakland,L5012MTC,Y230,Surface Transportation Block Grant (STBG) Program,0419000338L,6.0 | 17.0,4R - Restoration & Rehabilitation | Construction Engineering,OAKPORT ST FROM EDGEWATER TO I-880 FREEWAY ENTRANCE; OAKLAND AVE FROM ORANGE STREET TO MACARTHUR; MONTEREY BLVD FROM MAIDEN LN TO BENNET PL IN OAKLAN,4895000.0,Cong Dist 13,4.0,1.0,Alameda County,5012.0,Metropolitan Transportation Commission,Metropolitan Transportation Commission,Road Restoration & Rehabilitation in Oakland,,Project,Road Restoration & Rehabilitation


In [32]:
test_df3>>filter(_.improvement_type_description.str.contains("Other"))

Unnamed: 0,fmis_transaction_date,project_number,implementing_agency,summary_recipient_defined_text_field_1_value,program_code,program_code_description,recipient_project_number,improvement_type,improvement_type_description,project_title,obligations_amount,congressional_district,district,county_code,county_name,implementing_agency_locode,rtpa_name,mpo_name,project_title_new,project_method,project_type,project_type2
240,2022-06-16,P014105,California,S SCAG,Y001,National Highway Performance Program (NHPP),0716000029S,17.0 | 42.0 | 44.0,Construction Engineering | Training | Other,"IN AND NEAR SANTA CLARITA, FROM CANYON PARK BOULEVARD TO SPRING CANYON ROAD UNDERCROSSING. MINOR CONCRETE BEYOND GORE, SLOPE PAVE BENEATH ABUTMENTS,",4243100.0,Cong Dist 25,7.0,37.0,Los Angeles County,,,,Construction Engineering Projects in Los Angeles County,,Project,Construction Engineering Projects
519,2022-09-20,5956241,Riverside County,L5956SCAG,Y400,Congestion Mitigation & Air Quality Improvement (CMAQ),0815000042L,17.0 | 44.0,Construction Engineering | Other,NORTH SIDE OF SALT CREEK FLOOD CONTROL CHANNEL FROM GOETZ ROAD TO ANTELOPE ROAD IN THE CITY OF MENIFEE AND ON THE NORTH SIDE OF DOMENIGONI PARKWAY FR,326612.52,Cong Dist 36,8.0,65.0,Riverside County,5956.0,Riverside County Transportation Commission,Southern California Association Of Governments,Construction Engineering Projects in Riverside County,,Project,Construction Engineering Projects


### Add new function to larger function

In [33]:
# using df_agg

df = utils.condense_df(df)

In [34]:
proj_unique_cat = utils.add_description(df, 'project_title')

In [35]:
len(proj_unique_cat>>filter(_.project_type=='Project'))

188

In [36]:
proj_unique_cat.loc[proj_unique_cat['project_type'] == 'Project', 'project_method'] = ""

In [37]:
# Turn the generic 'Project' placeholder into a missing value.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
proj_unique_cat['project_type'] = proj_unique_cat['project_type'].replace('Project', np.nan)

In [38]:
proj_unique_cat_title =  utils.add_description_4_no_match(proj_unique_cat, 'improvement_type_description')


In [39]:
proj_unique_cat_title['project_type'] = proj_unique_cat_title['project_type'].fillna(proj_unique_cat_title['project_type2'])


In [40]:
# NOTE(review): this passes proj_unique_cat rather than proj_unique_cat_title, so it
# presumably relies on the utils helpers mutating and returning the same frame — confirm.
proj_unique_cat_title = utils.add_new_title(proj_unique_cat, "project_method", "project_type", "implementing_agency", "county_name")


In [41]:
# The generated title arrives in 'project_name_new'; expose it under the
# 'project_title_new' name used by the mapping step below.
proj_unique_cat_title = proj_unique_cat_title.rename(
    columns={'project_name_new': 'project_title_new'}
)
#proj_unique_cat_title.drop(columns =['project_method', 'project_type', 'project_type2'], axis=1, inplace=True)

In [42]:
# Lookup table: project_number -> generated title. If a project_number
# repeats, the last occurrence wins.
proj_title_mapping = dict(
    zip(
        proj_unique_cat_title['project_number'],
        proj_unique_cat_title['project_title_new'],
    )
)

In [43]:
# Attach the generated title to each row of df, keyed on project_number.
df['project_title_new'] = df['project_number'].map(proj_title_mapping)

In [44]:
# Inspect the result. In the saved output df carries both 'project_name_new'
# and 'project_title_new', with identical values in the visible rows.
df

Unnamed: 0,fmis_transaction_date,project_number,implementing_agency,summary_recipient_defined_text_field_1_value,program_code,program_code_description,recipient_project_number,improvement_type,improvement_type_description,project_title,obligations_amount,congressional_district,district,county_code,county_name,implementing_agency_locode,rtpa_name,mpo_name,project_method,project_type,project_type2,project_name_new,project_title_new
0,2022-02-04,5064095,Pasadena,L5064SCAG,YS30,Highway Safety Improvement Program (HSIP),0721000154L,17.0 | 21.0,Construction Engineering | Safety,VARIOUS LOCATIONS THROUGHOUT THE WEST SIDE OF THE CITY. REPAIR EXISTING DAMAGED GUARDRAI,289000.00,Cong Dist 27,7.0,37.0,Los Angeles County,5064.0,Los Angeles County Metropolitan Transportation Auth.,Southern California Association Of Governments,Repair,Guardrails,Safety Improvements,Repair Guardrails in Pasadena,Repair Guardrails in Pasadena
1,2022-02-08,5002199,Sacramento,L5002SACOG,YS30,Highway Safety Improvement Program (HSIP),0317000138L,21.0 | 17.0,Safety | Construction Engineering,"INTERSECTIONS OF 65TH ST EXPWY AT 21ST AVE, 65TH ST EXPWY AT FRUITRIDGE RD, ARDEN WAY AT HERITAGE LN, ARDEN WAY AT CHALLENGE WAY, HOWE AVE. AT SWARTH",1441500.00,Cong Dist 3 | Cong Dist 6,3.0,67.0,Sacramento County,5002.0,Sacramento Area Council of Governments,Sacramento Area Council Of Governments,,Safety Improvements,Safety Improvements,Safety Improvements in Sacramento,Safety Improvements in Sacramento
2,2022-02-08,5002209,Sacramento,L5002SACOG,YS30,Highway Safety Improvement Program (HSIP),0319000101L,17.0 | 21.0,Construction Engineering | Safety,DUCKHORN DRIVE FROM ARENA BOULEVARD TO FAR NIENTE WAY INSTALL CURVE WARNING SIGNS AND INSTALL RAISED MEDIANS.,622170.00,Cong Dist 6,3.0,67.0,Sacramento County,5002.0,Sacramento Area Council of Governments,Sacramento Area Council Of Governments,Install,Signage,Safety Improvements,Install Signage in Sacramento,Install Signage in Sacramento
3,2022-02-09,5117016,Hercules,L5117MTC,Y230,Surface Transportation Block Grant (STBG) Program,0420000346L,5.0,4R - Maintenance Resurfacing,HERCULES: SYCAMORE AVE FROM CIVIC DR TO WILLOW/PALM AVE PAVEMENT REHABILITATION,492000.00,Cong Dist 5,4.0,13.0,Contra Costa County,5117.0,Metropolitan Transportation Commission,Metropolitan Transportation Commission,,Pavement Rehabilitation,Maintenance Resurfacing,Pavement Rehabilitation in Hercules,Pavement Rehabilitation in Hercules
4,2022-02-10,5005154,San Jose,L5005MTC,Y230,Surface Transportation Block Grant (STBG) Program,0419000079L,4.0 | 17.0,4R - No Added Capacity | Construction Engineering,"ON MCKEE ROAD BETWEEN ROUTE 101 AND TOYON AVENUE IMPLEMENT SAFETY IMPROVEMENTS INCLUDING MEDIAN ISLANDS, ADA CURB RAMP, SPEED RADAR SIGN, ENHANCED CR",6994933.00,Cong Dist 19,4.0,85.0,Santa Clara County,5005.0,Metropolitan Transportation Commission,Metropolitan Transportation Commission,Improve,Signage,Road Construction,Improve Signage in San Jose,Improve Signage in San Jose
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547,2022-09-22,6507035,San Bernardino County Transportation Authority,L6507SCAG,Y230,Surface Transportation Block Grant (STBG) Program,0822000183L,15.0,Preliminary Engineering,"I-10 CORRIDOR CONTRACT 2: THE PROJECT WILL PROVIDE ONE EXPRESS LANES IN EACH DIRECTION FROM JUST EAST OF I-15 TOPEPPER AVENUE IN COLTON, CONNECTING T",14000000.00,Cong Dist 35,8.0,71.0,San Bernardino County,6507.0,NON-RTPA,Southern California Association Of Governments,,Express Lanes,Preliminary Engineering Projects,Express Lanes in San Bernardino County Transportation Authority,Express Lanes in San Bernardino County Transportation Authority
548,2022-09-22,P057073,Los Angeles County Metropolitan Transportation Authority,L6065SCAG,Y460,National Highway Freight Program (NHFP),0715000076S,11.0 | 17.0 | 42.0,Bridge Replacement - No Added Capacity | Construction Engineering | Training,IN LOS ANGELES COUNTY. ROUTE 57/60 CONFLUENCE CHOKEPOINT RELIEF PROGRAM. RECONSTRUCT GRAND AVENUE OVERCROSSING. RECONSTRUCT NORTHBOUND SR-57 CONNECTO,74000000.00,Cong Dist 39,7.0,37.0,Los Angeles County,6065.0,Los Angeles County Metropolitan Transportation Auth.,Southern California Association Of Governments,,Bridge Replacement,Bridge Replacement,Bridge Replacement in Los Angeles County Metropolitan Transportation Authority,Bridge Replacement in Los Angeles County Metropolitan Transportation Authority
549,2022-09-23,0010206,California,S AMBAG,Y410 | Y450 | Y550,Safe and Accessible Transportation Options - Metropolitan Planning | Metropolitan Planning Program | State Planning and Research (SPR),0022000351S,18.0,Planning,OWP 22/23 MONTEREY BAY REGION CONSOLIDATED PLANNING GRANT (CPG) PROJECT FOR ASSOCIATION OF MONTEREY BAY AREA GOVERNMENTS (AMBAG) OVERALL WORK PROGRAM,1826820.02,Cong Dist 0,,999.0,Statewide,,,,,Planning and Research,Project Planning,Planning and Research Statewide,Planning and Research Statewide
550,2022-09-23,2023000,California,,YR10,State RTP Administration,,44.0,Other,2023 REC TRAILS PROJECT: STATE PARKS INCURRED ADMINISTRATIVE EXPENSES,241760.00,Cong Dist 0,,999.0,Statewide,,,,,Recreational Trails Project,Project,Recreational Trails Project Statewide,Recreational Trails Project Statewide


In [45]:
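# Reference copy of the steps from get_new_desc_title (left commented out; the cells above run the same sequence one step at a time)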
    
    
#     proj_unique_cat = add_description(proj_unique, 'project_title')
    
#     #remove project method column values so that the title function won't double count
#     proj_unique_cat.loc[proj_unique_cat['project_type'] == 'Project', 'project_method'] = ""
    # proj_unique_cat['project_type'].replace('Project', np.NaN)
    
    #update for the projects not in the first round of descriptions
    # proj_unique_cat_title =  add_description_4_no_match(proj_unique_cat, 'improvement_type_description')
    ## proj_unique_cat_title = update_no_matched(proj_unique_cat, "project_type", 'improvement_type_description', 'program_code_description')
    
    #fill nan values in 'Project_type' with values from 'project_type2' from add_description_4_no_match function
    # proj_unique_cat_title['project_type'] = proj_unique_cat_title['project_type'].fillna(proj_unique_cat_title['project_type2'])
    
    #add title - second round to account for statewide projects
    # proj_unique_cat_title = add_new_title(proj_unique_cat, "project_method", "project_type", "implementing_agency", "county_name")
    
    # rename new title one
    # proj_unique_cat_title = proj_unique_cat_title.rename(columns={'project_name_new':'project_title_new'})
    # proj_unique_cat_title.drop(columns =['project_method', 'project_type', 'project_type2'], axis=1, inplace=True)
    
    #map the title back to df
#     proj_title_mapping = (dict(proj_unique_cat_title[['project_number', 'project_title_new']].values))
    
#     df['project_title_new'] = df.project_number.map(proj_title_mapping)

#     return df

## Test & Export

In [46]:
# Cloud Storage prefix used by the (currently commented-out) to_csv exports below.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/dla/dla-iija"

In [47]:
# Smoke test of the aggregated output.
# Last recorded result: PASS — title column present, values concatenated, 552 rows.
agg = utils.get_clean_data(full_or_agg='agg')

In [48]:
# Alphabetical listing of the generated titles for a quick visual scan.
# NOTE(review): entries in the saved output begin with a space (e.g.
# ' Bike Lanes in Alameda') — presumably an empty project_method being joined
# into the title; worth stripping in the title helper.
sorted(agg['project_title_new'])

[' Added Roadway Capacity in Marin County',
 ' Added Roadway Capacity in Orange County',
 ' Bike Lanes in Alameda',
 ' Bike Share Program in Metropolitan Transportation Commission',
 ' Bike Share Program in Windsor',
 ' Bridge Preventive Maintenance in Humboldt County',
 ' Bridge Preventive Maintenance in Pleasanton',
 ' Bridge Rehabilitation in Chico',
 ' Bridge Rehabilitation in Contra Costa County',
 ' Bridge Rehabilitation in Los Angeles County',
 ' Bridge Rehabilitation in Monterey County',
 ' Bridge Rehabilitation in Palm Springs',
 ' Bridge Rehabilitation in Sacramento County',
 ' Bridge Rehabilitation in San Benito County',
 ' Bridge Rehabilitation in San Diego',
 ' Bridge Rehabilitation in San Francisco County',
 ' Bridge Rehabilitation in San Francisco County',
 ' Bridge Rehabilitation in San Joaquin County',
 ' Bridge Rehabilitation in San Joaquin County',
 ' Bridge Rehabilitation in San Luis Obispo County',
 ' Bridge Rehabilitation in Sierra County',
 ' Bridge Rehabilitatio

In [51]:
# Spot-check five random rows (unseeded, so the rows differ on every run).
# NOTE(review): in the saved output, project P051031 (SR 51 in Sacramento,
# recipient 'S SACOG') is labelled district 11 / San Diego County — worth
# checking how district and county are assigned upstream.
agg.sample(5)

Unnamed: 0,fmis_transaction_date,project_number,implementing_agency,summary_recipient_defined_text_field_1_value,program_code,program_code_description,recipient_project_number,improvement_type,improvement_type_description,project_title,obligations_amount,congressional_district,district,county_code,county_name,implementing_agency_locode,rtpa_name,mpo_name,project_title_new
101,2022-04-29,P051031,California,S SACOG,Y001,National Highway Performance Program (NHPP),0316000113S,15.0,Preliminary Engineering,SR 51 FROM J STREET TO ARDEN WAY IN SACRAMENTO SR 51 CORRIDOR IMPROVEMENTS,3007542.84,Cong Dist 51,11.0,73.0,San Diego County,,,,Preliminary Engineering Projects in San Diego County
211,2022-06-12,0055156,California,S MCAG,Y001,National Highway Performance Program (NHPP),1016000046S,5.0 | 17.0 | 42.0,4R - Maintenance Resurfacing | Construction Engineering | Training,"MERCED COUNTY 0.4 MILE NORTH OF THE FRESNO COUNTY LINE AT THE JOHN ""CHUCK"" ERRECA SAFETY ROADSIDE REST AREA REPLACE SAFETY ROADSIDE REST AREA (SRRA)",31405894.22,Cong Dist 16,10.0,47.0,Merced County,,,,Maintenance Resurfacing in Merced County
483,2022-09-13,5451023,Mission Viejo,L5451SCAG,Y230,Surface Transportation Block Grant (STBG) Program,12931559L,13.0 | 17.0,Bridge Rehabilitation - Added Capacity | Construction Engineering,LA PAZ ROAD: MUIRLANDS BLVD. TO CRISANTA DR. WIDEN TWO OVERHEADS OVER BNSF,5420508.0,Cong Dist 45,12.0,59.0,Orange County,5451.0,Orange County Transportation Authority,Southern California Association Of Governments,Widen Road in Mission Viejo
478,2022-09-12,P204006,California,S KCOG,YS30,Highway Safety Improvement Program (HSIP),0617000297S,17.0 | 21.0,Construction Engineering | Safety,KERN COUNTY IN BAKERSFIELD AT 8TH STREET. INSTALLATION OF HIGH INTENSITY ACTIVATED CROSSWALK.,1818500.0,Cong Dist 21,9.0,29.0,Kern County,,,,Safety Improvements in Kern County
181,2022-05-18,5378049,Palmdale,L5378SCAG,Y230,Surface Transportation Block Grant (STBG) Program,0722000002L,1.0,New Construction Roadway,"AVE R BETWEEN SIERRA HIGHWAY AND 25TH STREET. AVE R COMPLETE STREET WITH SIDEWALKS GAP CLOSURES, BIKE LANES, ADA RAMPS AND ENHANCED CROSSWALKS",3200541.0,Cong Dist 25,7.0,37.0,Los Angeles County,5378.0,Los Angeles County Metropolitan Transportation Auth.,Southern California Association Of Governments,Complete Streets in Palmdale


In [52]:
# Rename the columns for export.
# NOTE(review): presumably converts the snake_case names to title case —
# confirm in utils.title_column_names. This rebinds `agg`.
agg = utils.title_column_names(agg)

In [57]:
# agg.to_csv(f"{GCS_FILE_PATH}/FMIS_projects_agg.csv")

In [54]:
# Smoke test of the full (non-aggregated) output.
# Last recorded result: PASS — title column present, no concatenation, 1241 rows.
full = utils.get_clean_data(full_or_agg='full')

In [55]:
# Same column renaming as applied to `agg`, here on the full output.
# NOTE(review): presumably converts the snake_case names to title case —
# confirm in utils.title_column_names. This rebinds `full`.
full = utils.title_column_names(full)

In [58]:
# full.to_csv(f"{GCS_FILE_PATH}/FMIS_projects_all.csv")