In [1]:
import pandas as pd
from siuba import *
import numpy as np

import altair as alt

from shared_utils import altair_utils
from shared_utils import geography_utils
from shared_utils import calitp_color_palette as cp
from shared_utils import styleguide

from calitp import to_snakecase



In [2]:
# 5310 organization info CSV stored in the calitp-analytics-data bucket.
ORG_INFO_PATH = "gs://calitp-analytics-data/data-analyses/5310/5310_org_info_manualfill.csv"
df = pd.read_csv(ORG_INFO_PATH)

In [3]:
# Peek at five random rows (no random_state is set, so the rows differ between runs).
df.sample(n=5)

Unnamed: 0,organization_name,ntd_id,itp_id,name_NTD_Airtable,doing_business_as,reporter_acronym,organization_type,reporter_type,city,county,mobility_services_operated,merge_status
132,Lake Links Inc,,,,,,Non-Profit Organization,,Clearlake,Lake,,No Match
79,Humboldt Medi-Trans,,,Humboldt Medi-Trans Inc,,,Non-Profit Organization,,McKinleyville,Humboldt,Humboldt Medi-Trans,"Manual Match, Airtable"
10,City of Santa Rosa,90017.0,301.0,City of Santa Rosa,,,"City, County or Local Government Unit or Depar...",Full Reporter,Santa Rosa,Sonoma,"Santa Rosa CityBus,Santa Rosa Call Ahead Service",Name Match; NTD
64,Sunline Transit Agency,90079.0,327.0,SunLine Transit Agency,,,Independent Public Agency or Authority of Tran...,Full Reporter,Thousand Palms,Riverside,"SunLine Transit,SunLine School Tripper,SunDial...","Manual Match, NTD"
119,Calaveras County Public Works,,,,,,"City, County or Local Government Unit or Depar...",,San Andreas,Calaveras,,No Match


In [4]:
# Column dtypes and non-null counts for the raw table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   organization_name           147 non-null    object 
 1   ntd_id                      37 non-null     object 
 2   itp_id                      78 non-null     float64
 3   name_NTD_Airtable           108 non-null    object 
 4   doing_business_as           7 non-null      object 
 5   reporter_acronym            28 non-null     object 
 6   organization_type           147 non-null    object 
 7   reporter_type               37 non-null     object 
 8   city                        146 non-null    object 
 9   county                      146 non-null    object 
 10  mobility_services_operated  90 non-null     object 
 11  merge_status                147 non-null    object 
dtypes: float64(1), object(11)
memory usage: 13.9+ KB


In [5]:
# Number of organizations per raw `county` value, largest first.
(
    df
    >> count(_.county)
    >> arrange(-_.n)
)

Unnamed: 0,county,n
32,Riverside,10
0,Alameda,9
35,San Bernardino,9
9,Contra Costa,8
38,San Francisco,7
19,Kern,6
24,Mendocino,5
51,Solano,5
33,Sacramento,4
37,San Diego,4


In [6]:
# Rows whose `county` value is a comma-separated list of several counties.
lists_several_counties = _.county.str.contains(", ")
df >> filter(lists_several_counties)

Unnamed: 0,organization_name,ntd_id,itp_id,name_NTD_Airtable,doing_business_as,reporter_acronym,organization_type,reporter_type,city,county,mobility_services_operated,merge_status
2,Area 1 Agency on Aging,,399.0,Area 1 Agency on Aging,,,Non-Profit Organization,,Eureka,"Humboldt, De Norte",,Name Match; Airtable
3,Big Valley 50 Plus,,404.0,Big Valley 50 Plus,,,Non-Profit Organization,,Beiber,"Lassen, Modoc",Big Valley 50 Plus,Name Match; Airtable
13,Common Ground Senior Services,,419.0,Common Ground Senior Services,,,Non-Profit Organization,,San Andreas,"Calaveras, Amador, Tuolumne",Silver Streak Transport,Name Match; Airtable
19,Friends of Children with Special Needs,,,Friends of Children with Special Needs,,,Non-Profit Organization,,Fremont,"Santa Clara, Alameda",,Name Match; Airtable
24,Inyo-Mono Association for the Handicapped,,434.0,Inyo-Mono Association for the Handicapped,,,Non-Profit Organization,,Bishop,"Inyo, Mono",Inyo-Mono Association for the Handicapped,Name Match; Airtable
51,Self-Help for the Elderly,,,Self-Help for the Elderly,,,Non-Profit Organization,,San Francisco,"San Francisco, Alameda, San Mateo, Santa Clara",Self-Help for the Elderly,Name Match; Airtable
52,Tahoe Transportation District,91092.0,331.0,Tahoe Transportation District,,TTD,Independent Public Agency or Authority of Tran...,Full Reporter,Zephyr Cove,"Douglas, Placer, Washoe, Carson City","Tahoe Truckee Area Regional Transportation,Tah...",Name Match; Airtable
53,The Arc of Amador and Calaveras,,461.0,The Arc of Amador and Calaveras,,,Non-Profit Organization,,Sutter Creek,"Amador, Calaveras",The Arc of Amador and Calaveras,Name Match; Airtable
74,Community Bridges Liftline,,420.0,Community Bridges Inc,,,Non-Profit Organization,,Watsonville,"Monterey, San Mateo, San Benito, Santa Clara, ...",Lift Line,"Manual Match, Airtable"
80,Lutheran Social Services of Southern California,,439.0,Lutheran Social Services,,,Non-Profit Organization,,Orange,"Orange, Los Angeles, Riverside, San Bernardino...",,"Manual Match, Airtable"


In [7]:
# Number of organizations per organization type, largest first.
(
    df
    >> count(_.organization_type)
    >> arrange(-_.n)
)

Unnamed: 0,organization_type,n
5,Non-Profit Organization,90
3,Independent Public Agency or Authority of Tran...,28
0,"City, County or Local Government Unit or Depar...",12
2,Independent Agency,5
1,Company,4
4,MPO/RTPA,3
6,Other,3
7,Private-Non-Profit Corporation,1
8,University,1


## Charts

In [8]:
# One random row as a reminder of the available columns.
df.sample(n=1)

Unnamed: 0,organization_name,ntd_id,itp_id,name_NTD_Airtable,doing_business_as,reporter_acronym,organization_type,reporter_type,city,county,mobility_services_operated,merge_status
135,Tuolumne County Transit Agency (TCTA),9R02-91057,,Tuolumne County Transit,,TCT,Independent Public Agency or Authority of Tran...,Rural Reporter,Sonora,Tuolumne,,"Manual Match, NTD"


In [9]:
def labeling(word):
    """Convert a column name into a display label for chart titles and legends.

    Resolution order:
      1. ``"mpo"`` / ``"rtpa"`` are returned upper-cased.
      2. Names listed in the override table return their fixed label.
      3. Anything else has a ``unique_`` prefix expanded, underscores turned
         into spaces, and is title-cased.

    Parameters
    ----------
    word : str
        Column name to convert.

    Returns
    -------
    str
        The display label.
    """
    # Acronyms are shown in capitals rather than title case.
    if word in ("mpo", "rtpa"):
        return word.upper()

    overrides = {
        "dist": "District",
        "nunique": "Number of Unique",
        "n": "Count",
        "name_NTD_Airtable": "Organization Name",
    }
    if word in overrides:
        return overrides[word]

    # Generic fallback: expand the prefix, then swap underscores for spaces.
    expanded = word.replace("unique_", "Number of Unique ").title()
    return expanded.replace("_", " ").title()

In [10]:
def bar_chart(df, x_col, y_col, color_col, chart_title=""):
    """Build a bar chart of ``y_col`` against ``x_col``, colored by ``color_col``.

    Axis and legend titles come from ``labeling``; the x axis is sorted by
    descending y value.

    Parameters
    ----------
    df : pandas.DataFrame
        Data handed to ``alt.Chart``.
    x_col, y_col : str
        Columns encoded on the x and y axes.
    color_col : str
        Column driving bar color and the legend.
    chart_title : str, optional
        Title set on the chart (empty by default).

    Returns
    -------
    The result of ``styleguide.preset_chart_config`` applied to the chart.
    """
    x_axis = alt.X(x_col, title=labeling(x_col), sort=("-y"))
    y_axis = alt.Y(y_col, title=labeling(y_col))
    fill = alt.Color(
        color_col,
        scale=alt.Scale(range=altair_utils.CALITP_CATEGORY_BRIGHT_COLORS),
        legend=alt.Legend(title=(labeling(color_col)), symbolLimit=10),
    )

    bars = alt.Chart(df).mark_bar().encode(x=x_axis, y=y_axis, color=fill)
    titled = bars.properties(title=chart_title)

    return styleguide.preset_chart_config(titled)

In [11]:
# Organization tally per type, reused by the charts below.
org_counts = (
    df
    >> count(_.organization_type)
)

In [12]:
# Largest organization types first.
(
    org_counts
    >> arrange(-_.n)
)

Unnamed: 0,organization_type,n
5,Non-Profit Organization,90
3,Independent Public Agency or Authority of Tran...,28
0,"City, County or Local Government Unit or Depar...",12
2,Independent Agency,5
1,Company,4
4,MPO/RTPA,3
6,Other,3
7,Private-Non-Profit Corporation,1
8,University,1


In [13]:
bar_chart(
    df=org_counts,
    x_col="organization_type",
    y_col="n",
    color_col="organization_type",
    chart_title="5310 Organization Types",
)

In [14]:
def charttext(df, x_col, y_col, color_col, chart_title=""):
    """Display a styled vertical bar chart with each bar's value printed above it.

    Parameters
    ----------
    df : pandas.DataFrame
        Data handed to ``alt.Chart``.
    x_col, y_col : str
        Columns encoded on the x and y axes; the x axis is sorted by
        descending y value. ``y_col`` also supplies the label text.
    color_col : str
        Column driving bar color and the legend.
    chart_title : str, optional
        Title for the layered chart. When left empty no title is set,
        which matches the behavior before this parameter existed.

    Returns
    -------
    None
        The chart is rendered with ``display`` rather than returned.
    """
    bars = alt.Chart(df).mark_bar().encode(
        x=alt.X(x_col, title=labeling(x_col), sort=("-y")),
        y=alt.Y(y_col, title=labeling(y_col)),
        color=alt.Color(
            color_col,
            scale=alt.Scale(range=altair_utils.CALITP_CATEGORY_BRIGHT_COLORS),
            legend=alt.Legend(title=(labeling(color_col)), symbolLimit=10),
        ),
    )

    # The bars are vertical, so center each label on its bar and rest it just
    # above the bar top. (align="left" with a positive dy is the recipe for
    # horizontal bars and leaves the text off-center and overlapping the bar.)
    labels = bars.mark_text(
        align="center", baseline="bottom", color="black", dy=-3
    ).encode(
        # The color encoding inherited from `bars` would tint the text per
        # category; pin it to black instead.
        text=y_col, color=alt.value("black")
    )

    layered = bars + labels
    if chart_title:
        layered = layered.properties(title=chart_title)

    display(styleguide.preset_chart_config(layered))

In [15]:
charttext(
    df=org_counts,
    x_col="organization_type",
    y_col="n",
    color_col="organization_type",
)

In [16]:
## Create a long DataFrame (one row per organization-county pair) for county counts

In [17]:
# Another random five-row peek before reshaping the county column.
df.sample(n=5)

Unnamed: 0,organization_name,ntd_id,itp_id,name_NTD_Airtable,doing_business_as,reporter_acronym,organization_type,reporter_type,city,county,mobility_services_operated,merge_status
44,Redwood Coast Transit Authority,9R02-91097,261.0,Redwood Coast Transit Authority,,RCTA,Independent Public Agency or Authority of Tran...,Rural Reporter,Crescent City,Del Norte,"Redwood Coast Transit,Redwood Coast Dial-A-Ride",Name Match; NTD
10,City of Santa Rosa,90017,301.0,City of Santa Rosa,,,"City, County or Local Government Unit or Depar...",Full Reporter,Santa Rosa,Sonoma,"Santa Rosa CityBus,Santa Rosa Call Ahead Service",Name Match; NTD
100,United Cerebral Palsy Association of Greater S...,,464.0,United Cerebral Palsy Association of Greater S...,,,Non-Profit Organization,,Sacramento,Sacramento,UCP Safe Trip,"Manual Match, Airtable"
109,County of Sonoma Human Services Department Adu...,,,,,,Independent Agency,,Sonoma,Sonoma,,No Match
78,Faith in Action,,,Faith in Action Inc,,,Non-Profit Organization,,Vacaville,Solano,Ride with Pride,"Manual Match, Airtable"


In [18]:
# Keep rows whose county field has no comma, i.e. names a single county.
# na=False treats a missing county as "no comma", so such rows are kept too.
has_comma = df["county"].str.contains(",", na=False)
single_county = df[~has_comma]

In [19]:
# Organizations per county among single-county rows, largest first.
(
    single_county
    >> count(_.county)
    >> arrange(-_.n)
)

Unnamed: 0,county,n
20,Riverside,10
0,Alameda,9
23,San Bernardino,9
5,Contra Costa,8
25,San Francisco,7
11,Kern,6
15,Mendocino,5
33,Solano,5
21,Sacramento,4
24,San Diego,4


In [20]:
bar_chart(
    df=(single_county >> count(_.county)),
    x_col="county",
    y_col="n",
    color_col="county",
    chart_title="Counties with 5310 Organizations (Single County)",
)

In [21]:
# Trimmed copy holding only the identifier, type, location and match-status columns.
df1 = df >> select(
    _.organization_name,
    _.ntd_id,
    _.itp_id,
    _.organization_type,
    _.city,
    _.county,
    _.merge_status,
)

In [22]:
# Column dtypes and non-null counts for the trimmed table.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   organization_name  147 non-null    object 
 1   ntd_id             37 non-null     object 
 2   itp_id             78 non-null     float64
 3   organization_type  147 non-null    object 
 4   city               146 non-null    object 
 5   county             146 non-null    object 
 6   merge_status       147 non-null    object 
dtypes: float64(1), object(6)
memory usage: 8.2+ KB


In [23]:
# NOTE: `county` still holds plain comma-separated strings at this point, so
# explode() leaves multi-county rows (e.g. "Humboldt, De Norte") unsplit.
# The strings are split into lists in the next cell before exploding again.
df1.explode('county')

Unnamed: 0,organization_name,ntd_id,itp_id,organization_type,city,county,merge_status
0,Angel View Inc,,,Non-Profit Organization,Cathedral City,Riverside,Name Match; Airtable
1,ARC Imperial Valley,,398.0,Non-Profit Organization,El Centro,Imperial,Name Match; Airtable
2,Area 1 Agency on Aging,,399.0,Non-Profit Organization,Eureka,"Humboldt, De Norte",Name Match; Airtable
3,Big Valley 50 Plus,,404.0,Non-Profit Organization,Beiber,"Lassen, Modoc",Name Match; Airtable
4,Care-A-Van Transit Systems Inc,,407.0,Company,Hemet,Riverside,Name Match; Airtable
...,...,...,...,...,...,...,...
142,Loma Linda University Health,,,Other,Loma Linda,San Bernardino,No Match
143,New Advances for People with Disabilities,,,Non-Profit Organization,Bakersfield,Kern,No Match
144,Pomeroy Recreation & Rehabilitation Center,,,Non-Profit Organization,San Francisco,San Francisco,No Match
145,Regents of the University of California Santa ...,,,University,Santa Cruz,Santa Cruz,No Match


In [24]:
## from https://medium.com/swlh/three-routes-convert-comma-separated-column-to-rows-c17c85079ecf

# Turn each comma-separated county string into a list of county names.
# The multi-character pattern is treated as a regular expression by
# Series.str.split, so a comma with any surrounding whitespace is one separator.
# That keeps stray spaces out of the names and agrees with the bare "," test
# used to build `single_county`.
county_lists = df1["county"].str.split(r"\s*,\s*")

# Drop-then-assign keeps `county` as the last column, as before.
df2 = df1.drop(columns="county").assign(county=county_lists)


In [25]:
# Preview: `county` now holds a list of county names for each organization.
df2

Unnamed: 0,organization_name,ntd_id,itp_id,organization_type,city,merge_status,county
0,Angel View Inc,,,Non-Profit Organization,Cathedral City,Name Match; Airtable,[Riverside]
1,ARC Imperial Valley,,398.0,Non-Profit Organization,El Centro,Name Match; Airtable,[Imperial]
2,Area 1 Agency on Aging,,399.0,Non-Profit Organization,Eureka,Name Match; Airtable,"[Humboldt, De Norte]"
3,Big Valley 50 Plus,,404.0,Non-Profit Organization,Beiber,Name Match; Airtable,"[Lassen, Modoc]"
4,Care-A-Van Transit Systems Inc,,407.0,Company,Hemet,Name Match; Airtable,[Riverside]
...,...,...,...,...,...,...,...
142,Loma Linda University Health,,,Other,Loma Linda,No Match,[San Bernardino]
143,New Advances for People with Disabilities,,,Non-Profit Organization,Bakersfield,No Match,[Kern]
144,Pomeroy Recreation & Rehabilitation Center,,,Non-Profit Organization,San Francisco,No Match,[San Francisco]
145,Regents of the University of California Santa ...,,,University,Santa Cruz,No Match,[Santa Cruz]


In [26]:
# Expand each county list into one row per (organization, county) pair.
# explode() repeats the original index label on every row it produces.
df2 = df2.explode('county')

In [27]:
# Organizations per county after splitting multi-county rows, largest first.
(
    df2
    >> count(_.county)
    >> arrange(-_.n)
)

Unnamed: 0,county,n
0,Alameda,12
29,Riverside,12
32,San Bernardino,11
6,Contra Costa,10
34,San Francisco,10
39,Santa Clara,9
15,Kern,6
37,San Mateo,6
43,Solano,6
21,Mendocino,5


In [28]:
bar_chart(
    df=(df2 >> count(_.county)),
    x_col="county",
    y_col="n",
    color_col="county",
    chart_title="Counties with 5310 Organizations Operating",
)

In [29]:
# Average number of organizations per county.
# The counted frame also carries the string `county` column; numeric_only=True
# restricts the mean to `n` explicitly instead of relying on pandas silently
# dropping non-numeric columns (deprecated in 1.x, a TypeError in 2.x).
(df2 >> count(_.county)).mean(numeric_only=True)



n    3.5
dtype: float64

In [30]:
# Organization counts per (organization type, county), largest first.
(
    df2
    >> group_by(_.organization_type)
    >> count(_.county)
    >> arrange(-_.n)
).head(20)

Unnamed: 0,organization_type,county,n
51,Non-Profit Organization,Alameda,11
55,Non-Profit Organization,Contra Costa,8
79,Non-Profit Organization,San Francisco,8
84,Non-Profit Organization,Santa Clara,8
74,Non-Profit Organization,Riverside,7
77,Non-Profit Organization,San Bernardino,6
61,Non-Profit Organization,Kern,5
82,Non-Profit Organization,San Mateo,5
67,Non-Profit Organization,Mendocino,4
75,Non-Profit Organization,Sacramento,4


### Counts

In [31]:
# How many organizations have an NTD ID.
with_ntd_id = df1 >> filter(_.ntd_id.notnull())
len(with_ntd_id)

37

In [32]:
# Organization types among the rows that have an NTD ID.
(
    df1
    >> filter(_.ntd_id.notnull())
).organization_type.value_counts()

Independent Public Agency or Authority of Transit Service                25
City, County or Local Government Unit or Department of Transportation     8
MPO/RTPA                                                                  3
Private-Non-Profit Corporation                                            1
Name: organization_type, dtype: int64

In [33]:
# The NTD-reporting organization typed as a private non-profit corporation.
(
    df1
    >> filter(_.ntd_id.notnull())
    >> filter(_.organization_type == "Private-Non-Profit Corporation")
)

Unnamed: 0,organization_name,ntd_id,itp_id,organization_type,city,county,merge_status
66,Easy Lift Transportation Inc,90243,100.0,Private-Non-Profit Corporation,Goleta,Santa Barbara,"Manual Match, NTD"


In [34]:
# Organization type breakdown across all rows.
df1["organization_type"].value_counts()

Non-Profit Organization                                                  90
Independent Public Agency or Authority of Transit Service                28
City, County or Local Government Unit or Department of Transportation    12
Independent Agency                                                        5
Company                                                                   4
MPO/RTPA                                                                  3
Other                                                                     3
Private-Non-Profit Corporation                                            1
University                                                                1
Name: organization_type, dtype: int64

In [35]:
# Rows typed as either an independent transit agency or a local government unit.
# The query is written once and reused for both the count and the table.
public_agency_query = (
    "organization_type==('Independent Public Agency or Authority of Transit Service')"
    " or organization_type==('City, County or Local Government Unit or Department of Transportation')"
)
public_agencies = df1.query(public_agency_query)

print(len(public_agencies))


public_agencies


40


Unnamed: 0,organization_name,ntd_id,itp_id,organization_type,city,county,merge_status
7,City of Fontana,,414.0,"City, County or Local Government Unit or Depar...",Fontana,San Bernardino,Name Match; Airtable
8,City of Rio Vista,9R02-91014,264.0,"City, County or Local Government Unit or Depar...",Rio Vista,Solano,Name Match; NTD
9,City of Roseville,90168,271.0,"City, County or Local Government Unit or Depar...",Roseville,Placer,Name Match; NTD
10,City of Santa Rosa,90017,301.0,"City, County or Local Government Unit or Depar...",Santa Rosa,Sonoma,Name Match; NTD
11,City of Visalia,90091,361.0,"City, County or Local Government Unit or Depar...",Visalia,Tulare,Name Match; NTD
12,Colusa County Transit Agency,9R02-91112,74.0,"City, County or Local Government Unit or Depar...",Colusa,Colusa,Name Match; NTD
16,Eastern Sierra Transit Authority,9R02-91062,99.0,Independent Public Agency or Authority of Tran...,Bishop,Inyo,Name Match; NTD
17,El Dorado County Transit Authority,90229,101.0,Independent Public Agency or Authority of Tran...,Diamond Springs,El Dorado,Name Match; NTD
21,Humboldt Transit Authority,9R02-91036,135.0,Independent Public Agency or Authority of Tran...,Eureka,Humboldt,Name Match; NTD
22,Imperial County Transportation Commission,90226,138.0,Independent Public Agency or Authority of Tran...,El Centro,Imperial,Name Match; NTD


In [36]:
# Rows whose organization name contains any of these agency-style phrases.
agency_phrases = [
    "Transit Authority",
    "Transit Agency",
    "Transportation Authority",
    "Transportation Agency",
    "Transportation District",
    "Transit District",
]
# One `str.contains` clause per phrase, joined with `or`.
name_query = " or ".join(
    f"organization_name.str.contains('{phrase}')" for phrase in agency_phrases
)
agency_named = df1.query(name_query)

print(len(agency_named))

agency_named



25


Unnamed: 0,organization_name,ntd_id,itp_id,organization_type,city,county,merge_status
12,Colusa County Transit Agency,9R02-91112,74.0,"City, County or Local Government Unit or Depar...",Colusa,Colusa,Name Match; NTD
16,Eastern Sierra Transit Authority,9R02-91062,99.0,Independent Public Agency or Authority of Tran...,Bishop,Inyo,Name Match; NTD
17,El Dorado County Transit Authority,90229,101.0,Independent Public Agency or Authority of Tran...,Diamond Springs,El Dorado,Name Match; NTD
21,Humboldt Transit Authority,9R02-91036,135.0,Independent Public Agency or Authority of Tran...,Eureka,Humboldt,Name Match; NTD
25,Lake Transit Authority,9R02-91053,159.0,Independent Public Agency or Authority of Tran...,Lower Lake,Lake,Name Match; NTD
27,Marin County Transit District,90234,194.0,Independent Public Agency or Authority of Tran...,San Rafael,Marin,Name Match; NTD
28,Mendocino Transit Authority,9R02-91047,198.0,Independent Public Agency or Authority of Tran...,Ukiah,Mendocino,Name Match; NTD
32,Morongo Basin Transit Authority,9R02-91090,212.0,Independent Public Agency or Authority of Tran...,Joshua Tree,San Bernardino,Name Match; NTD
34,Napa Valley Transportation Authority,90088,218.0,Independent Public Agency or Authority of Tran...,Napa,Napa,Name Match; NTD
39,Palo Verde Valley Transit Agency,9R02-99454,238.0,Independent Public Agency or Authority of Tran...,Blythe,Riverside,Name Match; NTD
