# Analyzing Products, Components, and Contracts Data from Air Table
* [Data Source](https://airtable.com/appeVUdmRBi3K9hTS/tblLywLvMA2OTesQP/viwRRKOaZvvkSNfmU?blocks=hide)
* [Term Explanations](https://docs.calitp.org/data-infra/datasets_and_tables/transitdatabase.html)

In [1]:
import numpy as np
import pandas as pd

pd.options.display.max_columns = 50
pd.options.display.max_rows = 250
pd.set_option("display.max_colwidth", None)
pd.options.display.float_format = "{:.2f}".format

from itertools import chain
from os import path

import altair as alt
import utils
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from calitp import *
from PIL import Image
from shared_utils import altair_utils
from siuba import *
from collections import OrderedDict


GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/transit_stacks/"



## Products Data

In [2]:
# drop columns with tons of NAs
products = (
    to_snakecase(pd.read_csv(f"{GCS_FILE_PATH}products_May_3.csv"))
    .drop(
        columns=[
            "business_model_features",
            "attachments",
            "status",
            "certifications",
            "connectivity",
            "accepted_input_components",
            "output_components",
            "input",
            "output",
        ]
    )
    .rename(columns={"name": "product_name"})
)

In [3]:
products.columns

Index(['product_name', 'components', 'vendor', 'url', 'requirements',
       'product_features', 'notes', 'organization_stack_components'],
      dtype='object')

In [4]:
products.isna().sum()

product_name                       0
components                         9
vendor                            40
url                               70
requirements                     233
product_features                 180
notes                            173
organization_stack_components    103
dtype: int64

In [5]:
products.dtypes

product_name                     object
components                       object
vendor                           object
url                              object
requirements                     object
product_features                 object
notes                            object
organization_stack_components    object
dtype: object

In [6]:
#remove quotes and replace NAN values with N/A
products = products.replace('"', '', regex=True).fillna("N/A")

In [7]:
#products['TESTING_org_stacks'] =products['organization_stack_components'] 

#remove potential duplicates on organization stack components. 
#https://stackoverflow.com/questions/47316783/python-dataframe-remove-duplicate-words-in-the-same-cell-within-a-column-in-pyt
products['organization_stack_components'] = (products['organization_stack_components'].str.split()
                              .apply(lambda x: OrderedDict.fromkeys(x).keys())
                              .str.join(' '))


In [8]:
products['organization_stack_components'] = (
    products['organization_stack_components']
    .apply(lambda x: ", ".join(set([y.strip() for y in x.split(",")])))
    .str.strip()
    .replace({'"': ""}, regex=True)
)

In [9]:
### Count number of strings by column in organization_stack_components column to see how many orgs are using this vendor.
# https://stackoverflow.com/questions/51502263/pandas-dataframe-object-has-no-attribute-str
products["count_of_orgs_using_product"] = (
    products["organization_stack_components"]
    .str.split(",+")
    .str.len()
    .groupby(products.product_name)
    .transform("sum")
)

In [10]:
#For organization stack components with N/A, replace count_of_orgs_using_product with 0
products.loc[(products["organization_stack_components"] == "N/A"), "count_of_orgs_using_product"] = 0

In [11]:
unique_vendors = products.vendor.nunique()
f"{unique_vendors} unique vendors"

'84 unique vendors'

In [12]:
f"{products.product_name.nunique()} unique products"

'259 unique products'

In [13]:
# Strip extra quotes that appear around Security System, replace it with a neater value
products = products.replace('"', "", regex=True)

In [14]:
products["components"] = products["components"].replace(
    {
        "Security System\n": "Security System",
    }
)

In [15]:
# Separate out components.
products2 = products.copy()

In [16]:
components = products2["components"].str.split(",")
cols = products2.columns.difference(["components"])

products2 = products2.loc[products.index.repeat(components.str.len()), cols].assign(
    components=list(chain.from_iterable(components.tolist()))
)

In [17]:
products[(products.product_name.str.contains("Cubic NextBus Suite", case=False))]

Unnamed: 0,product_name,components,vendor,url,requirements,product_features,notes,organization_stack_components,count_of_orgs_using_product
215,Cubic NextBus Suite,Real-time info,Cubic,https://www.cubic.com/solutions/transportation/nextbus,,,Link now leads to Umo Mobility Platform.\n,"City Coach-Cubic NextBus Suite -APC Software, Market Street Historic Railway-Cubic NextBus Suite -Arrival predictions, UC Davis Health MedTrans-Cubic NextBus Suite -Offboard signage, NCTD Sprinter-Cubic NextBus Suite -Mobile data terminal software, Unitrans-Cubic NextBus Suite -Arrival predictions, Sonoma County Transit-Cubic NextBus Suite -Real-time info, The Loop-Cubic NextBus Suite -Real-time info, Placer County Transit-Cubic NextBus Suite -Mobile data terminal software, OmniTrans sbX-Daktronics (Unspecified Model), Unitrans-Cubic NextBus Suite -Mobile data terminal software, Placer County Transit-Cubic NextBus Suite -Real-time info, Sonoma County Transit-Cubic NextBus Suite -APC Software, UC Davis Health MedTrans-Cubic NextBus Suite -AVL Software, Foothill Transit-Cubic NextBus Suite -Arrival predictions, City Coach-Cubic NextBus Suite -AVL Software, Dumbarton Express-Cubic NextBus Suite -Real-time info, Gold Coast Transit-Cubic NextBus Suite -Real-time info, Cityline-Cubic NextBus Suite -Real-time info, Avail myStop-Real-time info, Glendale Beeline-Cubic NextBus Suite -Real-time info, Fairfield and Suisun Transit-Cubic NextBus Suite -Real-time info, OmniTrans, NCTD BREEZE, Market Street Historic Railway-Cubic NextBus Suite -Offboard signage, San Francisco Cable Cars, Glendale Beeline-Cubic NextBus Suite -AVL Software, Bear Transit-Cubic NextBus Suite -Real-time info, Bear Transit, Sonoma County Transit-Cubic NextBus Suite -Mobile data terminal software, Fairfield and Suisun Transit-Cubic NextBus Suite -Arrival predictions, Unitrans-Cubic NextBus Suite -Offboard signage, City Coach-Cubic NextBus Suite -Arrival predictions, UC Davis Health MedTrans-Cubic NextBus Suite -Alerts Content Management System, Unitrans, GTrans-Cubic NextBus Suite, Palos Verdes Peninsula Transit Authority-Cubic NextBus Suite -Real-time info, Sonoma County Transit-Cubic NextBus Suite -Interior signage, Fairfield and Suisun Transit-Cubic NextBus Suite -APC Software, Foothill Transit-Cubic NextBus Suite, Cubic NextBus Suite -Offboard signage, Sonoma County Transit-Cubic NextBus Suite -Arrival predictions, Unitrans-Cubic NextBus Suite -AVL Software, Connectpoint LCD Signage-Offboard signage, Unitrans-Cubic NextBus Suite -Alerts Content Management System, Sonoma County Transit-Cubic NextBus Suite -Annunciator, NCTD Coaster, UC Davis Health MedTrans-Cubic NextBus Suite -Arrival predictions, Placer County Transit-Cubic NextBus Suite -Headsigns, Placer County Transit-Cubic NextBus Suite -Interior signage, Placer County Transit-Cubic NextBus Suite -Arrival predictions, BruinBus-Cubic NextBus Suite -Real-time info, Muni Metro, Placer County Transit-Cubic NextBus Suite -Annunciator, Palos Verdes Peninsula Transit Authority-Cubic NextBus Suite -Arrival predictions, Manteca Transit-Cubic NextBus Suite -Real-time info, Placer County Transit-Cubic NextBus Suite -Offboard signage, Solano Express-Cubic NextBus Suite -Real-time info, Placer County Transit-Cubic NextBus Suite -AVL Software, Fairfield and Suisun Transit-Cubic NextBus Suite -Mobile data terminal software, UC Davis Health MedTrans-Cubic NextBus Suite -Mobile data terminal software, OmniTrans sbX-Cubic NextBus Suite -Arrival predictions, Clever Devices CleverCAD-Mobile data terminal software, Market Street Historic Railway-Cubic NextBus Suite -Real-time info, Fairfield and Suisun Transit-Cubic NextBus Suite -AVL Software, UC Davis Health MedTrans-Cubic NextBus Suite -APC Software, City Coach-Cubic NextBus Suite -Real-time info, Tahoe Truckee Area Regional Transportation-Cubic NextBus Suite -Real-time info, Muni Bus, Sonoma County Transit-Cubic NextBus Suite -AVL Software, OmniTrans sbX-Cubic NextBus Suite -Alerts Content Management System, Sonoma County Transit-Cubic NextBus Suite, City Coach-Cubic NextBus Suite -Mobile data terminal software, Unitrans-Cubic NextBus Suite -APC Software, OmniTrans sbX-Cubic NextBus Suite -Real-time info, Palos Verdes Peninsula Transit Authority-Cubic NextBus Suite -AVL Software, UC Davis Health MedTrans-Cubic NextBus Suite -Real-time info",76


### What % of vendors with scheduling software also provide GTFS data out of the box? 

In [18]:
# https://stackoverflow.com/questions/47125076/how-to-filter-rows-containing-specific-string-values-with-an-and-operator
gtfs_scheduling = products[
    products["components"].str.contains("GTFS")
    & products["components"].str.contains("Scheduling")
]
gtfs_scheduling[
    ["product_name", "components", "vendor", "count_of_orgs_using_product", "notes"]
]

Unnamed: 0,product_name,components,vendor,count_of_orgs_using_product,notes
128,TripShot - Unspecified,"KPI Reporting,Scheduling (Fixed-route),Scheduling (Demand-Responsive),Mobile trip planning app,Real-time info,GTFS generation",TripShot Inc.,1,
234,Giro HASTUS,"Scheduling (Fixed-route),GTFS generation",Giro Inc.,28,
235,The Master Scheduler,"Scheduling (Fixed-route),GTFS generation",The Master Scheduler,5,http://themasterscheduler.com/whitepapers/TMS2GT.pdf
236,Trapeze - Unspecified,"Scheduling (Fixed-route),GTFS generation",Trapeze Group,22,
237,Optibus,"Scheduling (Fixed-route),GTFS Schedule Publishing,GTFS generation",Optibus,9,


In [19]:
vendors_gtfs_scheduling = gtfs_scheduling.vendor.nunique()
vendors_gtfs_scheduling

5

In [20]:
f"About {(vendors_gtfs_scheduling/unique_vendors)} of vendors offer both GTFS generation and scheduling software"

'About 0.05952380952380952 of vendors offer both GTFS generation and scheduling software'

### Vendors with the most customers

In [21]:
# filter out for any products w/o vendor info
no_vendor_nulls = products.loc[products["vendor"] != "N/A"]

In [22]:
vendors_with_most_customers = (
    no_vendor_nulls.groupby("vendor")
    .agg({"count_of_orgs_using_product": "sum"})
    .sort_values("count_of_orgs_using_product", ascending=False)
    .reset_index()
    .head(5)
    .rename(columns={"count_of_orgs_using_product": "total customers"})
)

utils.basic_bar_chart(
    vendors_with_most_customers,
    "total customers",
    "vendor",
    "vendor",
    "Vendors with the Most Customers",
)

### Vendor with the most products


In [23]:
vendors_with_most_products = (
    no_vendor_nulls.vendor.value_counts()
    .to_frame()
    .head(5)
    .reset_index()
    .rename(columns={"vendor": "total products", "index": "vendor"})
)

utils.basic_bar_chart(
    vendors_with_most_products,
    "vendor",
    "total products",
    "vendor",
    "Vendors with the most Products",
)

### Most popular products in general
* Metric: count_of_orgs_using_product column is the count of strings delinated by comma in the organization stacks component column for each product.
    * Assume that each value is a separate organization.
* Many organizations rely on "in house activity." 

In [24]:
most_popular = (
    products[
        ["product_name", "vendor", "components", "count_of_orgs_using_product", "notes"]
    ]
    .sort_values("count_of_orgs_using_product", ascending=False)
    .head(10)
    .rename(
        columns={
            "count_of_orgs_using_product": "number_of_organizations_using_the_product"
        }
    )
)

### Look at 5 categories more in depth.
* We chose scheduling, AVL, GTFS generation, mobile ticketing, and real time components. 

In [25]:
# List of components we are interested in.
subset_list = [
    "GTFS generation",
    "AVL On-board Computer",
    "AVL Software",
    "Mobile ticketing",
    "Real-time info",
    "Scheduling (Fixed-route)",
]

In [26]:
# new dataframe with only the components in our list
subset_df = products2[products2.components.isin(subset_list)]

In [27]:
# double check that it's correct.
subset_df.components.unique()

array(['Real-time info', 'AVL On-board Computer', 'AVL Software',
       'Mobile ticketing', 'GTFS generation', 'Scheduling (Fixed-route)'],
      dtype=object)

In [28]:
# keep only certain columns and re order them.
subset_df = subset_df[
    [
        "components",
        "vendor",
        "product_name",
        "organization_stack_components",
        "count_of_orgs_using_product",
    ]
]
len(subset_df)

97

In [29]:
subset_df = subset_df.rename(
    columns={
        "vendor": "Vendor",
        "count_of_orgs_using_product": "Total Organizations",
    }
)

In [30]:
#subset_df.loc[subset_df["components"] == "Real-time info"]

In [31]:

for i in subset_list:
    utils.fancy_bar_chart(
        (subset_df[(subset_df.components == i)])
        .sort_values("Total Organizations", ascending=False)
        .head(5),
        "Total Organizations",
        "Vendor",
        "Total Organizations",
        f"Top 5 Vendors in {i}",
    )

## Components Data

In [32]:
components = to_snakecase(pd.read_csv(f"{GCS_FILE_PATH}components_May_3.csv"))

In [33]:
components.shape

(107, 11)

In [34]:
components.isna().sum()

name                               0
aliases                           95
system                            73
location                           1
function_group                     3
description                       88
products                          36
organization_stack_components     82
example_stacks                    94
example_stacks_copy              107
properties_+_features             96
dtype: int64

In [35]:
# Count number of products within each component category.
# https://stackoverflow.com/questions/51502263/pandas-dataframe-object-has-no-attribute-str
components["count_of_products_in_categories"] = (
    components["products"]
    .str.split(",+")
    .str.len()
    .groupby(components.name)
    .transform("sum")
)

In [36]:
# Double Checking
components.loc[components["name"] == "APC Sensors"]

Unnamed: 0,name,aliases,system,location,function_group,description,products,organization_stack_components,example_stacks,example_stacks_copy,properties_+_features,count_of_products_in_categories
5,APC Sensors,Automatic Passenger Counters,APC,Vehicle,Operations,,"RouteMatch - APC,Bishop Peak BPT Ecosystem,Centum Adetel APC ,UTA APC Sensors (Unspecified),UTA Horizontal APC Sensors,UTA Overhead APC Sensors,Connexionz APC,Iris Irma Matrix,UTA Model 30 CPU,Hella APS-B,Hella APS-R-Poe,Dilax PRT-400,Dilax IRS-320R,Dilax (Unspecified Model)","""SacRT Light Rail, e-van, SacRT GO, Elk Grove Transit Services, Folsom Stage Line, Causeway Connection, Sacramento Airport Express, SacRT Bus, SacRT Neighborhood Ride-UTA APC Sensors (Unspecified)-APC Sensors""",,,,14.0


### Top 10 "crowded" product categories
* Count number of strings in the "products" column and group by "name" column to get total number of products in each sector.
* Real-time info is the most "crowded" category with 32 different products.
* Most categories only have one product.
* About 4 unique products in each category, when filtering out any categories with 0 products recorded.

In [37]:
f"{components.name.nunique()} unique categories"

'107 unique categories'

In [38]:
product_categories = (
    components[["name", "count_of_products_in_categories"]]
    .sort_values("count_of_products_in_categories", ascending=False)
    .rename(columns={"name": "category"})
)

In [39]:
product_categories.count_of_products_in_categories.value_counts()

0.00     36
1.00     19
2.00      8
7.00      6
4.00      6
6.00      6
3.00      4
5.00      4
12.00     3
10.00     3
9.00      2
11.00     2
13.00     2
14.00     2
20.00     1
16.00     1
19.00     1
32.00     1
Name: count_of_products_in_categories, dtype: int64

In [40]:
# filter out any categories with 0 products - 36 values are empty
product_categories = product_categories[
    product_categories["count_of_products_in_categories"] > 0
]

In [41]:
f"{product_categories.count_of_products_in_categories.median()} average products per category"

'4.0 average products per category'

In [42]:
most_saturated_category = product_categories.head(10).rename(
    columns={"count_of_products_in_categories": "number of unique products"}
)

In [43]:
utils.basic_bar_chart(
    most_saturated_category,
    "number of unique products",
    "category",
    "category",
    "Most Saturated Categories",
)

### Least saturated category with "the most" customers.
* How many organizations rely on that component?
* Then see how many products fall under that category that customers can choose from.

In [44]:
components.tail(1)

Unnamed: 0,name,aliases,system,location,function_group,description,products,organization_stack_components,example_stacks,example_stacks_copy,properties_+_features,count_of_products_in_categories
106,Yard Management,,,Yard,Maintenance,,,,,,,0.0


In [45]:
# see how many orgs use a product with that particular component
products_usage = (
    products2.groupby("components")
    .agg({"count_of_orgs_using_product": "sum"})
    .sort_values("count_of_orgs_using_product", ascending=False)
    .reset_index()
)

In [46]:
len(products_usage)

72

In [47]:
# I made a dataframe that takes the product with the most customers out of each component category.
# Join the aforementioned dataframe with components.
merge1 = pd.merge(
    components,
    products_usage,
    how="outer",
    left_on=["name"],
    right_on=["components"],
    indicator=True,
)

In [48]:
merge1._merge.value_counts()

both          71
left_only     36
right_only     1
Name: _merge, dtype: int64

In [49]:
# keep relevant cols
merge1 = merge1[
    [
        "name",
        "components",
        "function_group",
        "count_of_orgs_using_product",
        "count_of_products_in_categories",
        "_merge",
    ]
]

In [50]:
len(merge1)

108

In [51]:
# filter out any products with 0 users
merge1 = merge1[merge1["count_of_orgs_using_product"] > 0]

In [52]:
merge1.count_of_products_in_categories.describe()

count   59.00
mean     6.61
std      5.91
min      1.00
25%      2.00
50%      5.00
75%     10.00
max     32.00
Name: count_of_products_in_categories, dtype: float64

In [53]:
# filter out any values in "count_of_products_in_categories" col below 50% percentile
merge1[merge1["count_of_products_in_categories"] < 5.5].sort_values(
    ["count_of_orgs_using_product", "count_of_products_in_categories"], ascending=False
).head()

Unnamed: 0,name,components,function_group,count_of_orgs_using_product,count_of_products_in_categories,_merge
21,Driver Sign-up,Driver Sign-up,Scheduling,72.0,2.0,both
8,Arrival predictions,Arrival predictions,Operations,54.0,4.0,both
40,Headsigns,Headsigns,Rider info,54.0,4.0,both
47,Location Sensors,Location Sensors,Operations,52.0,3.0,both
72,Run cutting,Run cutting,Scheduling,48.0,1.0,both


### Function Groups
* Most of the products are under the "operations" group.

In [54]:
components.function_group.value_counts()

Operations         46
Rider info         20
Fare collection    10
Scheduling          7
Backoffice          6
Maintenance         6
IT                  4
Traffic             3
Reporting           1
Rider Info          1
Name: function_group, dtype: int64

## Contracts Data

In [55]:
contracts = (
    to_snakecase(pd.read_csv(f"{GCS_FILE_PATH}Contracts_May_11.csv"))
    .drop(columns=["attachments", "organization_stack_components", "name"])
    .rename(
        columns={
            "type_of_contract:_functional_category": "functional_category",
            "type_of_contract:_functions": "contract_type",
        }
    )
)

In [56]:
contracts.isna().sum()

contract_holder          1
contract_vendor          1
contract_name           48
functional_category     16
contract_type           14
start_date              36
end_date               119
renewal_option          30
value                  150
notes                  111
dtype: int64

In [57]:
# First row of data is just N/A, so dropping it here.
contracts = contracts.dropna(subset=["contract_holder"])

In [58]:
len(contracts)

158

In [59]:
f"{ contracts.contract_holder.nunique()} organizations in contracts data set"

'56 organizations in contracts data set'

In [60]:
f"{ contracts.contract_vendor.nunique()} vendors in contracts data set"

'44 vendors in contracts data set'

### 125 contracts have none/no record for renewal options, 4 auto-renews

In [61]:
contracts.renewal_option.value_counts()

None           125
Auto-renews      4
Name: renewal_option, dtype: int64

### Looking at contract duration:
* Contracts with an end date value populated, average duration is 3 years.

In [62]:
# Editing date time cols to the right data type
contracts = contracts.assign(
    start_date=pd.to_datetime(contracts.start_date, errors="coerce"),
    end_date=pd.to_datetime(contracts.end_date, errors="coerce"),
)

In [63]:
# new column for duration of contract year.
contracts["duration_of_contract_year"] = (
    (contracts["end_date"] - contracts["start_date"]).dt.days
) / 365

In [64]:
# fill in NAs with 0 and round to 0
contracts["duration_of_contract_year"] = (
    contracts["duration_of_contract_year"].fillna(0).round(0)
)

In [65]:
# new df for rows with end dates
filtered_for_end_date = contracts[contracts["end_date"].notnull()]

In [66]:
duration = (
    filtered_for_end_date.groupby("duration_of_contract_year")
    .agg({"contract_holder": "count"})
    .reset_index()
    .rename(
        columns={
            "duration_of_contract_year": "contract length",
            "contract_holder": "number_of_contracts",
        }
    )
)

In [67]:
filtered_for_end_date["duration_of_contract_year"].median()

3.0

In [68]:
len(filtered_for_end_date)

39

In [69]:
end_dates_by_quarters = (
    filtered_for_end_date.groupby(
        pd.to_datetime(filtered_for_end_date["end_date"], format="%b-%Y").dt.to_period(
            "Q"
        )
    )["contract_holder"]
    .count()
    .to_frame()
    .reset_index()
    .rename(
        columns={
            "contract_holder": "number_of_contracts",
            "end_date": "quarter of expiration",
        }
    )
)

In [70]:
end_dates_by_quarters["quarter of expiration"] = end_dates_by_quarters[
    "quarter of expiration"
].astype("str")

utils.basic_bar_chart(
    end_dates_by_quarters,
    "quarter of expiration",
    "number_of_contracts",
    "number_of_contracts",
    "Contract Expiration Dates",
)

### Separate contract type to look at elements within each contract.
* Functional category is less descriptive, looking at contract type instead.
* There are 71 different types because of all various combinations of GTFS, mobile ticketing, etc elements a contract can have. 
* Separating them out by commas might make it easier to to analyze.
* After splitting the combinations, only 32 cateogries.

In [71]:
f"{ contracts.contract_type.nunique()} unique contract types"

'71 unique contract types'

In [72]:
contracts = contracts.fillna("N/A")

In [73]:
contracts["contract_type"] = contracts["contract_type"].replace(
    {
        "Payment processor/Merchant services,Mobile ticketing": "Payment Processor or Merchant Services or Mobile Ticketing",
        "Payment processor/Merchant services": "Payment Processor or Merchant Services",
    }
)

In [74]:
# https://stackoverflow.com/questions/52575290/how-to-separate-string-into-multiple-rows-in-pandas
contract_type = contracts["contract_type"].str.split(",")
cols = contracts.columns.difference(["contract_type"])

In [75]:
contracts2 = contracts.loc[
    contracts.index.repeat(contract_type.str.len()), cols
].assign(contract_type_use=list(chain.from_iterable(contract_type.tolist())))

In [76]:
contracts2.contract_type_use.nunique()

32

In [77]:
contracts2.shape

(341, 11)

In [78]:
# just checking that everything is correct
contracts2.loc[
    contracts2["contract_type_use"] == "Payment Processor or Merchant Services"
]

Unnamed: 0,contract_holder,contract_name,contract_vendor,duration_of_contract_year,end_date,functional_category,notes,renewal_option,start_date,value,contract_type_use
19,City of Roseville,,Elavon,0.0,,,"Aware of vendors, but do not have a record of the actual contract\n",,,,Payment Processor or Merchant Services
38,Eastern Sierra Transit Authority,,Elavon,0.0,,Fare Payments,,,2018-02-01 00:00:00,,Payment Processor or Merchant Services
79,San Joaquin Regional Transit District,,FIS,0.0,,Fare Payments,,,2015-07-01 00:00:00,,Payment Processor or Merchant Services
95,Santa Cruz Metropolitan Transit District,,FIS,0.0,,Fare Payments,,,2017-10-13 00:00:00,,Payment Processor or Merchant Services
107,Tahoe Transportation District,,Square Inc.,0.0,,Offboard fares,"Aware of vendors, but do not have a record of the actual contract\n",,,,Payment Processor or Merchant Services
114,"""University of California, Davis""",,TouchNet,0.0,,,"Aware of vendors, but do not have a record of the actual contract\n",,,,Payment Processor or Merchant Services
117,Yuba-Sutter Transit Authority,,Elavon,0.0,,Fare Payments,"Aware of vendors, but do not have a record of the actual contract\n",,,,Payment Processor or Merchant Services



### Most common contract element
* Most contracts have an element of GTFS Generation, followed by Vehicle Locations, and Arrival Predictions.

In [79]:
most_common_contract_product = (
    contracts2.contract_type_use.value_counts()
    .to_frame()
    .reset_index()
    .rename(
        columns={
            "index": "element",
            "contract_type_use": "number_of_contracts with this element",
        }
    )
    .head(10)
)

In [80]:
most_common_contract_product

Unnamed: 0,element,number_of_contracts with this element
0,GTFS Generation,35
1,Vehicle Locations,34
2,Arrival predictions,33
3,MDT,23
4,Real-time service alerts,19
5,Annunciator,18
6,Interior signage,18
7,Headsigns,16
8,Mobile ticketing,15
9,Realtime info,15


utils.basic_bar_chart(
    most_common_contract_product,
    "number_of_contracts with this element",
    "element",
    "element",
    "Most Common Element in Contracts",
)

### Most popular vendors by contract awarded

In [81]:
vendors2 = (
    contracts.contract_vendor.value_counts()
    .to_frame()
    .reset_index()
    .head(5)
    .rename(columns={"index": "vendor", "contract_vendor": "number_of_contracts"})
)

utils.basic_bar_chart(
    vendors2,
    "number_of_contracts",
    "vendor",
    "vendor",
    "Vendors Awarded The Most Contracts",
)

### Organizations that hold the most contracts

In [82]:
contract_holders = (
    contracts.contract_holder.value_counts()
    .to_frame()
    .reset_index()
    .rename(columns={"index": "holders", "contract_holder": "# contracts"})
    .head(10)
)

utils.basic_bar_chart(
    contract_holders,
    "# contracts",
    "holders",
    "holders",
    "Organizations with the Most Contracts",
)

### Looking at contracts and their elements with end dates from May 2022 to 2024.
* Of the contracts ending, most have an element of vehicle locations and arrival predictions.

In [83]:
filtered_for_end_date.end_date.describe()



count                      39
unique                     33
top       2020-06-30 00:00:00
freq                        4
first     2010-09-21 00:00:00
last      2024-06-30 00:00:00
Name: end_date, dtype: object

In [84]:
# Keep only end dates from now until 2024.
end_dates = filtered_for_end_date.loc[
    (filtered_for_end_date["end_date"] >= "2022-05-01")]

In [85]:
end_dates[
    [
        "contract_holder",
        "functional_category",
        "start_date",
        "end_date",
        "duration_of_contract_year",
    ]
].sort_values("end_date")

Unnamed: 0,contract_holder,functional_category,start_date,end_date,duration_of_contract_year
148,Monterey-Salinas Transit,Passenger counting,2019-04-09,2022-05-01,3.0
149,Redwood Coast Transit Authority,CAD/AVL,2019-05-03,2022-05-03,3.0
151,Anaheim Transportation Network,Offboard rider information,2017-05-17,2022-05-17,5.0
152,Trinity County,"Offboard rider information,Scheduling",2019-08-01,2022-07-31,3.0
153,Tuolumne County Transit Agency,"CAD/AVL,Offboard rider information",2019-05-08,2023-05-07,4.0
154,Ventura County Transportation Commission,"Offboard rider information,CAD/AVL,Onboard rider information",2017-11-03,2023-11-02,6.0
155,Yuba-Sutter Transit Authority,"CAD/AVL,Onboard rider information",2018-11-07,2023-11-06,5.0
156,Santa Cruz Metropolitan Transit District,"Onboard rider information,Offboard rider information,CAD/AVL",2018-11-29,2023-11-28,5.0
157,Marin County Transit District,Scheduling,2019-07-01,2024-06-30,5.0
158,Monterey-Salinas Transit,"CAD/AVL,Onboard rider information,Offboard rider information",2019-05-01,2024-06-30,5.0


## Export

In [86]:
with pd.ExcelWriter(f"{GCS_FILE_PATH}transit_stacks.xlsx") as writer:
    products.to_excel(writer, sheet_name="products_clean", index=False)
    components.to_excel(writer, sheet_name="components_clean", index=False)
    contracts.to_excel(writer, sheet_name="contracts_clean", index=False)
    contracts2.to_excel(writer, sheet_name="contracts_delineated", index=False)