# Taking a first look and cleaning the data.
* [Data Source](https://airtable.com/appeVUdmRBi3K9hTS/tblLywLvMA2OTesQP/viwRRKOaZvvkSNfmU?blocks=hide)
* [Term Explanations](https://docs.calitp.org/data-infra/datasets_and_tables/transitdatabase.html)

In [1]:
import numpy as np
import pandas as pd

pd.options.display.max_columns = 50
pd.options.display.max_rows = 250
pd.set_option("display.max_colwidth", None)
pd.options.display.float_format = "{:.2f}".format

from itertools import chain
from os import path

import altair as alt
import charts
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from calitp import *
from PIL import Image
from shared_utils import altair_utils
from siuba import *
from collections import OrderedDict

# from wordcloud import STOPWORDS, ImageColorGenerator, WordCloud

GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/transit_stacks/"



## Products Data

In [2]:
# drop columns with tons of NAs
products = (
    to_snakecase(pd.read_csv(f"{GCS_FILE_PATH}products_May_3.csv"))
    .drop(
        columns=[
            "business_model_features",
            "attachments",
            "status",
            "certifications",
            "connectivity",
            "accepted_input_components",
            "output_components",
            "input",
            "output",
        ]
    )
    .rename(columns={"name": "product_name"})
)

In [3]:
products.columns

Index(['product_name', 'components', 'vendor', 'url', 'requirements',
       'product_features', 'notes', 'organization_stack_components'],
      dtype='object')

In [4]:
products.isna().sum()

product_name                       0
components                         9
vendor                            40
url                               70
requirements                     233
product_features                 180
notes                            173
organization_stack_components    103
dtype: int64

In [5]:
products.dtypes

product_name                     object
components                       object
vendor                           object
url                              object
requirements                     object
product_features                 object
notes                            object
organization_stack_components    object
dtype: object

In [6]:
products.head(1)

Unnamed: 0,product_name,components,vendor,url,requirements,product_features,notes,organization_stack_components
0,TO CONFIRM,,,,,,,


In [7]:
products['organization_stack_components'] =products['organization_stack_components'].str.replace(r'\b(\w+)(\s+\1)+\b', r'\1')



In [8]:
### Count number of strings by column in organization_stack_components column to see how many orgs are using this vendor.
# https://stackoverflow.com/questions/51502263/pandas-dataframe-object-has-no-attribute-str
products["count_of_orgs_using_product"] = (
    products["organization_stack_components"]
    .str.split(",+")
    .str.len()
    .groupby(products.product_name)
    .transform("sum")
)

In [9]:
unique_vendors = products.vendor.nunique()
f"{unique_vendors} unique vendors"

'83 unique vendors'

In [10]:
f"{products.product_name.nunique()} unique products"

'259 unique products'

In [11]:
products = products.fillna("N/A")

In [12]:
products.sample(1)

Unnamed: 0,product_name,components,vendor,url,requirements,product_features,notes,organization_stack_components,count_of_orgs_using_product
38,Ridecheck Plus,APC Software,,,,,,Big Blue Bus-Ridecheck Plus-APC Software,1.0


### What % of vendors with scheduling software also provide GTFS data out of the box? 

In [13]:
# https://stackoverflow.com/questions/47125076/how-to-filter-rows-containing-specific-string-values-with-an-and-operator
gtfs_scheduling = products[
    products["components"].str.contains("GTFS")
    & products["components"].str.contains("Scheduling")
]
gtfs_scheduling[
    ["product_name", "components", "vendor", "count_of_orgs_using_product", "notes"]
]

Unnamed: 0,product_name,components,vendor,count_of_orgs_using_product,notes
128,TripShot - Unspecified,"KPI Reporting,Scheduling (Fixed-route),Scheduling (Demand-Responsive),Mobile trip planning app,Real-time info,GTFS generation",TripShot Inc.,1.0,
234,Giro HASTUS,"Scheduling (Fixed-route),GTFS generation",Giro Inc.,35.0,
235,The Master Scheduler,"Scheduling (Fixed-route),GTFS generation",The Master Scheduler,5.0,http://themasterscheduler.com/whitepapers/TMS2GT.pdf
236,Trapeze - Unspecified,"Scheduling (Fixed-route),GTFS generation",Trapeze Group,23.0,
237,Optibus,"Scheduling (Fixed-route),GTFS Schedule Publishing,GTFS generation",Optibus,10.0,


In [14]:
vendors_gtfs_scheduling = gtfs_scheduling.vendor.nunique()
vendors_gtfs_scheduling

5

In [15]:
f"About {(vendors_gtfs_scheduling/unique_vendors)} of vendors offer both GTFS generation and scheduling software"

'About 0.060240963855421686 of vendors offer both GTFS generation and scheduling software'

### Vendors with the most customers

In [16]:
# filter out for any products w/o vendor info
no_vendor_nulls = products.loc[products["vendor"] != "N/A"]

In [17]:
vendors_with_most_customers = (
    no_vendor_nulls.groupby("vendor")
    .agg({"count_of_orgs_using_product": "sum"})
    .sort_values("count_of_orgs_using_product", ascending=False)
    .reset_index()
    .head(5)
    .rename(columns={"count_of_orgs_using_product": "total customers"})
)

In [18]:
charts.basic_bar_chart(
    vendors_with_most_customers,
    "total customers",
    "vendor",
    "vendor",
    "Vendors with the Most Customers",
)

### Vendor with the most products


In [19]:
vendors_with_most_products = (
    no_vendor_nulls.vendor.value_counts()
    .to_frame()
    .head(5)
    .reset_index()
    .rename(columns={"vendor": "total products", "index": "vendor"})
)

In [20]:
charts.basic_bar_chart(
    vendors_with_most_products,
    "vendor",
    "total products",
    "vendor",
    "Vendors with the most Products",
)

### Most popular products in general
* Metric: count_of_orgs_using_product column is the count of strings delinated by comma in the organization stacks component column for each product.
    * Assume that each value is a separate organization.
* Genfare FareBox has 94 values, making it the most popular product
* Many organizations rely on "in house activity." 

In [21]:
most_popular = (
    products[
        ["product_name", "vendor", "components", "count_of_orgs_using_product", "notes"]
    ]
    .sort_values("count_of_orgs_using_product", ascending=False)
    .head(10)
    .rename(
        columns={
            "count_of_orgs_using_product": "number_of_organizations_using_the_product"
        }
    )
)

### Most popular products by component type and # of organizations 
* Break out components by comma.
* Most popular determined by count of organizations using a particularly product.
* Only keep that product.

In [22]:
products2 = products.copy()

In [23]:
# Separate out components.
components = products2["components"].str.split(",")
cols = products2.columns.difference(["components"])

products2 = products2.loc[products.index.repeat(components.str.len()), cols].assign(
    components=list(chain.from_iterable(components.tolist()))
)

In [24]:
# Strip extra quotes that appear around Security System, replace it with a neater value
products2 = products2.replace('"', "", regex=True)

In [25]:
products2["components"] = products2["components"].replace(
    {
        "Security System\n": "Security System",
    }
)

In [26]:
# filter out any products with 0 organizations in org stacks
popular_products = products2.loc[products2["count_of_orgs_using_product"] > 0]

In [27]:
# Only keep relevant columns, sort dataframe by A-Z using components
popular_products = products2[
    ["components", "vendor", "product_name", "count_of_orgs_using_product"]
].sort_values("components", ascending=True)

In [28]:
# Keep only the product within a component category with the highest # of organizations using this product.
# https://stackoverflow.com/questions/53842287/select-rows-with-highest-value-from-groupby
popular_products = (
    popular_products.drop_duplicates()
    .sort_values(["components"], ascending=False)
    .groupby(["components"])
    .first()
    .reset_index()
    .rename(columns={"components": "component", "product_name": "most_popular_product"})
)

In [29]:
# sorting vendors by how many times they appear having the most customers in a component category.
popular_products

Unnamed: 0,component,vendor,most_popular_product,count_of_orgs_using_product
0,APC On-Board Computer,GMV Syncromatics Inc,GMV/Syncromatics APC,0.0
1,APC Sensors,Dilax Inc.,Dilax PRT-400,0.0
2,APC Software,Uber Inc.,RouteMatch - APC,0.0
3,AVL On-board Computer,Luminator Technology Group,Luminator mSET - Operator Performance Module,0.0
4,AVL Software,Clever Devices Ltd.,Clever Devices - Unspecified,50.0
5,Advertising Agency,,Intersection,2.0
6,Alerts Content Management System,Uber Inc.,RouteShout,15.0
7,Alerts Subscription Service,IBI,IBI Alerts,2.0
8,Annunciator,Mackenzie Laboratories,Mackenzie Laboratories Annunciator (Unspecified Model),1.0
9,Archival Operations Data,Swiftly Inc.,Swiftly Inc.,0.0


### Look at 5 categories more in depth.
* We chose scheduling, AVL, GTFS generation, mobile ticketing, and real time components. 

In [30]:
# List of components we are interested in.
subset_list = [
    "GTFS generation",
    "AVL On-board Computer",
    "AVL Software",
    "Mobile ticketing",
    "Real-time info",
    "Scheduling (Fixed-route)",
]

In [31]:
# new dataframe with only the components in our list
subset_df = products2[products2.components.isin(subset_list)]

In [32]:
# double check that it's correct.
subset_df.components.unique()

array(['Real-time info', 'AVL On-board Computer', 'AVL Software',
       'Mobile ticketing', 'GTFS generation', 'Scheduling (Fixed-route)'],
      dtype=object)

In [33]:
# keep only certain columns and re order them.
subset_df = subset_df[
    [
        "components",
        "vendor",
        "product_name",
        "organization_stack_components",
        "count_of_orgs_using_product",
    ]
]
len(subset_df)

97

In [34]:
subset_df = subset_df.rename(
    columns={
        "vendor": "Vendor",
        "count_of_orgs_using_product": "Total Organizations",
    }
)

In [99]:
subset_df.loc[subset_df["components"] == "Real-time info"]

Unnamed: 0,components,Vendor,product_name,organization_stack_components,Total Organizations
11,Real-time info,Uber Inc.,RouteShout,"El Dorado Transit, Sacramento/South Lake Tahoe Connector Bus, El Dorado Dial-A-Ride, SAC-MED, El Dorado Paratransit-RouteShout-Real-time info,El Dorado Transit, Sacramento/South Lake Tahoe Connector Bus, El Dorado Dial-A-Ride, SAC-MED, El Dorado Paratransit-RouteShout-Alerts Content Management System,Porterville Transit-RouteShout, RouteMatch - In-Vehicle Technology-Real-time info,El Dorado Transit-RouteShout-Real-time info,Anaheim Resort Transportation-Token Transit Mobile Ticketing, RouteShout-Mobile ticketing",15.0
40,Real-time info,Swiftly Inc.,Swiftly Inc.,,0.0
50,Real-time info,,Cubic - Unspecified,"Golden Gate Ferry, Golden Gate Oracle Park Ferry , Golden Gate Transit-Cubic - Unspecified-Payment processor,Long Beach Transit-Cubic - Unspecified-Payment processor,Bay Area Rapid Transit-Cubic - Unspecified-Ticket Vending Machines,NCTD BREEZE, NCTD Sprinter, NCTD Coaster-Cubic - Unspecified-Ticket Vending Machines,Long Beach Transit-Cubic - Unspecified-Ticket Vending Machines,Muni Metro-Cubic - Unspecified-Ticket Vending Machines,Big Blue Bus-Cubic - Unspecified-Ticket Vending Machines,Caltrain-VenTek venSTATION, Cubic - Unspecified-Ticket Vending Machines",13.0
54,Real-time info,TransLoc,TransLOC (unspecified),"Triton Transit Shuttles-TransLOC (unspecified)-Real-time info,Anteater Express-TransLOC (unspecified)-Real-time info",2.0
68,Real-time info,GMV Syncromatics Inc,Syncromatics - Unspecified,Fresno County Rural Transit-Syncromatics - Unspecified-KPI Reporting,1.0
80,Real-time info,INIT SE,INIT (Unspecified Model),"Gold Coast Transit-INIT (Unspecified Model)-Scheduling (Fixed-route),Gold Coast Transit-INIT (Unspecified Model)-GTFS generation,Golden Gate Ferry, Golden Gate Oracle Park Ferry , Golden Gate Transit-INIT (Unspecified Model)-Arrival predictions,Golden Gate Ferry, Golden Gate Oracle Park Ferry , Golden Gate Transit-INIT (Unspecified Model)-AVL Software,Golden Gate Ferry, Golden Gate Oracle Park Ferry , Golden Gate Transit-INIT (Unspecified Model)-Mobile data terminal software,Golden Gate Ferry, Golden Gate Oracle Park Ferry , Golden Gate Transit-INIT (Unspecified Model)-Offboard signage,Golden Gate Ferry, Golden Gate Oracle Park Ferry , Golden Gate Transit-INIT (Unspecified Model)-Headsigns,Golden Gate Ferry, Golden Gate Oracle Park Ferry , Golden Gate Transit-INIT (Unspecified Model)-Annunciator,Golden Gate Ferry, Golden Gate Oracle Park Ferry , Golden Gate Transit-INIT (Unspecified Model)-Interior signage,Golden Gate Ferry, Golden Gate Oracle Park Ferry , Golden Gate Transit-INIT (Unspecified Model)-APC Software,NCTD BREEZE, NCTD Coaster, NCTD Sprinter-Dilax (Unspecified Model), INIT (Unspecified Model)-APC Software,VTA Bus, VTA ACE Shuttles, VTA Light Rail, VTA Rapid Bus, VTA Express Bus, VTA School Tripper-INIT (Unspecified Model)-APC Software,Metrolink-Conduent (Unspecified Model), INIT (Unspecified Model)-Ticket Vending Machines",38.0
128,Real-time info,TripShot Inc.,TripShot - Unspecified,BruinBus-TripShot - Unspecified-Real-time info,1.0
209,Real-time info,Trapeze Group,Trapeze Traveler Experience,"-Trapeze Traveler Experience-Arrival predictions,Long Beach Transit-Trapeze Traveler Experience-Arrival predictions,Monterey-Salinas Transit, MST Monterey Trolley-Trapeze Traveler Experience-Arrival predictions,B-Line-Trapeze Traveler Experience-Arrival predictions,Tri-Valley Wheels-Trapeze Traveler Experience-Arrival predictions,Long Beach Transit-Trapeze Traveler Experience-Alerts Content Management System,Monterey-Salinas Transit, MST Monterey Trolley-Trapeze Traveler Experience-Alerts Content Management System",9.0
210,Real-time info,Clever Devices Ltd.,Clever Devices RTPI,"SacRT Light Rail, e-van, SacRT GO, Elk Grove Transit Services, Folsom Stage Line, Causeway Connection, Sacramento Airport Express, SacRT Bus, SacRT Neighborhood Ride-Clever Devices RTPI-Real-time info,SacRT Light Rail, e-van, SacRT GO, Elk Grove Transit Services, Folsom Stage Line, Causeway Connection, Sacramento Airport Express, SacRT Bus, SacRT Neighborhood Ride-Clever Devices RTPI-Arrival predictions,Riverside Transit-Clever Devices RTPI-Arrival predictions,County Connection-Clever Devices RTPI-Arrival predictions,Santa Barbara Metropolitan Transit District-Clever Devices RTPI-Arrival predictions,SacRT Bus, SacRT Light Rail-Clever Devices RTPI-Arrival predictions,GTrans-Clever Devices RTPI-Arrival predictions,Riverside Transit-Clever Devices RTPI-Alerts Content Management System,County Connection-Clever Devices RTPI-Alerts Content Management System,Santa Barbara Metropolitan Transit District-Clever Devices RTPI-Alerts Content Management System,GTrans-Clever Devices RTPI-Alerts Content Management System,Riverside Transit-Clever Devices RTPI-Real-time info,VTA Bus, VTA ACE Shuttles, VTA Light Rail, VTA Rapid Bus, VTA Express Bus, VTA School Tripper-Clever Devices RTPI-Real-time info,County Connection-Clever Devices RTPI-Real-time info,SacRT Bus, SacRT Light Rail-Clever Devices RTPI-Real-time info,GTrans-Clever Devices RTPI-Real-time info",39.0
211,Real-time info,Open Transit Software Foundation,OneBusAway Real-Time Transit Info,"NCTD BREEZE, NCTD Coaster, NCTD Sprinter-OneBusAway Real-Time Transit Info-Arrival predictions,MTS Rapid, MTS Bus, San Diego Trolley-OneBusAway Real-Time Transit Info-Real-time info",6.0


In [36]:
# charts
for i in subset_list:
    charts.fancy_bar_chart(
        (subset_df[(subset_df.components == i)])
        .sort_values("Total Organizations", ascending=False)
        .head(5),
        "Total Organizations",
        "Vendor",
        "Total Organizations",
        f"Top 5 Vendors in {i}",
    )

In [37]:
# products2.components.unique()

In [38]:
# products2.groupby("components").agg({'product_name':'count'}).sort_values('product_name', ascending = False).head(5)

## Components Data

In [39]:
components = to_snakecase(pd.read_csv(f"{GCS_FILE_PATH}components_May_3.csv"))

In [40]:
components.shape

(107, 11)

In [41]:
components.isna().sum()

name                               0
aliases                           95
system                            73
location                           1
function_group                     3
description                       88
products                          36
organization_stack_components     82
example_stacks                    94
example_stacks_copy              107
properties_+_features             96
dtype: int64

In [42]:
# Count number of products within each component category.
# https://stackoverflow.com/questions/51502263/pandas-dataframe-object-has-no-attribute-str
components["count_of_products_in_categories"] = (
    components["products"]
    .str.split(",+")
    .str.len()
    .groupby(components.name)
    .transform("sum")
)

In [43]:
# Double Checking
components.loc[components["name"] == "APC Sensors"]

Unnamed: 0,name,aliases,system,location,function_group,description,products,organization_stack_components,example_stacks,example_stacks_copy,properties_+_features,count_of_products_in_categories
5,APC Sensors,Automatic Passenger Counters,APC,Vehicle,Operations,,"RouteMatch - APC,Bishop Peak BPT Ecosystem,Centum Adetel APC ,UTA APC Sensors (Unspecified),UTA Horizontal APC Sensors,UTA Overhead APC Sensors,Connexionz APC,Iris Irma Matrix,UTA Model 30 CPU,Hella APS-B,Hella APS-R-Poe,Dilax PRT-400,Dilax IRS-320R,Dilax (Unspecified Model)","""SacRT Light Rail, e-van, SacRT GO, Elk Grove Transit Services, Folsom Stage Line, Causeway Connection, Sacramento Airport Express, SacRT Bus, SacRT Neighborhood Ride-UTA APC Sensors (Unspecified)-APC Sensors""",,,,14.0


### Top 10 "crowded" product categories
* Count number of strings in the "products" column and group by "name" column to get total number of products in each sector.
* Real-time info is the most "crowded" category with 32 different products.
* Most categories only have one product.
* About 4 unique products in each category, when filtering out any categories with 0 products recorded.

In [44]:
f"{components.name.nunique()} unique categories"

'107 unique categories'

In [45]:
product_categories = (
    components[["name", "count_of_products_in_categories"]]
    .sort_values("count_of_products_in_categories", ascending=False)
    .rename(columns={"name": "category"})
)

In [46]:
product_categories.count_of_products_in_categories.value_counts()

0.00     36
1.00     19
2.00      8
7.00      6
4.00      6
6.00      6
3.00      4
5.00      4
12.00     3
10.00     3
9.00      2
11.00     2
13.00     2
14.00     2
20.00     1
16.00     1
19.00     1
32.00     1
Name: count_of_products_in_categories, dtype: int64

In [47]:
# filter out any categories with 0 products - 36 values are empty
product_categories = product_categories[
    product_categories["count_of_products_in_categories"] > 0
]

In [48]:
f"{product_categories.count_of_products_in_categories.median()} average products per category"

'4.0 average products per category'

In [49]:
most_saturated_category = product_categories.head(10).rename(
    columns={"count_of_products_in_categories": "number of unique products"}
)

In [50]:
charts.basic_bar_chart(
    most_saturated_category,
    "number of unique products",
    "category",
    "category",
    "Most Saturated Categories",
)

### Least saturated category with "the most" customers.
* How many organizations rely on that component?
* Then see how many products fall under that category that customers can choose from.

In [51]:
components.sample(1)

Unnamed: 0,name,aliases,system,location,function_group,description,products,organization_stack_components,example_stacks,example_stacks_copy,properties_+_features,count_of_products_in_categories
4,APC On-Board Computer,,APC,Vehicle,Operations,,"GMV/Syncromatics APC,RouteMatch - APC,UTA SmartSensor Interface,ETA SPOT ITS,Dilax PCU for APC,Dilax (Unspecified Model)",,,,,6.0


In [52]:
# see how many orgs use a product with that particular component
products_usage = (
    products2.groupby("components")
    .agg({"count_of_orgs_using_product": "sum"})
    .sort_values("count_of_orgs_using_product", ascending=False)
    .reset_index()
)

In [53]:
len(products_usage)

72

In [54]:
# I made a dataframe that takes the product with the most customers out of each component category.
# Join the aforementioned dataframe with components.
merge1 = pd.merge(
    components,
    products_usage,
    how="outer",
    left_on=["name"],
    right_on=["components"],
    indicator=True,
)

In [55]:
merge1._merge.value_counts()

both          70
left_only     37
right_only     2
Name: _merge, dtype: int64

In [56]:
# keep relevant cols
merge1 = merge1[
    [
        "name",
        "components",
        "function_group",
        "count_of_orgs_using_product",
        "count_of_products_in_categories",
        "_merge",
    ]
]

In [57]:
len(merge1)

109

In [58]:
# filter out any products with 0 users
merge1 = merge1[merge1["count_of_orgs_using_product"] > 0]

In [59]:
merge1.count_of_products_in_categories.describe()

count   58.00
mean     6.69
std      5.93
min      1.00
25%      2.00
50%      5.50
75%     10.00
max     32.00
Name: count_of_products_in_categories, dtype: float64

In [60]:
# filter out any values in "count_of_products_in_categories" col below 50% percentile
merge1[merge1["count_of_products_in_categories"] < 5.5].sort_values(
    ["count_of_orgs_using_product", "count_of_products_in_categories"], ascending=False
)

Unnamed: 0,name,components,function_group,count_of_orgs_using_product,count_of_products_in_categories,_merge
21,Driver Sign-up,Driver Sign-up,Scheduling,112.0,2.0,both
72,Run cutting,Run cutting,Scheduling,74.0,1.0,both
40,Headsigns,Headsigns,Rider info,70.0,4.0,both
8,Arrival predictions,Arrival predictions,Operations,60.0,4.0,both
47,Location Sensors,Location Sensors,Operations,54.0,3.0,both
16,Computer Automated Dispatch (Responsive),Computer Automated Dispatch (Responsive),Operations,36.0,5.0,both
98,Ticket Vending Machines,Ticket Vending Machines,Fare collection,30.0,4.0,both
42,Incident Management,Incident Management,Operations,19.0,1.0,both
77,Side signs,Side signs,Rider info,16.0,1.0,both
43,Infotainment,Infotainment,Rider Info,14.0,4.0,both


### Function Groups
* Most of the products are under the "operations" group.

In [61]:
components.function_group.value_counts()

Operations         46
Rider info         20
Fare collection    10
Scheduling          7
Backoffice          6
Maintenance         6
IT                  4
Traffic             3
Reporting           1
Rider Info          1
Name: function_group, dtype: int64

## Contracts Data

In [62]:
contracts = (
    to_snakecase(pd.read_csv(f"{GCS_FILE_PATH}Contracts_May_11.csv"))
    .drop(columns=["attachments", "organization_stack_components", "name"])
    .rename(
        columns={
            "type_of_contract:_functional_category": "functional_category",
            "type_of_contract:_functions": "contract_type",
        }
    )
)

In [63]:
contracts.shape

(159, 10)

In [64]:
contracts.isna().sum()

contract_holder          1
contract_vendor          1
contract_name           48
functional_category     16
contract_type           14
start_date              36
end_date               119
renewal_option          30
value                  150
notes                  111
dtype: int64

In [65]:
# First row of data is just N/A, so dropping it here.
contracts = contracts.dropna(subset=["contract_holder"])

In [66]:
len(contracts)

158

In [67]:
f"{ contracts.contract_holder.nunique()} organizations in contracts data set"

'56 organizations in contracts data set'

In [68]:
f"{ contracts.contract_vendor.nunique()} vendors in contracts data set"

'44 vendors in contracts data set'

### 125 contracts have none/no record for renewal options, 4 auto-renews

In [69]:
contracts.renewal_option.value_counts()

None           125
Auto-renews      4
Name: renewal_option, dtype: int64

### Looking at contract duration:
* Contracts with an end date value populated, average duration is 3 years.

In [70]:
# Editing date time cols to the right data type
contracts = contracts.assign(
    start_date=pd.to_datetime(contracts.start_date, errors="coerce"),
    end_date=pd.to_datetime(contracts.end_date, errors="coerce"),
)

In [71]:
# new column for duration of contract year.
contracts["duration_of_contract_year"] = (
    (contracts["end_date"] - contracts["start_date"]).dt.days
) / 365

In [72]:
# fill in NAs with 0 and round to 0
contracts["duration_of_contract_year"] = (
    contracts["duration_of_contract_year"].fillna(0).round(0)
)

In [73]:
# new df for rows with end dates
filtered_for_end_date = contracts[contracts["end_date"].notnull()]

In [74]:
duration = (
    filtered_for_end_date.groupby("duration_of_contract_year")
    .agg({"contract_holder": "count"})
    .reset_index()
    .rename(
        columns={
            "duration_of_contract_year": "contract length",
            "contract_holder": "number_of_contracts",
        }
    )
)

In [75]:
filtered_for_end_date["duration_of_contract_year"].median()

3.0

In [76]:
len(filtered_for_end_date)

39

In [77]:
end_dates_by_quarters = (
    filtered_for_end_date.groupby(
        pd.to_datetime(filtered_for_end_date["end_date"], format="%b-%Y").dt.to_period(
            "Q"
        )
    )["contract_holder"]
    .count()
    .to_frame()
    .reset_index()
    .rename(
        columns={
            "contract_holder": "number_of_contracts",
            "end_date": "quarter of expiration",
        }
    )
)

In [78]:
end_dates_by_quarters["quarter of expiration"] = end_dates_by_quarters[
    "quarter of expiration"
].astype("str")

In [79]:
charts.basic_bar_chart(
    end_dates_by_quarters,
    "quarter of expiration",
    "number_of_contracts",
    "number_of_contracts",
    "Contract Expiration Dates",
)

### Separate contract type to look at elements within each contract.
* Functional category is less descriptive, looking at contract type instead.
* There are 71 different types because of all various combinations of GTFS, mobile ticketing, etc elements a contract can have. 
* Separating them out by commas might make it easier to to analyze.
* After splitting the combinations, only 32 cateogries.

In [80]:
f"{ contracts.contract_type.nunique()} unique contract types"

'71 unique contract types'

In [81]:
contracts = contracts.fillna("N/A")

In [82]:
contracts["contract_type"] = contracts["contract_type"].replace(
    {
        "Payment processor/Merchant services,Mobile ticketing": "Payment Processor or Merchant Services or Mobile Ticketing",
        "Payment processor/Merchant services": "Payment Processor or Merchant Services",
    }
)

In [83]:
# https://stackoverflow.com/questions/52575290/how-to-separate-string-into-multiple-rows-in-pandas
contract_type = contracts["contract_type"].str.split(",")
cols = contracts.columns.difference(["contract_type"])

In [84]:
contracts2 = contracts.loc[
    contracts.index.repeat(contract_type.str.len()), cols
].assign(contract_type_use=list(chain.from_iterable(contract_type.tolist())))

In [85]:
contracts2.contract_type_use.nunique()

32

In [86]:
contracts2.shape

(341, 11)

In [87]:
# just checking that everything is correct
contracts2.loc[
    contracts2["contract_type_use"] == "Payment Processor or Merchant Services"
]

Unnamed: 0,contract_holder,contract_name,contract_vendor,duration_of_contract_year,end_date,functional_category,notes,renewal_option,start_date,value,contract_type_use
19,City of Roseville,,Elavon,0.0,,,"Aware of vendors, but do not have a record of the actual contract\n",,,,Payment Processor or Merchant Services
38,Eastern Sierra Transit Authority,,Elavon,0.0,,Fare Payments,,,2018-02-01 00:00:00,,Payment Processor or Merchant Services
79,San Joaquin Regional Transit District,,FIS,0.0,,Fare Payments,,,2015-07-01 00:00:00,,Payment Processor or Merchant Services
95,Santa Cruz Metropolitan Transit District,,FIS,0.0,,Fare Payments,,,2017-10-13 00:00:00,,Payment Processor or Merchant Services
107,Tahoe Transportation District,,Square Inc.,0.0,,Offboard fares,"Aware of vendors, but do not have a record of the actual contract\n",,,,Payment Processor or Merchant Services
114,"""University of California, Davis""",,TouchNet,0.0,,,"Aware of vendors, but do not have a record of the actual contract\n",,,,Payment Processor or Merchant Services
117,Yuba-Sutter Transit Authority,,Elavon,0.0,,Fare Payments,"Aware of vendors, but do not have a record of the actual contract\n",,,,Payment Processor or Merchant Services



### Most common contract element
* Most contracts have an element of GTFS Generation, followed by Vehicle Locations, and Arrival Predictions.

In [88]:
most_common_contract_product = (
    contracts2.contract_type_use.value_counts()
    .to_frame()
    .reset_index()
    .rename(
        columns={
            "index": "element",
            "contract_type_use": "number_of_contracts with this element",
        }
    )
    .head(10)
)

In [89]:
charts.basic_bar_chart(
    most_common_contract_product,
    "number_of_contracts with this element",
    "element",
    "element",
    "Most Common Element in Contracts",
)

In [90]:
elements_only = contracts2["contract_type_use"]

### Word cloud from Natalie's notebook
def word_cloud_gen(df):
    test = " ".join(df).lower()
    wordcloud2 = WordCloud(
        width=800,
        height=400,
        stopwords=STOPWORDS,
        collocations=True,
        background_color="white",
    ).generate(test)

    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud2, interpolation="bilInear")
    plt.axis("off")
    plt.show()

word_cloud_gen(elements_only)

### Most popular vendors by contract awarded

In [91]:
vendors2 = (
    contracts.contract_vendor.value_counts()
    .to_frame()
    .reset_index()
    .head(5)
    .rename(columns={"index": "vendor", "contract_vendor": "number_of_contracts"})
)

In [92]:
charts.basic_bar_chart(
    vendors2,
    "number_of_contracts",
    "vendor",
    "vendor",
    "Vendors Awarded The Most Contracts",
)

### Organizations that hold the most contracts

In [93]:
contract_holders = (
    contracts.contract_holder.value_counts()
    .to_frame()
    .reset_index()
    .rename(columns={"index": "holders", "contract_holder": "# contracts"})
    .head(10)
)

In [94]:
charts.basic_bar_chart(
    contract_holders,
    "# contracts",
    "holders",
    "holders",
    "Organizations with the Most Contracts",
)

### Looking at contracts and their elements with end dates from May 2022 to 2024.
* Of the contracts ending, most have an element of vehicle locations and arrival predictions.

In [95]:
filtered_for_end_date.end_date.describe()



count                      39
unique                     33
top       2020-06-30 00:00:00
freq                        4
first     2010-09-21 00:00:00
last      2024-06-30 00:00:00
Name: end_date, dtype: object

In [96]:
# Keep only end dates from now until 2024.
end_dates = filtered_for_end_date.loc[
    (filtered_for_end_date["end_date"] >= "2022-05-03")
]

In [97]:
len(end_dates)

9

In [98]:
end_dates[
    [
        "contract_holder",
        "functional_category",
        "contract_type",
        "start_date",
        "end_date",
        "duration_of_contract_year",
    ]
].sort_values("end_date")

Unnamed: 0,contract_holder,functional_category,contract_type,start_date,end_date,duration_of_contract_year
149,Redwood Coast Transit Authority,CAD/AVL,"Vehicle Locations,Real-time service alerts,Arrival predictions,Realtime info",2019-05-03,2022-05-03,3.0
151,Anaheim Transportation Network,Offboard rider information,Realtime info,2017-05-17,2022-05-17,5.0
152,Trinity County,"Offboard rider information,Scheduling","GTFS Generation,Web trip planner",2019-08-01,2022-07-31,3.0
153,Tuolumne County Transit Agency,"CAD/AVL,Offboard rider information","MDT,Real-time service alerts,Vehicle Locations,Arrival predictions,Web trip planner ,Realtime info",2019-05-08,2023-05-07,4.0
154,Ventura County Transportation Commission,"Offboard rider information,CAD/AVL,Onboard rider information","Headsigns,Interior signage,Annunciator,MDT,Arrival predictions,Vehicle Locations,Mobile trip planner",2017-11-03,2023-11-02,6.0
155,Yuba-Sutter Transit Authority,"CAD/AVL,Onboard rider information","Vehicle Locations,Arrival predictions,Headsigns,Interior signage",2018-11-07,2023-11-06,5.0
156,Santa Cruz Metropolitan Transit District,"Onboard rider information,Offboard rider information,CAD/AVL","Realtime info,Annunciator,Headsigns,Interior signage,MDT,Vehicle Locations,Arrival predictions,Real-time service alerts",2018-11-29,2023-11-28,5.0
157,Marin County Transit District,Scheduling,Scheduling,2019-07-01,2024-06-30,5.0
158,Monterey-Salinas Transit,"CAD/AVL,Onboard rider information,Offboard rider information","Annunciator,Interior signage,MDT,Real-time service alerts,Vehicle Locations,Arrival predictions,Offboard signage",2019-05-01,2024-06-30,5.0


## Export

with pd.ExcelWriter(f"{GCS_FILE_PATH}transit_stacks.xlsx") as writer:
    products.to_excel(writer, sheet_name="products_clean", index=False)
    popular_products.to_excel(writer, sheet_name="popular_products", index=False)
    components.to_excel(writer, sheet_name="components_clean", index=False)
    contracts.to_excel(writer, sheet_name="contracts_clean", index=False)
    contracts2.to_excel(writer, sheet_name="contracts_delineated", index=False)