# Cleaning
* Taking a first look and cleaning the data.
* [Data Source](https://airtable.com/appeVUdmRBi3K9hTS/tblLywLvMA2OTesQP/viwRRKOaZvvkSNfmU?blocks=hide)

In [1]:
import numpy as np
import pandas as pd

pd.options.display.max_columns = 50
pd.options.display.max_rows = 250
pd.set_option("display.max_colwidth", None)
pd.options.display.float_format = "{:.2f}".format

from itertools import chain

from calitp import *
from siuba import *

GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/transit_stacks/"

## Products

In [2]:
# drop columns with tons of NAs
products = (
    to_snakecase(pd.read_csv(f"{GCS_FILE_PATH}products-Grid view (1).csv"))
    .drop(
        columns=[
            "business_model_features",
            "attachments",
            "status",
            "certifications",
            "connectivity",
            "accepted_input_components",
            "output_components",
            "input",
            "output",
        ]
    )
    .astype(str)
)

In [3]:
products.shape

(259, 8)

In [4]:
products.isna().sum()

name                             0
components                       0
vendor                           0
url                              0
requirements                     0
product_features                 0
notes                            0
organization_stack_components    0
dtype: int64

### Count number of strings in organization_stack_components column to see how many orgs are using this vendor.

In [5]:
# https://stackoverflow.com/questions/51502263/pandas-dataframe-object-has-no-attribute-str
products["count_of_orgs_using_product"] = (
    products["organization_stack_components"]
    .str.split(",+")
    .str.len()
    .groupby(products.name)
    .transform("sum")
)

### Each company can sell multiple components, split each component by comma 

In [6]:
# https://stackoverflow.com/questions/52575290/how-to-separate-string-into-multiple-rows-in-pandas
components = products["components"].str.split(",")

In [7]:
cols = products.columns.difference(["components"])

In [8]:
products2 = products.loc[products.index.repeat(components.str.len()), cols].assign(
    components=list(chain.from_iterable(components.tolist()))
)

In [9]:
products2.columns

Index(['count_of_orgs_using_product', 'name', 'notes',
       'organization_stack_components', 'product_features', 'requirements',
       'url', 'vendor', 'components'],
      dtype='object')

### Make the dataframe a little cleaner/smaller

In [10]:
products3 = products2[
    [
        "name",
        "vendor",
        "components",
        "count_of_orgs_using_product",
        "product_features",
        "requirements",
        "notes",
    ]
].rename(columns={"name": "product_name"})

In [11]:
products3.tail(2)

Unnamed: 0,product_name,vendor,components,count_of_orgs_using_product,product_features,requirements,notes
258,RouteMatch: Rider Web Portal,Uber Inc.,Web-based trip planner,1,Cloud-based,"RouteMatch Mobility, Pay, Demand\n","Book a trip (multimodal or point-to-point) according to agency business rules\nView vehicle location and real-time trip information\nAdd guests, attendants, and service animals; specify other service needs\nEnter service eligibility and mobility information\n\nThe Rider Web Portal is one of Pay’s rider facing components, allowing riders to:\nCalculate fares, add funds to account, and set notifications\nManage payment options; activate or deactivate smart cards and other fare media\nView transaction and trip histories\n"
258,RouteMatch: Rider Web Portal,Uber Inc.,Real-time info,1,Cloud-based,"RouteMatch Mobility, Pay, Demand\n","Book a trip (multimodal or point-to-point) according to agency business rules\nView vehicle location and real-time trip information\nAdd guests, attendants, and service animals; specify other service needs\nEnter service eligibility and mobility information\n\nThe Rider Web Portal is one of Pay’s rider facing components, allowing riders to:\nCalculate fares, add funds to account, and set notifications\nManage payment options; activate or deactivate smart cards and other fare media\nView transaction and trip histories\n"


In [12]:
products3.shape

(417, 7)

## Components

In [13]:
components = to_snakecase(pd.read_csv(f"{GCS_FILE_PATH}components-Grid view.csv"))

In [14]:
components.isna().sum()

name                               0
aliases                           95
system                            73
location                           1
function_group                     3
description                       88
products                          37
organization_stack_components     82
example_stacks                    94
example_stacks_copy              107
properties_+_features             96
dtype: int64

In [15]:
components.shape

(107, 11)

### Count number of products in each category 

In [58]:
# https://stackoverflow.com/questions/51502263/pandas-dataframe-object-has-no-attribute-str
components["products_in_categories"] = (
    components["products"]
    .str.split(",+")
    .str.len()
    .groupby(components.name)
    .transform("sum")
)

## Merge components with products

In [17]:
# Join
merge1 = pd.merge(
    products3,
    components,
    how="outer",
    left_on=["components"],
    right_on=["name"],
    indicator=True,
)

In [18]:
merge1._merge.value_counts()

both          405
right_only     38
left_only      12
Name: _merge, dtype: int64

In [19]:
merge2 = merge1.loc[merge1["_merge"] == "both"]

In [20]:
merge2 = merge1[
    [
        "product_name",
        "vendor",
        "components",
        "system",
        "location",
        "function_group",
        "description",
        "requirements",
        "product_features",
        "notes",
        "properties_+_features",
        "count_of_orgs_using_product",
    ]
]

In [21]:
merge2.shape

(455, 12)

In [22]:
merge2.sample(3)

Unnamed: 0,product_name,vendor,components,system,location,function_group,description,requirements,product_features,notes,properties_+_features,count_of_orgs_using_product
127,Dilax (Unspecified Model),Dilax Inc.,APC Sensors,APC,Vehicle,Operations,,,,,,9.0
197,Conduent ATLAS,Conduent Inc,Mobile ticketing,,Offsite,Fare collection,,,,,,1.0
420,,,,,Vehicle,Operations,,,,,,


## Contracts

In [23]:
contracts = to_snakecase(pd.read_csv(f"{GCS_FILE_PATH}contracts-Grid view.csv")).drop(
    columns=["attachments", "organization_stack_components", "name"]
)

In [35]:
len(contracts)

128

In [53]:
f'{ contracts.contract_holder.nunique()} organizations in contracts data set'

'51 organizations in contracts data set'

In [54]:
f'{ contracts.contract_vendor.nunique()} vendors in contracts data set'

'37 vendors in contracts data set'

In [27]:
contracts = contracts.assign(
    start_date=pd.to_datetime(contracts.start_date, errors="coerce"),
    end_date=pd.to_datetime(contracts.end_date, errors="coerce"),
)

In [29]:
contracts["duration_of_contract_year"] = (
    (contracts["end_date"] - contracts["start_date"]).dt.days
) / 365

In [57]:
contracts.renewal_option.value_counts()

None           125
Auto-renews      3
Name: renewal_option, dtype: int64

In [52]:
#Average contract length in years
contracts['duration_of_contract_year'].median()

3.0027397260273974

In [30]:
contracts.sample(3)

Unnamed: 0,contract_holder,contract_vendor,contract_name,type_of_contract:_functional_category,type_of_contract:_functions,start_date,end_date,renewal_option,value,notes,duration_of_contract_year
103,Santa Cruz Metropolitan Transit District,GMV Syncromatics Inc,Label used for the procurement.,"Onboard rider information,Offboard rider information,CAD/AVL","Realtime info,Annunciator,Headsigns,Interior signage,MDT,Vehicle Locations,Arrival predictions,Real-time service alerts",2018-11-29,2023-11-28,,,,5.0
53,El Dorado County Transit Authority,Trillium Inc.,Label used for the procurement.,Scheduling,"Web trip planner ,Mobile trip planner ,GTFS Generation",2015-02-23,NaT,,,,
104,Santa Cruz Metropolitan Transit District,Masabi,Label used for the procurement.,"Offboard fares,Fare Payments","Payment processor/Merchant services,Mobile ticketing",2019-07-23,2020-07-23,,,Option to renew for two one year terms,1.0


# Answering some questions

## What % of vendors with scheduling software also provide GTFS data out of the box?

In [44]:
gtfs_schedule_overlap = products[
    products["components"].str.contains(
        "GTFS & Schedule|Scheduling",
        case=False,
    )
]

In [45]:
gtfs_schedule_overlap2 = gtfs_schedule_overlap.groupby(
    ["vendor", "name", "components"]
).agg({"count_of_orgs_using_product": "max"})

In [46]:
gtfs_schedule_overlap2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count_of_orgs_using_product
vendor,name,components,Unnamed: 3_level_1
Ecolane,Ecolane (Unspecified Model),"Scheduling (Demand-Responsive),Scheduling (Fixed-route),Mobile trip planning app",4
Giro Inc.,Giro HASTUS,Scheduling (Fixed-route),35
INIT SE,INIT (Unspecified Model),"Driver Sign-up,Scheduling (Fixed-route),KPI Reporting,Cash Farebox,Mobile ticketing,Real-time info",38
Optibus,Optibus,"Scheduling (Fixed-route),GTFS Schedule Publishing,GTFS generation",10
SMA and Partners Ltd.,Viriato,Scheduling (Fixed-route),1
The Master Scheduler,The Master Scheduler,Scheduling (Fixed-route),5
Trapeze Group,Trapeze - Unspecified,Scheduling (Fixed-route),23
Trapeze Group,Trapeze TripSpark Streets,"Scheduling (Fixed-route),Computer Automated Dispatch (Responsive),Real-time info",1
TripShot Inc.,TripShot - Unspecified,"KPI Reporting,Scheduling (Fixed-route),Scheduling (Demand-Responsive),Mobile trip planning app,Real-time info",1
Uber Inc.,RouteMatch - Fixed: Base,"Scheduling (Fixed-route),Computer Automated Dispatch (Fixed),Real-time info,KPI Reporting",18


## Most common product category

In [63]:
components[['name','products_in_categories']].sort_values('products_in_categories', ascending = False)

Unnamed: 0,name,products_in_categories
70,Real-time info,32.0
53,Mobile trip planning app,19.0
9,AVL On-board Computer,19.0
14,Cash Farebox,17.0
56,Offboard signage,16.0
5,APC Sensors,14.0
66,Payment processor,14.0
52,Mobile ticketing,13.0
46,KPI Reporting,12.0
105,Web-based trip planner,12.0
