# Cleaning
* Taking a first look and cleaning the data.
* [Data Source](https://airtable.com/appeVUdmRBi3K9hTS/tblLywLvMA2OTesQP/viwRRKOaZvvkSNfmU?blocks=hide)

In [1]:
import numpy as np
import pandas as pd

pd.options.display.max_columns = 50
pd.options.display.max_rows = 250
pd.set_option("display.max_colwidth", None)
pd.options.display.float_format = "{:.2f}".format

from itertools import chain

import altair as alt
from calitp import *
from siuba import *

GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/transit_stacks/"

## Products

In [2]:
# drop columns with tons of NAs
products = (
    to_snakecase(pd.read_csv(f"{GCS_FILE_PATH}products-Grid view (1).csv"))
    .drop(
        columns=[
            "business_model_features",
            "attachments",
            "status",
            "certifications",
            "connectivity",
            "accepted_input_components",
            "output_components",
            "input",
            "output",
        ]
    )
    .astype(str)
    .rename(columns={"name": "product_name"})
)

In [3]:
products.shape

(259, 8)

In [4]:
products.isna().sum()

product_name                     0
components                       0
vendor                           0
url                              0
requirements                     0
product_features                 0
notes                            0
organization_stack_components    0
dtype: int64

### Count number of strings in organization_stack_components column to see how many orgs are using this vendor.

In [5]:
# https://stackoverflow.com/questions/51502263/pandas-dataframe-object-has-no-attribute-str
products["count_of_orgs_using_product"] = (
    products["organization_stack_components"]
    .str.split(",+")
    .str.len()
    .groupby(products.product_name)
    .transform("sum")
)

### Each company can sell multiple components, split each component by comma 

In [6]:
# https://stackoverflow.com/questions/52575290/how-to-separate-string-into-multiple-rows-in-pandas
components = products["components"].str.split(",")
cols = products.columns.difference(["components"])
products2 = products.loc[products.index.repeat(components.str.len()), cols].assign(
    components=list(chain.from_iterable(components.tolist()))
)

In [7]:
products2.columns

Index(['count_of_orgs_using_product', 'notes', 'organization_stack_components',
       'product_features', 'product_name', 'requirements', 'url', 'vendor',
       'components'],
      dtype='object')

### Make the dataframe a little cleaner/smaller

In [8]:
products3 = products2[
    [
        "product_name",
        "vendor",
        "components",
        "count_of_orgs_using_product",
        "product_features",
        "requirements",
        "notes",
    ]
]

In [49]:
products3.sample(2)

Unnamed: 0,product_name,vendor,components,count_of_orgs_using_product,product_features,requirements,notes
249,Fleetwatch - System,S&A Systems Inc.,Vehicle Health Monitoring,1,,,
231,Trapeze Fixed Route Scheduling,Trapeze Group,Driver Sign-up,74,,,


In [10]:
products3.shape

(417, 7)

## Components

In [11]:
components = to_snakecase(pd.read_csv(f"{GCS_FILE_PATH}components-Grid view.csv"))

In [12]:
components.isna().sum()

name                               0
aliases                           95
system                            73
location                           1
function_group                     3
description                       88
products                          37
organization_stack_components     82
example_stacks                    94
example_stacks_copy              107
properties_+_features             96
dtype: int64

In [13]:
components.shape

(107, 11)

### Count number of products in each category 

In [14]:
# https://stackoverflow.com/questions/51502263/pandas-dataframe-object-has-no-attribute-str
components["count_of_products_in_categories"] = (
    components["products"]
    .str.split(",+")
    .str.len()
    .groupby(components.name)
    .transform("sum")
)

In [50]:
components.sample(1)

Unnamed: 0,name,aliases,system,location,function_group,description,products,organization_stack_components,example_stacks,example_stacks_copy,properties_+_features,count_of_products_in_categories
20,Driver Microphone,,,Vehicle,Operations,,,,Extra Small-Driver Microphone,,,0.0


## Merge components with products

In [16]:
# Join
merge1 = pd.merge(
    products3,
    components,
    how="outer",
    left_on=["components"],
    right_on=["name"],
    indicator=True,
)

In [17]:
merge1._merge.value_counts()

both          405
right_only     38
left_only      12
Name: _merge, dtype: int64

In [18]:
merge2 = merge1.loc[merge1["_merge"] == "both"]

In [19]:
merge2 = merge1[
    [
        "product_name",
        "vendor",
        "components",
        "system",
        "location",
        "function_group",
        "description",
        "requirements",
        "product_features",
        "notes",
        "properties_+_features",
        "count_of_orgs_using_product",
    ]
]

In [20]:
merge2.shape

(455, 12)

In [21]:
merge2.sample(3)

Unnamed: 0,product_name,vendor,components,system,location,function_group,description,requirements,product_features,notes,properties_+_features,count_of_orgs_using_product
170,Clever Devices - Unspecified,Clever Devices Ltd.,AVL Software,CAD/AVL,Backoffice,Operations,Automatic Vehicle Location (AVL) records and tracks the locations of your vehicles,,,,Latency,50.0
147,Swiftly Metronome,Swiftly Inc.,APC Software,APC,Backoffice,Reporting,,,Cloud-based,"Swiftly Metronome ensures your operations staff have visibility over your entire fixed-route fleet and can manage operations in real time, regardless of the hardware on board today or in the future.\n\nLive Operations - Get an up-to-the-second picture of early, late, and bunched vehicles in your system\n\nOnboard App - An intuitive onboard display that runs on commodity Android and iOS tablets\n\nService Adjustments - Dynamically manage scheduled service changes and unforeseen disruptions\n\nAPC Connector - Connect your APC units directly to the internet for real-time crowding and easy-to-access historical data\n\nAVAS Connector- Bring real-time information to your passengers through onboard audio and visual announcements in partnership with Way Sine",,19.0
169,Cubic - Unspecified,,AVL Software,CAD/AVL,Backoffice,Operations,Automatic Vehicle Location (AVL) records and tracks the locations of your vehicles,,,,Latency,13.0


## Contracts

In [22]:
contracts = (
    to_snakecase(pd.read_csv(f"{GCS_FILE_PATH}contracts-Grid view.csv"))
    .drop(columns=["attachments", "organization_stack_components", "name"])
    .rename(
        columns={
            "type_of_contract:_functional_category": "functional_category",
            "type_of_contract:_functions": "contract_type",
        }
    )
)

In [23]:
f"{ contracts.contract_holder.nunique()} organizations in contracts data set"

'51 organizations in contracts data set'

In [24]:
f"{ contracts.contract_vendor.nunique()} vendors in contracts data set"

'37 vendors in contracts data set'

In [25]:
contracts.renewal_option.value_counts()

None           125
Auto-renews      3
Name: renewal_option, dtype: int64

In [26]:
contracts = contracts.assign(
    start_date=pd.to_datetime(contracts.start_date, errors="coerce"),
    end_date=pd.to_datetime(contracts.end_date, errors="coerce"),
)

In [27]:
contracts["duration_of_contract_year"] = (
    (contracts["end_date"] - contracts["start_date"]).dt.days
) / 365

In [28]:
# Average contract length in years
filtered_for_end_date = contracts[contracts["end_date"].notnull()]
filtered_for_end_date["duration_of_contract_year"].median()

3.0027397260273974

In [29]:
# https://stackoverflow.com/questions/52575290/how-to-separate-string-into-multiple-rows-in-pandas
contract_type = contracts["contract_type"].str.split(",")
cols = contracts.columns.difference(["contract_type"])

In [30]:
contracts_delinated = contracts.loc[
    contracts.index.repeat(contract_type.str.len()), cols
].assign(contract_type_use=list(chain.from_iterable(contract_type.tolist())))

In [31]:
contracts_delinated.sample(3)

Unnamed: 0,contract_holder,contract_name,contract_vendor,duration_of_contract_year,end_date,functional_category,notes,renewal_option,start_date,value,contract_type_use
42,City of Visalia,Label used for the procurement.,GMV Syncromatics Inc,,NaT,"CAD/AVL,Passenger counting,Onboard rider information,Offboard rider information",,,2016-06-09,,MDT
94,Santa Barbara Metropolitan Transit District,Label used for the procurement.,Genfare,,NaT,"Offboard fares,Onboard fares",,,2016-12-27,,Farebox
64,Monterey-Salinas Transit,Label used for the procurement.,Trapeze Group,5.17,2024-06-30,"CAD/AVL,Onboard rider information,Offboard rider information",,,2019-05-01,,Real-time service alerts


# Answering some questions

## Products
### What % of vendors with scheduling software also provide GTFS data out of the box?
* Doesn't seem like any vendors/products contain both GTFS and schedule in the components section.

In [32]:
# https://stackoverflow.com/questions/26577516/how-to-test-if-a-string-contains-one-of-the-substrings-in-a-list-in-pandas
searchfor = ["GTFS", "schedule", "Scheduling", "Schedule", "scheduling"]

In [33]:
gtfs_schedule_overlap = products[
    products["components"].str.contains(
        "&".join(searchfor),
        case=False,
    )
]

In [34]:
gtfs_schedule_overlap

Unnamed: 0,product_name,components,vendor,url,requirements,product_features,notes,organization_stack_components,count_of_orgs_using_product


### 

## Components
### Top & bottom 10 product categories
* Real-time info is the most "crowded" category with 32 different products.
* Most categories only have one product.

In [35]:
def bar_chart(df, x_col, y_col):
    chart = (
        alt.Chart(df)
        .mark_bar()
        .encode(
            x=x_col,
            y=y_col,
            color=alt.Color(
                x_col, scale=alt.Scale(scheme="tealblues")
            ),
        )
    )
    return chart

In [36]:
product_categories = components[
    ["name", "count_of_products_in_categories"]
].sort_values("count_of_products_in_categories", ascending=False)

In [37]:
# filter out any categories with 0 products
product_categories = product_categories[
    product_categories["count_of_products_in_categories"] > 0
]

In [38]:
f"{product_categories.name.nunique()} unique categories"

'70 unique categories'

In [39]:
product_categories.head()

Unnamed: 0,name,count_of_products_in_categories
70,Real-time info,32.0
53,Mobile trip planning app,19.0
9,AVL On-board Computer,19.0
14,Cash Farebox,17.0
56,Offboard signage,16.0


In [40]:
f"Median number of different products in a category is {product_categories.count_of_products_in_categories.median()}"

'Median number of different products in a category is 4.0'

In [41]:
most_saturated_category = product_categories.head(10).rename(
    columns={"name": "category"}
)

In [42]:
bar_chart(most_saturated_category, 'count_of_products_in_categories', 'category')

## Looking at contracts
### Most common products bought
* Most contracts have an element of GTFS Generation, followed by Vehicle Locations, and Arrival Predictions.

In [43]:
most_common_contract_product = (
    contracts_delinated.contract_type_use.value_counts()
    .to_frame()
    .reset_index()
    .rename(
        columns={"index": "product_type", "contract_type_use": "number_of_contracts"}
    )
    .head(10)
)

In [44]:
bar_chart(most_common_contract_product, 'number_of_contracts', 'product_type')

### Most popular vendors

In [45]:
vendors = (
    contracts.contract_vendor.value_counts()
    .to_frame()
    .reset_index()
    .head(10)
    .rename(columns={"index": "vendor", "contract_vendor": "number_of_contracts"})
)

In [46]:

bar_chart(vendors, 'number_of_contracts', 'vendor')