# Taking a first look and cleaning the data.
* [Data Source](https://airtable.com/appeVUdmRBi3K9hTS/tblLywLvMA2OTesQP/viwRRKOaZvvkSNfmU?blocks=hide)
* [Term Explanations](https://docs.calitp.org/data-infra/datasets_and_tables/transitdatabase.html)

In [1]:
import numpy as np
import pandas as pd

pd.options.display.max_columns = 50
pd.options.display.max_rows = 250
pd.set_option("display.max_colwidth", None)
pd.options.display.float_format = "{:.2f}".format

from itertools import chain
from os import path

import altair as alt
import charts
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from calitp import *
from PIL import Image
from shared_utils import altair_utils
from siuba import *

# from wordcloud import STOPWORDS, ImageColorGenerator, WordCloud

GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/transit_stacks/"



## Products Data

In [2]:
# drop columns with tons of NAs
products = (
    to_snakecase(pd.read_csv(f"{GCS_FILE_PATH}products_May_3.csv"))
    .drop(
        columns=[
            "business_model_features",
            "attachments",
            "status",
            "certifications",
            "connectivity",
            "accepted_input_components",
            "output_components",
            "input",
            "output",
        ]
    )
    .rename(columns={"name": "product_name"})
)

In [3]:
products.columns

Index(['product_name', 'components', 'vendor', 'url', 'requirements',
       'product_features', 'notes', 'organization_stack_components'],
      dtype='object')

In [4]:
products.isna().sum()

product_name                       0
components                         9
vendor                            40
url                               70
requirements                     233
product_features                 180
notes                            173
organization_stack_components    103
dtype: int64

In [5]:
### Count number of strings by column in organization_stack_components column to see how many orgs are using this vendor.
# https://stackoverflow.com/questions/51502263/pandas-dataframe-object-has-no-attribute-str
products["count_of_orgs_using_product"] = (
    products["organization_stack_components"]
    .str.split(",+")
    .str.len()
    .groupby(products.product_name)
    .transform("sum")
)

In [6]:
unique_vendors = products.vendor.nunique()
f"{unique_vendors} unique vendors"

'83 unique vendors'

In [7]:
f"{products.product_name.nunique()} unique products"

'259 unique products'

In [8]:
products.sample(1)

Unnamed: 0,product_name,components,vendor,url,requirements,product_features,notes,organization_stack_components,count_of_orgs_using_product
252,Trillium Trip Planner,Web-based trip planner,Trillium Inc.,https://trilliumtransit.com/consulting/web-design/,,,,"""Tahoe Truckee Area Regional Transportation, Tahoe Transportation Paratransit-Trillium Trip Planner-Web-based trip planner"",""El Dorado Transit, Sacramento/South Lake Tahoe Connector Bus, El Dorado Dial-A-Ride, SAC-MED, El Dorado Paratransit-Trillium Trip Planner-Web-based trip planner""",7.0


In [9]:
products = products.fillna("N/A")

### What % of vendors with scheduling software also provide GTFS data out of the box? 

In [10]:
# https://stackoverflow.com/questions/47125076/how-to-filter-rows-containing-specific-string-values-with-an-and-operator
gtfs_scheduling = products[
    products["components"].str.contains("GTFS")
    & products["components"].str.contains("Scheduling")
]
gtfs_scheduling[
    ["product_name", "components", "vendor", "count_of_orgs_using_product", "notes"]
]

Unnamed: 0,product_name,components,vendor,count_of_orgs_using_product,notes
128,TripShot - Unspecified,"KPI Reporting,Scheduling (Fixed-route),Scheduling (Demand-Responsive),Mobile trip planning app,Real-time info,GTFS generation",TripShot Inc.,1.0,
234,Giro HASTUS,"Scheduling (Fixed-route),GTFS generation",Giro Inc.,35.0,
235,The Master Scheduler,"Scheduling (Fixed-route),GTFS generation",The Master Scheduler,5.0,http://themasterscheduler.com/whitepapers/TMS2GT.pdf
236,Trapeze - Unspecified,"Scheduling (Fixed-route),GTFS generation",Trapeze Group,23.0,
237,Optibus,"Scheduling (Fixed-route),GTFS Schedule Publishing,GTFS generation",Optibus,10.0,


In [11]:
vendors_gtfs_scheduling = gtfs_scheduling.vendor.nunique()
vendors_gtfs_scheduling

5

In [12]:
f"About {(vendors_gtfs_scheduling/unique_vendors)} of vendors offer both GTFS generation and scheduling software"

'About 0.060240963855421686 of vendors offer both GTFS generation and scheduling software'

### Vendors with the most customers

In [13]:
# filter out for any products w/o vendor info
no_vendor_nulls = products.loc[products["vendor"] != "N/A"]

In [17]:
vendors_with_most_customers = (
    no_vendor_nulls.groupby("vendor")
    .agg({"count_of_orgs_using_product": "sum"})
    .sort_values("count_of_orgs_using_product", ascending=False)
    .reset_index()
    .head(5)
    .rename(columns={"count_of_orgs_using_product": "total customers"})
)

In [18]:
vendors_with_most_customers

Unnamed: 0,vendor,total customers
0,Clever Devices Ltd.,160.0
1,GMV Syncromatics Inc,147.0
2,Trapeze Group,139.0
3,Genfare,110.0
4,Cubic,106.0


In [21]:
charts.basic_bar_chart(
    vendors_with_most_customers,
    "total customers",
    "vendor",
    "vendor",
    "Vendors with the Most Customers",
)

### Vendor with the most products


In [22]:
vendors_with_most_products = (
    no_vendor_nulls.vendor.value_counts()
    .to_frame()
    .head(5)
    .reset_index()
    .rename(columns={"vendor": "total products", "index": "vendor"})
)

In [24]:
charts.basic_bar_chart(
    vendors_with_most_products,
    "vendor",
    "total products",
    "vendor",
    "Vendors with the most Products",
)

### Most popular products in general
* Metric: count_of_orgs_using_product column is the count of strings delinated by comma in the organization stacks component column for each product.
    * Assume that each value is a separate organization.
* Genfare FareBox has 94 values, making it the most popular product
* Many organizations rely on "in house activity." 

In [None]:
most_popular = (
    products[
        ["product_name", "vendor", "components", "count_of_orgs_using_product", "notes"]
    ]
    .sort_values("count_of_orgs_using_product", ascending=False)
    .head(10)
    .rename(
        columns={
            "count_of_orgs_using_product": "number_of_organizations_using_the_product"
        }
    )
)

In [None]:
charts.basic_bar_chart(
    most_popular,
    "number_of_organizations_using_the_product",
    "product_name",
    "product_name",
    "Most Popular Products",
)

### Most popular products by component type and # of organizations 
* Break out components by comma.
* Most popular determined by count of organizations using a particularly product.
* Only keep that product.

In [None]:
products2 = products.copy()

In [None]:
# Separate out components.
components = products2["components"].str.split(",")
cols = products2.columns.difference(["components"])

products2 = products2.loc[products.index.repeat(components.str.len()), cols].assign(
    components=list(chain.from_iterable(components.tolist()))
)

In [None]:
# Strip extra quotes that appear around Security System, replace it with a neater value
products2 = products2.replace('"', "", regex=True)

In [None]:
products2["components"] = products2["components"].replace(
    {
        "Security System\n": "Security System",
    }
)

In [None]:
# filter out any products with 0 organizations in org stacks
popular_products = products2.loc[products2["count_of_orgs_using_product"] > 0]

In [None]:
# Only keep relevant columns, sort dataframe by A-Z using components
popular_products = products2[
    ["components", "vendor", "product_name", "count_of_orgs_using_product"]
].sort_values("components", ascending=True)

In [None]:
# Keep only the product within a component category with the highest # of organizations using this product.
# https://stackoverflow.com/questions/53842287/select-rows-with-highest-value-from-groupby
popular_products = (
    popular_products.drop_duplicates()
    .sort_values(["components", "count_of_orgs_using_product"], ascending=False)
    .groupby(["components"])
    .first()
    .reset_index()
    .rename(columns={"components": "component", "product_name": "most_popular_product"})
)

In [None]:
# sorting vendors by how many times they appear having the most customers in a component category.
popular_products

## Components Data

In [None]:
components = to_snakecase(pd.read_csv(f"{GCS_FILE_PATH}components_May_3.csv"))

In [None]:
components.shape

In [None]:
components.isna().sum()

In [None]:
# Count number of products within each component category.
# https://stackoverflow.com/questions/51502263/pandas-dataframe-object-has-no-attribute-str
components["count_of_products_in_categories"] = (
    components["products"]
    .str.split(",+")
    .str.len()
    .groupby(components.name)
    .transform("sum")
)

In [None]:
# Double Checking
components.loc[components["name"] == "APC Sensors"]

### Top 10 "crowded" product categories
* Count number of strings in the "products" column and group by "name" column to get total number of products in each sector.
* Real-time info is the most "crowded" category with 32 different products.
* Most categories only have one product.
* About 4 unique products in each category, when filtering out any categories with 0 products recorded.

In [None]:
f"{components.name.nunique()} unique categories"

In [None]:
product_categories = (
    components[["name", "count_of_products_in_categories"]]
    .sort_values("count_of_products_in_categories", ascending=False)
    .rename(columns={"name": "category"})
)

In [None]:
# filter out any categories with 0 products - 36 values are empty
product_categories = product_categories[
    product_categories["count_of_products_in_categories"] > 0
]

In [None]:
f"{product_categories.count_of_products_in_categories.median()} average products per category"

In [None]:
most_saturated_category = product_categories.head(10).rename(
    columns={"count_of_products_in_categories": "number of unique products"}
)

In [None]:
charts.basic_bar_chart(
    most_saturated_category,
    "number of unique products",
    "category",
    "category",
    "Most Saturated Categories",
)

### Function Groups
* Most of the products are under the "operations" group.

In [None]:
components.function_group.value_counts()

## Contracts Data

In [None]:
contracts = (
    to_snakecase(pd.read_csv(f"{GCS_FILE_PATH}Contracts_May_11.csv"))
    .drop(columns=["attachments", "organization_stack_components", "name"])
    .rename(
        columns={
            "type_of_contract:_functional_category": "functional_category",
            "type_of_contract:_functions": "contract_type",
        }
    )
)

In [None]:
contracts.shape

In [None]:
contracts.isna().sum()

In [None]:
# First row of data is just N/A, so dropping it here.
contracts = contracts.dropna(subset=["contract_holder"])

In [None]:
len(contracts)

In [None]:
f"{ contracts.contract_holder.nunique()} organizations in contracts data set"

In [None]:
f"{ contracts.contract_vendor.nunique()} vendors in contracts data set"

### 125 contracts have none/no record for renewal options, 4 auto-renews

In [None]:
contracts.renewal_option.value_counts()

### Looking at contract duration:
* Contracts with an end date value populated, average duration is 3 years.

In [None]:
# Editing date time cols to the right data type
contracts = contracts.assign(
    start_date=pd.to_datetime(contracts.start_date, errors="coerce"),
    end_date=pd.to_datetime(contracts.end_date, errors="coerce"),
)

In [None]:
# new column for duration of contract year.
contracts["duration_of_contract_year"] = (
    (contracts["end_date"] - contracts["start_date"]).dt.days
) / 365

In [None]:
# fill in NAs with 0 and round to 0
contracts["duration_of_contract_year"] = (
    contracts["duration_of_contract_year"].fillna(0).round(0)
)

In [None]:
# new df for rows with end dates
filtered_for_end_date = contracts[contracts["end_date"].notnull()]

In [None]:
duration = (
    filtered_for_end_date.groupby("duration_of_contract_year")
    .agg({"contract_holder": "count"})
    .reset_index()
    .rename(
        columns={
            "duration_of_contract_year": "contract length",
            "contract_holder": "number_of_contracts",
        }
    )
)

In [None]:
filtered_for_end_date["duration_of_contract_year"].median()

In [None]:
len(filtered_for_end_date)

In [None]:
end_dates_by_quarters = (
    filtered_for_end_date.groupby(
        pd.to_datetime(filtered_for_end_date["end_date"], format="%b-%Y").dt.to_period(
            "Q"
        )
    )["contract_holder"]
    .count()
    .to_frame()
    .reset_index()
    .rename(
        columns={
            "contract_holder": "number_of_contracts",
            "end_date": "quarter of expiration",
        }
    )
)

In [None]:
end_dates_by_quarters["quarter of expiration"] = end_dates_by_quarters[
    "quarter of expiration"
].astype("str")

In [None]:
charts.basic_bar_chart(
    end_dates_by_quarters,
    "quarter of expiration",
    "number_of_contracts",
    "number_of_contracts",
    "Contract Expiration Dates",
)

### Separate contract type to look at elements within each contract.
* Functional category is less descriptive, looking at contract type instead.
* There are 71 different types because of all various combinations of GTFS, mobile ticketing, etc elements a contract can have. 
* Separating them out by commas might make it easier to to analyze.
* After splitting the combinations, only 32 cateogries.

In [None]:
f"{ contracts.contract_type.nunique()} unique contract types"

In [None]:
contracts = contracts.fillna("N/A")

In [None]:
contracts["contract_type"] = contracts["contract_type"].replace(
    {
        "Payment processor/Merchant services,Mobile ticketing": "Payment Processor or Merchant Services or Mobile Ticketing",
        "Payment processor/Merchant services": "Payment Processor or Merchant Services",
    }
)

In [None]:
# https://stackoverflow.com/questions/52575290/how-to-separate-string-into-multiple-rows-in-pandas
contract_type = contracts["contract_type"].str.split(",")
cols = contracts.columns.difference(["contract_type"])

In [None]:
contracts2 = contracts.loc[
    contracts.index.repeat(contract_type.str.len()), cols
].assign(contract_type_use=list(chain.from_iterable(contract_type.tolist())))

In [None]:
contracts2.contract_type_use.nunique()

In [None]:
contracts2.shape

In [None]:
# just checking that everything is correct
contracts2.loc[
    contracts2["contract_type_use"] == "Payment Processor or Merchant Services"
]


### Most common contract element
* Most contracts have an element of GTFS Generation, followed by Vehicle Locations, and Arrival Predictions.

In [None]:
most_common_contract_product = (
    contracts2.contract_type_use.value_counts()
    .to_frame()
    .reset_index()
    .rename(
        columns={
            "index": "element",
            "contract_type_use": "number_of_contracts with this element",
        }
    )
    .head(10)
)

In [None]:
charts.basic_bar_chart(
    most_common_contract_product,
    "number_of_contracts with this element",
    "element",
    "element",
    "Most Common Element in Contracts",
)

In [None]:
elements_only = contracts2["contract_type_use"]

# Word cloud from Natalie's notebook
def word_cloud_gen(df):
    test = " ".join(df).lower()
    wordcloud2 = WordCloud(
        width=800,
        height=400,
        stopwords=STOPWORDS,
        collocations=True,
        background_color="white",
    ).generate(test)

    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud2, interpolation="bilInear")
    plt.axis("off")
    plt.show()

word_cloud_gen(elements_only)

### Most popular vendors by contract awarded

In [None]:
vendors2 = (
    contracts.contract_vendor.value_counts()
    .to_frame()
    .reset_index()
    .head(5)
    .rename(columns={"index": "vendor", "contract_vendor": "number_of_contracts"})
)

In [None]:
charts.basic_bar_chart(
    vendors2,
    "number_of_contracts",
    "vendor",
    "vendor",
    "Vendors Awarded The Most Contracts",
)

### Organizations that hold the most contracts

In [None]:
contract_holders = (
    contracts.contract_holder.value_counts()
    .to_frame()
    .reset_index()
    .rename(columns={"index": "holders", "contract_holder": "# contracts"})
    .head(10)
)

In [None]:
charts.basic_bar_chart(
    contract_holders,
    "# contracts",
    "holders",
    "holders",
    "Organizations with the Most Contracts",
)

### Looking at contracts and their elements with end dates from May 2022 to 2024.
* Of the contracts ending, most have an element of vehicle locations and arrival predictions.

In [None]:
filtered_for_end_date.end_date.describe()

In [None]:
# Keep only end dates from now until 2024.
end_dates = filtered_for_end_date.loc[
    (filtered_for_end_date["end_date"] >= "2022-05-03")
]

In [None]:
len(end_dates)

In [None]:
end_dates[
    [
        "contract_holder",
        "functional_category",
        "contract_type",
        "start_date",
        "end_date",
        "duration_of_contract_year",
    ]
].sort_values("end_date")

## Export

with pd.ExcelWriter(f"{GCS_FILE_PATH}transit_stacks.xlsx") as writer:
    products.to_excel(writer, sheet_name="products_clean", index=False)
    popular_products.to_excel(writer, sheet_name="popular_products", index=False)
    components.to_excel(writer, sheet_name="components_clean", index=False)
    contracts.to_excel(writer, sheet_name="contracts_clean", index=False)
    contracts2.to_excel(writer, sheet_name="contracts_delineated", index=False)