# Service Components

***

In [1]:
import numpy as np
import pandas as pd

pd.options.display.max_columns = 50
pd.options.display.max_rows = 250
pd.set_option("display.max_colwidth", None)
pd.options.display.float_format = "{:.2f}".format

from collections import OrderedDict
from itertools import chain
from os import path

import altair as alt
import utils
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from calitp import *
from PIL import Image
from shared_utils import altair_utils
from siuba import *

GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/transit_stacks/"



In [2]:
service_components = to_snakecase(
    pd.read_csv(f"{GCS_FILE_PATH}service-components-Market Analysis.csv")
)

In [3]:
len(service_components)

1032

In [4]:
service_components.dtypes

name                                                object
services                                            object
component                                           object
vendor__from_product_                               object
parent_organization__from_vendor___from_product_    object
provider__from_services_                            object
product                                             object
product_components                                  object
dtype: object

In [5]:
#Checking N/As 
service_components.isna().sum()

name                                                  0
services                                              5
component                                             2
vendor__from_product_                                82
parent_organization__from_vendor___from_product_    785
provider__from_services_                              5
product                                               2
product_components                                   59
dtype: int64

In [6]:
# Fill in N/A and rename cols to be a little clearer.
service_components = service_components.fillna("N/A").rename(
    columns={
        "component": "components_orgs_actually_use",
        "product_components": "all_components_in_product",
    }
)

In [7]:
service_components.head(2)

Unnamed: 0,name,services,components_orgs_actually_use,vendor__from_product_,parent_organization__from_vendor___from_product_,provider__from_services_,product,all_components_in_product
0,"GET Bus, GET Bus On Demand-Genfare Farebox (Unspecified)-Cash Farebox","GET Bus,GET Bus On Demand",Cash Farebox,Genfare,,"Golden Empire Transit District, Golden Empire Transit District",Genfare Farebox (Unspecified),Cash Farebox
1,"GET Bus, GET Bus On Demand-Token Transit Mobile Ticketing-Mobile ticketing","GET Bus,GET Bus On Demand",Mobile ticketing,Token Transit,,"Golden Empire Transit District, Golden Empire Transit District",Token Transit Mobile Ticketing,Mobile ticketing


In [8]:
#Drop Duplicates
service_components = service_components.drop_duplicates()

In [9]:
#Lenght went down from 1032 to 1023...
len(service_components)

1023

## Are there vendors that offer GTFS & Real Time that nobody is using?
***

In [10]:
#Filter out products that have GTFS and Scheduling in service components dataset 
gtfs_rt = service_components[
    service_components["all_components_in_product"].str.contains("GTFS", case = False)
    & service_components["all_components_in_product"].str.contains("real-time", case = False)
]

In [11]:
#Read in products data set 
FILE_NAME = "transit_stacks.xlsx"
products = pd.read_excel(f"{GCS_FILE_PATH}{FILE_NAME}", sheet_name="products_clean")

In [12]:
#Filter out products that have GTFS and Scheduling in products dataset 
products_gtfs_rt = products[
    products["components"].str.contains("GTFS", case = False)
    & products["components"].str.contains("real-time", case = False)
]

In [13]:
#Grab unique vendors and chane it to a set
products_vendors = set(products_gtfs_rt.vendor.unique().tolist())
products_vendors

{'GMV Syncromatics Inc', 'Swiftly Inc.', 'TripShot Inc.', 'Uber Inc.'}

In [14]:
#Grab unique vendors and chane it to a set
service_comps_vendors = set(gtfs_rt.vendor__from_product_.unique().tolist())
service_comps_vendors

{'DoubleMap Inc.,Nixle',
 'GMV Syncromatics Inc',
 'Optibus,GMV Syncromatics Inc',
 'Swiftly Inc.',
 'Token Transit,Uber Inc.',
 'TripShot Inc.',
 'Uber Inc.',
 'Uber Inc.,Uber Inc.'}

In [15]:
products_vendors - service_comps_vendors

set()

## Summarizing the DataFrame
***
* It looks like some "organization" uses the same product from the same vendor for 1 + components. 
    * Example: Tuolumne County Transit uses Doublemap RealTime product for both real-time and arrival predictions. However, 'real-time' and 'arrival predictions' are split on 2 different lines, even though they reference the same product, vendor, and customer.
* Group/summarize so the same service-vendor-product combination are all on the same line and we can perhaps avoid duplicative counting.

In [16]:
service_components2 = service_components.copy()

In [17]:
service_components2["components_usage"] = service_components2.groupby(["services", "vendor__from_product_", "product"])["components_orgs_actually_use"].transform(
    lambda x: ",".join(x)
)

In [18]:
service_components3 = service_components2.drop_duplicates(subset = ['services', 'product','vendor__from_product_','components_usage'])

In [19]:
service_components3 = service_components3[['services','vendor__from_product_',
                                           'provider__from_services_','parent_organization__from_vendor___from_product_','product', 'all_components_in_product', 'components_usage']]

In [20]:
service_components3.sample()

Unnamed: 0,services,vendor__from_product_,provider__from_services_,parent_organization__from_vendor___from_product_,product,all_components_in_product,components_usage
324,Porterville Transit,Uber Inc.,Tulare County Regional Transit Agency,,RouteMatch - In-Vehicle Technology,"Fare card system,Mobile ticketing","Arrival predictions,AVL Software,Mobile data terminal software,Annunciator,Interior signage,APC Software"


In [21]:
len(service_components3)

737

## Components
***

* To do: column that the components usage actually matches the components a product offers.

### How many components are orgs using versus how many components a product actually offers.  
***

In [22]:
#Number of components an organization uses that product for (clean this up with assign later)
#https://stackoverflow.com/questions/30202011/how-can-i-count-comma-separated-values-in-one-column-of-my-panda-table
service_components3['number_of_components_used'] = service_components3['components_usage'].apply(lambda x: len(x.split(','))).astype('int64')
#Number of components available in the product
service_components3['number_of_components_avail'] = service_components3['all_components_in_product'].apply(lambda x: len(x.split(','))).astype('int64')

#Dividing based
service_components3['percent_of_components_used'] = service_components3['number_of_components_used']/service_components3['number_of_components_avail']

In [23]:
#Lots of duplicates in  provider__from_services_ column
#https://stackoverflow.com/questions/47316783/python-dataframe-remove-duplicate-words-in-the-same-cell-within-a-column-in-pyt
service_components3['provider__from_services_'] = (service_components3['provider__from_services_'].str.split()
                              .apply(lambda x: OrderedDict.fromkeys(x).keys())
                              .str.join(' '))


In [24]:
service_components3.sample(3)

Unnamed: 0,services,vendor__from_product_,provider__from_services_,parent_organization__from_vendor___from_product_,product,all_components_in_product,components_usage,number_of_components_used,number_of_components_avail,percent_of_components_used
197,Arcadia Transit,Microsoft,City of Arcadia,,Excel,General Purpose Software,Scheduling (Fixed-route),1,1,1.0
304,"VTA Bus,VTA ACE Shuttles,VTA Light Rail,VTA Rapid Bus,VTA Express Bus,VTA School Tripper",Swiftly Inc.,"Santa Clara Valley Transportation Authority, Authority",,Swiftly Transitime,"Real-time info,Arrival predictions,Alerts Content Management System,Social Alerts,Alerts Subscription Service,GTFS Alerts Publication",Arrival predictions,1,6,0.17
989,Herky Streetcar,DoubleMap Inc.,Sacramento State University,TransLoc,DoubleMap RealTime,"Real-time info,Mobile trip planning app",Real-time info,1,2,0.5


## Frequency Analysis
***

In [35]:
service_components3.groupby(['product']).agg({'services':'count'}).sort_values('services', ascending = False).head(10)

Unnamed: 0_level_0,services
product,Unnamed: 1_level_1
Genfare Farebox (Unspecified),51
Trillium GTFS Manager,40
Excel,37
Cubic NextBus Suite,21
Swiftly Transitime,21
DoubleMap RealTime,18
GMV/Syncromatics Sync,18
GMV/Syncromatics Dispatch,17
Cashbox,17
Token Transit Mobile Ticketing,15
