# Anomalies in Data

In [1]:
import pandas as pd
from siuba import *

import numpy as np

import ipywidgets as widgets
from ipywidgets import *
from IPython.display import Markdown
from IPython.core.display import display

In [2]:
# Load the obligations extract; the 'Unnamed: 0' column (presumably a saved index) is discarded.
df = pd.read_csv(
    'gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/function_data.csv',
    low_memory=False,
).drop(columns='Unnamed: 0')



## Looking into MPOs

Using this map to see where each agency is in relation to the nearest MPO: https://gisdata-caltrans.opendata.arcgis.com/datasets/b3e0ef95520843ba8c1d3b9c0fa9a607_0/explore

In [3]:
# Number of distinct MPO labels in the data. Sorting the counts first is
# unnecessary here because ordering does not change the number of rows.
print(len(df >> count(_.mpo)))

31


* According to the Caltrans GIS data, there are 18 MPOs

In [4]:
# Number of obligations recorded under each MPO label, largest first
(
    df
    >> count(_.mpo)
    >> arrange(-_.n)
)

Unnamed: 0,mpo,n
18,SCAG,4988
12,MTC,3236
13,NON-MPO,2592
14,SACOG,1865
2,CFCG,1404
8,KCOG,998
26,STANCOG,759
0,AMBAG,745
21,SJCG,515
29,TCAG,489


### Which MPOs have the fewest obligations? Which agencies do they include?

In [None]:
# MPO labels that appear on exactly one row
(
    df
    >> count(_.mpo)
    >> arrange(-_.n)
    >> filter(_.n == 1)
)

#### MPO SANDA

In [None]:
# Rows whose MPO label is exactly 'SANDA'
df >> filter(_.mpo == 'SANDA')

* maybe meant "SANDAG" but Big Bear Lake is in "SCAG" jurisdiction
* Checked the Excel download file, and there is another Big Bear Lake and SANDAG instance

In [None]:
# Big Bear agencies whose MPO label contains 'SANDA' (matches both 'SANDA' and 'SANDAG')
(
    df
    >> filter(_.agency.str.contains('Big Bear'))
    >> filter(_.mpo.str.contains("SANDA"))
)

Google Maps shows that the `project_location` intersections are in the jurisdiction of Big Bear, not in SANDAG

#### MPO SHASTA

In [None]:
# Rows whose MPO label is exactly 'SHASTA'
df >> filter(_.mpo == 'SHASTA')

In [None]:
# MPO labels used by agencies with 'Shasta' in their name, with row counts
(
    df
    >> filter(_.agency.str.contains('Shasta'))
    >> count(_.mpo)
)

In [None]:
# Rows for agencies with 'Shasta' in their name that are labelled with the MTC MPO
(
    df
    >> filter(_.agency.str.contains('Shasta'))
    >> filter(_.mpo == "MTC")
)

* In the project location column, the intersection is Tustin and Mayfair, which is in Orange County, or SCAG. The agency is also Orange, yet the MPO is Shasta

#### MPO MNOLTC and Agencies under

In [None]:
# Rows whose MPO label is exactly 'MNOLTC'
df >> filter(_.mpo == 'MNOLTC')

In [None]:
# Check which MPO labels Rialto appears under; it should only be in SCAG
(
    df
    >> filter(_.agency == 'Rialto')
    >> count(_.mpo)
)

In [None]:
# Rialto rows labelled with the KCOG MPO
(
    df
    >> filter(_.agency == 'Rialto')
    >> filter(_.mpo == 'KCOG')
)

In [None]:
# Rialto rows labelled with the SANDAG MPO
(
    df
    >> filter(_.agency == 'Rialto')
    >> filter(_.mpo == 'SANDAG')
)

#### FSTIP

In [None]:
# Rows whose MPO label is exactly 'FSTIP'
df >> filter(_.mpo == 'FSTIP')

* FSTIP is the Federal Statewide Transportation Improvement Program. 

### Number of Unique Agencies in Each MPO

In [None]:

# Count of distinct agency names under each MPO label, largest first
(
    df
    >> group_by(_.mpo)
    >> summarize(n=_.agency.nunique())
    >> arrange(-_.n)
)

In [None]:
# Agencies listed under SCAG, ordered by how many rows each has
(
    df
    >> filter(_.mpo == "SCAG")
    >> count(_.agency)
    >> arrange(-_.n)
)

## Function for Agencies in each MPO

Creating a function to look into each MPO. What agencies are listed under the MPO name? Are the projects located in that jurisdiction?

In [None]:
@interact
def find_agencies(place=df.mpo.sort_values().unique().tolist()):
    """Show the agencies recorded under one MPO label and their row counts.

    The list default of ``place`` is handed to ``@interact``, which offers the
    sorted unique values of ``df.mpo`` as choices and calls this function
    again whenever the selection changes.

    Parameters
    ----------
    place
        One of the values found in ``df.mpo``; rows are selected where
        ``df.mpo == place``.

    Returns
    -------
    None
        The heading and the per-agency count table are displayed, not returned.
    """
    # Raise the row limit *before* rendering; setting it after the display
    # calls left the first table truncated at the pandas default. The option
    # is global, so it stays in effect for later cells as well.
    pd.set_option('display.max_rows', 500)

    mpos = df[df.mpo == place]
    display(Markdown(f"**Agencies in {place}**"))
    display(mpos >> count(_.agency) >> arrange(-_.n))

In [None]:
# Using siuba's filter, we can start to look at the agencies under different MPOs

In [None]:
# Marin County rows labelled with the SCAG MPO
(
    df
    >> filter(_.agency == 'Marin County')
    >> filter(_.mpo == 'SCAG')
)

* Project is located in Lucas Valley, which is not in SCAG

In [None]:
# MPO labels attached to Sonora's rows, with row counts
(
    df
    >> filter(_.agency == 'Sonora')
    >> count(_.mpo)
)

In [None]:
# Sonora rows labelled with the SCAG MPO
(
    df
    >> filter(_.agency == 'Sonora')
    >> filter(_.mpo == 'SCAG')
)

In [None]:
# Shasta County rows labelled with the MTC MPO
(
    df
    >> filter(_.agency == 'Shasta County')
    >> filter(_.mpo == 'MTC')
)

In [None]:
# Marina rows labelled with the AMBAG MPO
(
    df
    >> filter(_.agency == 'Marina')
    >> filter(_.mpo == 'AMBAG')
)

In [None]:
# Kern County (District 9) rows labelled with the BCAG MPO
(
    df
    >> filter(_.agency == 'Kern County (District 9)')
    >> filter(_.mpo == 'BCAG')
)

In [None]:
# Santa Barbara County rows labelled with the BCAG MPO
(
    df
    >> filter(_.agency == 'Santa Barbara County')
    >> filter(_.mpo == 'BCAG')
)

In [None]:
# Bishop rows labelled with the KCOG MPO
(
    df
    >> filter(_.agency == 'Bishop')
    >> filter(_.mpo == 'KCOG')
)

In [None]:
# Los Angeles rows labelled with the MCAG MPO
(
    df
    >> filter(_.agency == 'Los Angeles')
    >> filter(_.mpo == 'MCAG')
)

In [None]:
# MPO labels attached to Los Angeles rows, with row counts
(
    df
    >> filter(_.agency == 'Los Angeles')
    >> count(_.mpo)
)

In [None]:
# Indio rows labelled with the MCTC MPO
(
    df
    >> filter(_.agency == 'Indio')
    >> filter(_.mpo == 'MCTC')
)

In [None]:
# Los Angeles rows labelled with the SJCOG MPO
(
    df
    >> filter(_.agency == 'Los Angeles')
    >> filter(_.mpo == 'SJCOG')
)

### MPOs with all correct agencies

* AMBAG
* KCAG
* SBCG
* TMPO

### MPOs with some misplaced agencies

* BCAG
* MTC
* SCAG
* KGOC
* MNOLTC
* FCOG 
* KCOG
* MCAG
* SJCOG
* STANCOG
* MCTC
* SACOG
* SJCOG
* STANCOG 
* TCAG


### MPOs not defined / all incorrect  
* CFCG - has Fresno Counties. Fresno is part of FCOG
* COFCG - combination of multiple MPOs
* FCOG - includes Ventura and Fresno
* FSTIP - a prefix rather than an MPO
* MNOLTC - one entry
* SANDA - one entry
* SDAG - maybe SANDAG
* SANDAG - mostly SCAG Agencies
* SHASTA - one agency not in Shasta County
* SJCG - most likely SJCOG
* SLAC - maybe SLOCOG
* STACOG - STANCOG
* STNCOG - STANCOG
* STPA - maybe SRTA


## Other

### Agency Name 'YRTS'

In [None]:
# The original cell ended on a dangling `>>`, which is a SyntaxError and stops
# Restart & Run All. Completed as a search for agency names that look like
# 'YRTS', ignoring case and treating missing names as non-matches.
# NOTE(review): the intended query is not recoverable from the notebook — confirm.
(
    df
    >> filter(_.agency.str.contains('yrts', case=False, na=False))
    >> count(_.agency)
)

In [None]:
# Rows whose agency name is exactly 'Yrts'
df >> filter(_.agency == 'Yrts')

* 'Yrts' is actually ['Yarts'](https://yarts.com/), or Yosemite Area Regional Transportation System 

## Non-MPO Agencies
Looking at 'Non-MPO', 'ER NONE' and 'NaN' (or null)

#### Is Null

In [None]:
# Rows with no MPO recorded
df >> filter(_.mpo.isnull())

#### Is ER NONE

In [None]:
# Agencies whose MPO is recorded as 'ER NONE', with row counts
(
    df
    >> filter(_.mpo == 'ER NONE')
    >> count(_.agency)
)

In [None]:
# 'ER NONE' rows belonging to Ridgecrest
(
    df
    >> filter(_.mpo == 'ER NONE')
    >> filter(_.agency == 'Ridgecrest')
)

In [None]:
# Row counts per `dist` value among 'ER NONE' rows
(
    df
    >> filter(_.mpo == 'ER NONE')
    >> count(_.dist)
)

In [None]:
# 'ER NONE' rows where `dist` equals 8
(
    df
    >> filter(_.mpo == 'ER NONE')
    >> filter(_.dist == 8)
)

#### Is Non-MPO

In [None]:
# Agencies recorded as 'NON-MPO', ordered by how many rows each has
(
    df
    >> filter(_.mpo == 'NON-MPO')
    >> count(_.agency)
    >> arrange(-_.n)
)

* 