# Clean up `data_prep.py`

* Get rid of `pd.set_option('display.max_columns', None)`, etc in functions, because those are more for notebooks. Not relevant when used in a function within a script because the script isn't printing outputs to see


In [1]:
import numpy as np
import pandas as pd
from siuba import *
from calitp_data_analysis.sql import to_snakecase
from calitp_data_analysis import geography_utils

GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/5311 /"



## Original Function

* Pretty long, but does have sub-functions
* Break up into more distinct steps. Longer functions mean it's hard to debug, because it's you're probably writing, overwriting the same df, or creating copies `df`, `df1`, `df2`. Using functions to break that up allows you to return each df in each step, but you don't save extraneous copies.

In [2]:
import data_prep

vehicles_orig = data_prep.load_cleaned_vehiclesdata()
vehicles_orig.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


(218, 26)

In [3]:
len(vehicles_orig)

218

In [4]:
vehicles_orig.columns

Index(['agency', 'ntd_id', 'reporter_type', '_0_9', '_10_12', '_13_15',
       '_16_20', '_21_25', '_26_30', '_31_60', '_60plus', 'total_vehicles',
       'average_age_of_fleet__in_years_', 'average_lifetime_miles_per_vehicle',
       'sum_15plus', 'Automobiles', 'Bus', 'Other', 'Service', 'Train', 'Van',
       'automobiles_door', 'bus_doors', 'train_doors', 'van_doors',
       'doors_sum'],
      dtype='object')

## Refactored

In [5]:
def load_vehicle_data():
    ## TO FIX
    # If you don't put brackets around sheet_name = ['Age Distribution'], 
    # you already return the df
    File_Vehicles =  "cleaned_vehicles.xlsx"
    vehicles_info =  pd.read_excel(f'{GCS_FILE_PATH}{File_Vehicles}',
                                   sheet_name = 'Age Distribution')
    #cannot use to_snakecase because of integer column names
    vehicles = (vehicles_info>>filter(_.State=='CA'))
    
    return vehicles

In [6]:
def get_vehicle_groups(row):
    Automobiles = ['Automobile','Sports Utility Vehicle']
    Bus = ['Bus','Over-the-road Bus','Articulated Bus','Double Decker Bus','Trolleybus']
    Vans = ['Van','','Minivan','Cutaway']
    Trains = ['Vintage Trolley','Automated Guideway Vehicle','Heavy Rail Passenger Car','Light Rail Vehicle',
             'Commuter Rail Self-Propelled Passenger Car','Commuter Rail Passenger Coach','Commuter Rail Locomotive',
            'Cable Car']
    Service = ['Automobiles (Service)',
               'Trucks and other Rubber Tire Vehicles (Service)',
               'Steel Wheel Vehicles (Service)']
    other = ['Other','Ferryboat']
    
    if row.vehicle_type in Automobiles:
        return "Automobiles"
    elif row.vehicle_type in Bus:
        return "Bus"
    elif row.vehicle_type in Trains:
        return "Train"
    elif row.vehicle_type in Vans:
        return "Van"
    elif row.vehicle_type in Service:
        return "Service"
    else:
        return "Other"

In [7]:
def initial_cleaning(df):    
    #Add up columns 0-9 to get a new bin
    zero_to_nine = [0,1,2,3,4,5,6,7,8,9]
    ten_to_twelve = [10, 11, 12]
    
    df['0-9'] = df[zero_to_nine].sum(axis=1)
    #Add up columns 10-12
    df['10-12'] = df[ten_to_twelve].sum(axis=1)
    
    ## TO FIX
    # Method chaining, basically stringing or chaining together a bunch of commands
    # so it's a bit neater, and also it does it in one go
    df2 = df.drop(columns = zero_to_nine + ten_to_twelve)
    df2 = (to_snakecase(df2)
           .astype({"ntd_id": str}) 
           .rename(columns = {"_60+": "_60plus"})
          )
    
    df2["vehicle_groups"] = df2.apply(lambda x: get_vehicle_groups(x), axis=1)
    
    return df2

In [8]:
# Use lists when there's the same set of columns you want to work with repeatedly
# Break it up into several lists if need be
# Whether lists live outside functions or inside functions depends if you need to call them again
age_under_15 = ["_0_9","_10_12", "_13_15"]
age_over_15 = ["_16_20", "_21_25","_26_30", "_31_60","_60plus"]

def get_age(df):
    # Moved this renaming into initial_cleaning function
    #df = df.rename(columns={'_60+': '_60plus'})

    age = geography_utils.aggregate_by_geography(
        df, 
        group_cols = ["agency", "ntd_id", "reporter_type"],
        sum_cols = ["total_vehicles"] + age_under_15 + age_over_15,
        mean_cols = ["average_age_of_fleet__in_years_", "average_lifetime_miles_per_vehicle"]
    ).sort_values(["agency","total_vehicles"], ascending=[True, True])
    
    older = (age.query('_21_25 != 0 or _26_30 != 0 or _31_60 != 0 or _60plus!=0'))
    older = older.assign(
        sum_15plus = older[age_over_15].sum(axis=1)
    )
    
    age = pd.merge(age, 
                   older>>select(_.agency, _.sum_15plus), 
                   on=['agency'], how='left')
    
        
    return age

def get_doors(df):
    
    types = df[["agency", "vehicle_groups"] + age_under_15 + age_over_15]
    types['sum_type'] = types[age_under_15 + age_over_15].sum(axis=1)
    
    ## At this point, the df is long (agency-vehicle_groups)
    
    #https://towardsdatascience.com/pandas-pivot-the-ultimate-guide-5c693e0771f3
    types2 = (types.pivot_table(index=["agency"],
                               columns="vehicle_groups", 
                               values="sum_type", aggfunc=np.sum, fill_value=0)
            ).reset_index()

    two_doors = ['Automobiles', 'Bus', 'Train']
    one_door = ['Van']
    door_cols = []
    
    for c in one_door + two_doors:
        # Create a new column, like automobile_door
        new_col = f"{c.lower()}_doors"
    
        # While new column is created, add it to list (door_cols)
        # Then, can easily sum across
        door_cols.append(new_col)
        
        if c in two_doors:
            multiplier = 2
        elif c in one_door:
            multiplier = 1
        types2[new_col] = types2[c] * multiplier
    
    types2["doors_sum"] = types2[door_cols].sum(axis=1)
    
    return types2

In [9]:
def clean_vehicles_data():
    vehicles = load_vehicle_data()
    vehicles2 = initial_cleaning(vehicles)

    # Use lists when there's the same set of columns you want to work with repeatedly
    # Break it up into several lists if need be
    # Whether lists live outside functions or inside functions depends if you need to call them again
    age_under_15 = ["_0_9","_10_12", "_13_15"]
    age_over_15 = ["_16_20", "_21_25","_26_30", "_31_60","_60plus"]
    
    # The lists above should live closer to the sub-functions they belong to
    
    # This df is aggregated at agency-level
    age_df = get_age(vehicles2)
    # This df is aggregated at agency-vehicle_group level 
    # but, pivoted to be agency-level
    doors_df = get_doors(vehicles2)
    
    df = pd.merge(
        age_df,
        doors_df,
        on = ["agency"],
        how = "left",
        validate = "1:1"
    )
    
    # Rename for now, because this might affect downstream stuff
    df = df.rename(columns = {"automobiles_doors": "automobiles_door"})    
    return df

In [10]:
df = clean_vehicles_data()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [11]:
set(df.columns).difference(set(vehicles_orig.columns))

set()

In [12]:
assert set(df.columns) == set(vehicles_orig.columns)

In [13]:
assert df.shape == vehicles_orig.shape

In [14]:
def pick_column_and_aggregate(df1, df2, col):
    if df1[col].sum() == df2[col].sum():
        print("PASS")
    else:
        print(f"{col}: FAIL")

In [15]:
cols = ['_0_9', '_10_12', '_13_15',
       '_16_20', '_21_25', '_26_30', '_31_60', '_60plus', 'total_vehicles',
       'average_age_of_fleet__in_years_', 'average_lifetime_miles_per_vehicle',
       'sum_15plus', 'Automobiles', 'Bus', 'Other', 'Service', 'Train', 'Van',
       'automobiles_door', 'bus_doors', 'train_doors', 'van_doors',
       'doors_sum']

for c in cols:
    pick_column_and_aggregate(df, vehicles_orig, c)

PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
