In [90]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib as plt

In [91]:
# Store large csv file into dataframe
filepath = 'logistics_ppe.csv'
ppe_data = pd.read_csv(filepath)
ppe_data.head()

Unnamed: 0,county,product_family,quantity_filled,shipping_zip_postal_code,as_of_date
0,Non-Governmental Entity,Surgical Masks,,92064,2020-08-12
1,Non-Governmental Entity,Hand Sanitizers,,92064,2020-08-12
2,San Diego,Test Kits,,92123,2020-08-12
3,San Diego,Coveralls (Hospitals or EMS),,92123,2020-08-12
4,San Diego,Coveralls (Hospitals or EMS),,92123,2020-08-12


In [92]:
# Rename data columns
ppedf = pd.DataFrame()

ppedf = ppe_data.rename(columns = {'county':'County', 
                                      'product_family':'Product',
                                     'quantity_filled': 'Amount Fulfilled',
                                     'shipping_zip_postal_code': 'Postal',
                                     'as_of_date': 'Date'
                                     })
ppedf.head()

Unnamed: 0,County,Product,Amount Fulfilled,Postal,Date
0,Non-Governmental Entity,Surgical Masks,,92064,2020-08-12
1,Non-Governmental Entity,Hand Sanitizers,,92064,2020-08-12
2,San Diego,Test Kits,,92123,2020-08-12
3,San Diego,Coveralls (Hospitals or EMS),,92123,2020-08-12
4,San Diego,Coveralls (Hospitals or EMS),,92123,2020-08-12


In [93]:
# Drop Unnecessary rows/columns

# Drop Postal column
ppedf.drop(['Postal'], axis = 1, inplace = True)

# Drop NaN rows
ppedf = ppedf.dropna()

# Drop rows with 0 fulfilled equipment
ppedf = ppedf[ppedf['Amount Fulfilled'] > 0]
ppedf

Unnamed: 0,County,Product,Amount Fulfilled,Date
28,Fresno,N-95 Respirators,17400.0,2020-08-12
29,Los Angeles,Hand Sanitizers,216.0,2020-08-12
30,Los Angeles,Surgical Masks,2000.0,2020-08-12
31,Los Angeles,Cloth Masks,22500.0,2020-08-12
32,Los Angeles,Face Shields (Disposable),200.0,2020-08-12
...,...,...,...,...
3398257,San Mateo,N-95 Respirators,35000.0,2020-09-26
3399269,Kern,Examination Gloves,20000.0,2020-09-26
3399270,Kern,Examination Gloves,10000.0,2020-09-26
3399271,Kern,Surgical Masks,20000.0,2020-09-26


In [94]:
# Change Date column to be by month
ppedf['Date'] = pd.DatetimeIndex(ppedf['Date']).month
ppedf

Unnamed: 0,County,Product,Amount Fulfilled,Date
28,Fresno,N-95 Respirators,17400.0,8
29,Los Angeles,Hand Sanitizers,216.0,8
30,Los Angeles,Surgical Masks,2000.0,8
31,Los Angeles,Cloth Masks,22500.0,8
32,Los Angeles,Face Shields (Disposable),200.0,8
...,...,...,...,...
3398257,San Mateo,N-95 Respirators,35000.0,9
3399269,Kern,Examination Gloves,20000.0,9
3399270,Kern,Examination Gloves,10000.0,9
3399271,Kern,Surgical Masks,20000.0,9


In [95]:
# use groupby, combine rows that have the same County, Product, and Month, and Sum the amount fulfilled
ppedf.groupby(['County', 'Product', 'Date'])['Amount Fulfilled'].sum().to_frame().reset_index()
ppedf

Unnamed: 0,County,Product,Amount Fulfilled,Date
28,Fresno,N-95 Respirators,17400.0,8
29,Los Angeles,Hand Sanitizers,216.0,8
30,Los Angeles,Surgical Masks,2000.0,8
31,Los Angeles,Cloth Masks,22500.0,8
32,Los Angeles,Face Shields (Disposable),200.0,8
...,...,...,...,...
3398257,San Mateo,N-95 Respirators,35000.0,9
3399269,Kern,Examination Gloves,20000.0,9
3399270,Kern,Examination Gloves,10000.0,9
3399271,Kern,Surgical Masks,20000.0,9


In [96]:
# Sort the values alphabetically by County
ppedf = ppedf.sort_values('County')
ppedf

Unnamed: 0,County,Product,Amount Fulfilled,Date
2991588,Alameda,KN95 Respirators,1260.0,10
1971415,Alameda,Cloth Masks,31500.0,9
1971414,Alameda,N-95 Respirators,960.0,9
1224416,Alameda,Examination Gloves,50000.0,9
2917184,Alameda,N-95 Respirators,1260.0,10
...,...,...,...,...
2485234,Yuba,Pharmaceuticals,6.0,9
2831114,Yuba,Viral Testing Media,1600.0,9
2160369,Yuba,Hand Sanitizers,64.0,10
799613,Yuba,Viral Testing Media,500.0,8


In [97]:
# Check value counts of County column to see which counties to exclude
ppedf['County'].value_counts()

State Agency               368301
Non-Governmental Entity    351948
Los Angeles                232215
State Agency or Other      100992
Sacramento                  71862
                            ...  
Glenn                        4874
Sierra                       2677
Alpine                       2341
Governmental Entity           290
Other                         122
Name: County, Length: 64, dtype: int64

In [98]:
# Drop data that does not belong to any county
countyToDrop = ppedf[(ppedf['County'] == 'Governmental Entity') | 
                     (ppedf['County'] == 'State Agency') |
                    (ppedf['County'] == 'Non-Governmental Entity') |
                    (ppedf['County'] == 'State Agency or Other') |
                    (ppedf['County'] == 'Other')
                    
                    ].index
ppedf.drop(countyToDrop, inplace = True)
ppedf['County'].value_counts()

Los Angeles        232215
Sacramento          71862
Orange              66331
San Diego           58801
Alameda             56324
Riverside           46660
San Joaquin         45770
San Bernardino      44867
Monterey            42884
Fresno              39335
Tulare              39229
Santa Clara         32912
Imperial            32335
Stanislaus          31244
San Francisco       29787
Contra Costa        29529
Sonoma              28328
San Mateo           27798
Kern                26919
Yolo                26832
Ventura             26778
Santa Cruz          26419
Santa Barbara       26090
Tribal              25316
Placer              24532
Humboldt            23931
Del Norte           23305
Napa                23229
Solano              21872
San Luis Obispo     20854
Lake                20642
Merced              19291
Butte               18883
Marin               18518
Inyo                16731
Kings               15928
Tuolumne            15775
Mono                14293
Mendocino   

In [103]:
# Reset the index
ppedf = ppedf.reset_index()
ppedf

Unnamed: 0,index,County,Product,Amount Fulfilled,Date
0,2991588,Alameda,KN95 Respirators,1260.0,10
1,1971415,Alameda,Cloth Masks,31500.0,9
2,1971414,Alameda,N-95 Respirators,960.0,9
3,1224416,Alameda,Examination Gloves,50000.0,9
4,2917184,Alameda,N-95 Respirators,1260.0,10
...,...,...,...,...,...
1564703,2485234,Yuba,Pharmaceuticals,6.0,9
1564704,2831114,Yuba,Viral Testing Media,1600.0,9
1564705,2160369,Yuba,Hand Sanitizers,64.0,10
1564706,799613,Yuba,Viral Testing Media,500.0,8


In [101]:
# Output the final cleaned file to its csv
ppedf.to_csv('PPE_data_clean.csv')