In [1]:
# Import dependencies
from pathlib import Path

import matplotlib as plt
import numpy as np
import pandas as pd

In [2]:
# Read the PPE logistics CSV into a DataFrame and preview its first rows
filepath = 'Resources/logistics_ppe.csv'
ppe_data = pd.read_csv(filepath_or_buffer=filepath)
ppe_data.head(5)

Unnamed: 0,county,product_family,quantity_filled,shipping_zip_postal_code,as_of_date
0,Non-Governmental Entity,Surgical Masks,,92064,2020-08-12
1,Non-Governmental Entity,Hand Sanitizers,,92064,2020-08-12
2,San Diego,Test Kits,,92123,2020-08-12
3,San Diego,Coveralls (Hospitals or EMS),,92123,2020-08-12
4,San Diego,Coveralls (Hospitals or EMS),,92123,2020-08-12


In [3]:
# Rename data columns to human-readable labels.
# rename() returns a new DataFrame, so ppe_data keeps its original column
# names and no empty placeholder frame is needed beforehand.
ppedf = ppe_data.rename(columns={
    'county': 'County',
    'product_family': 'Product',
    'quantity_filled': 'Amount Fulfilled',
    'shipping_zip_postal_code': 'Postal',
    'as_of_date': 'Date',
})
ppedf.head()

Unnamed: 0,County,Product,Amount Fulfilled,Postal,Date
0,Non-Governmental Entity,Surgical Masks,,92064,2020-08-12
1,Non-Governmental Entity,Hand Sanitizers,,92064,2020-08-12
2,San Diego,Test Kits,,92123,2020-08-12
3,San Diego,Coveralls (Hospitals or EMS),,92123,2020-08-12
4,San Diego,Coveralls (Hospitals or EMS),,92123,2020-08-12


In [4]:
# Drop unnecessary rows/columns

# Drop the Postal column. errors='ignore' keeps this cell re-runnable: an
# in-place drop raises KeyError on a second run because the column is gone.
ppedf = ppedf.drop(columns=['Postal'], errors='ignore')

# Drop rows that have a missing value in any of the remaining columns
ppedf = ppedf.dropna()

# Keep only rows with a positive fulfilled amount
# (rows with zero or negative amounts are removed)
ppedf = ppedf[ppedf['Amount Fulfilled'] > 0]
ppedf.head()

Unnamed: 0,County,Product,Amount Fulfilled,Date
28,Fresno,N-95 Respirators,17400.0,2020-08-12
29,Los Angeles,Hand Sanitizers,216.0,2020-08-12
30,Los Angeles,Surgical Masks,2000.0,2020-08-12
31,Los Angeles,Cloth Masks,22500.0,2020-08-12
32,Los Angeles,Face Shields (Disposable),200.0,2020-08-12


In [5]:
# Collapse every date to the name of its month.
# The column is parsed to datetime and formatted in a single expression;
# strftime('%B') yields the full month name (e.g. 'August'), so the day and
# the year are discarded from here on.
ppedf['Date'] = pd.to_datetime(ppedf['Date']).dt.strftime('%B')
ppedf.head()

Unnamed: 0,County,Product,Amount Fulfilled,Date
28,Fresno,N-95 Respirators,17400.0,August
29,Los Angeles,Hand Sanitizers,216.0,August
30,Los Angeles,Surgical Masks,2000.0,August
31,Los Angeles,Cloth Masks,22500.0,August
32,Los Angeles,Face Shields (Disposable),200.0,August


In [6]:
# Total the fulfilled amount for each (County, Product, Date) combination,
# then turn the group keys back into ordinary columns.
agg_func = {'Amount Fulfilled': 'sum'}
ppedf = (
    ppedf
    .groupby(['County', 'Product', 'Date'])
    .agg(agg_func)
    .reset_index()
)
ppedf.head()

Unnamed: 0,County,Product,Date,Amount Fulfilled
0,Alameda,Cloth Masks,August,32805900.0
1,Alameda,Cloth Masks,July,8400300.0
2,Alameda,Cloth Masks,June,2872860.0
3,Alameda,Cloth Masks,October,24112800.0
4,Alameda,Cloth Masks,September,43823900.0


In [8]:
# Order the rows by County, then by Date.
# Date holds month names at this point, so the ordering is alphabetical
# (August, July, June, ...) rather than chronological.
ppedf = ppedf.sort_values(by=['County', 'Date'], ascending=True)
ppedf.head()

Unnamed: 0,County,Product,Date,Amount Fulfilled
0,Alameda,Cloth Masks,August,32805900.0
5,Alameda,Coveralls (Hospitals or EMS),August,187096.0
10,Alameda,Examination Gloves,August,102643800.0
15,Alameda,Face Shields (Disposable),August,5439884.0
20,Alameda,Goggles,August,369600.0


In [9]:
# Count the rows per County value to spot entries that should be excluded
ppedf.loc[:, 'County'].value_counts()

State Agency             119
San Diego                116
Orange                   113
Riverside                113
Tulare                   113
                        ... 
Sierra                    52
State Agency or Other     50
Tribal                    24
Governmental Entity       14
Other                      6
Name: County, Length: 64, dtype: int64

In [10]:
# Drop data that does not belong to any county.
# NOTE(review): 'Tribal' shows up in the value counts but is not in this
# list, so those rows are kept -- confirm that this is intended.
non_county_labels = [
    'Governmental Entity',
    'State Agency',
    'Non-Governmental Entity',
    'State Agency or Other',
    'Other',
]

# Index labels of every row whose County is one of the labels above
countyToDrop = ppedf[ppedf['County'].isin(non_county_labels)].index

# Reassign instead of dropping in place so the frame is not mutated under
# earlier references to it
ppedf = ppedf.drop(countyToDrop)
ppedf['County'].value_counts()

San Diego          116
Riverside          113
Orange             113
Tulare             113
Stanislaus         112
Inyo               112
Los Angeles        109
Alameda            105
Contra Costa       104
Sacramento         104
Santa Clara        103
Monterey            99
Santa Cruz          98
San Mateo           98
San Joaquin         98
Fresno              98
Santa Barbara       97
Imperial            96
San Bernardino      93
San Francisco       92
Sonoma              91
Solano              91
Humboldt            89
Marin               89
Yolo                87
San Luis Obispo     87
Merced              87
Butte               86
Placer              86
Ventura             86
Mendocino           84
Calaveras           84
Mono                84
San Benito          84
Kings               84
Lake                83
Nevada              82
Napa                82
Madera              82
Kern                80
Amador              78
Del Norte           78
El Dorado           76
Yuba       

In [11]:
# Renumber the rows from 0 after the sorting and dropping above.
# drop=False (the default) keeps the previous index as a new 'index' column.
ppedf = ppedf.reset_index(drop=False)
ppedf.head()

Unnamed: 0,index,County,Product,Date,Amount Fulfilled
0,0,Alameda,Cloth Masks,August,32805900.0
1,5,Alameda,Coveralls (Hospitals or EMS),August,187096.0
2,10,Alameda,Examination Gloves,August,102643800.0
3,15,Alameda,Face Shields (Disposable),August,5439884.0
4,20,Alameda,Goggles,August,369600.0


In [12]:
# Drop the extraneous 'index' column left behind by reset_index().
# errors='ignore' keeps this cell re-runnable: an in-place drop raises
# KeyError on a second run because the column is already gone.
ppedf = ppedf.drop(columns=['index'], errors='ignore')
ppedf.head()

Unnamed: 0,County,Product,Date,Amount Fulfilled
0,Alameda,Cloth Masks,August,32805900.0
1,Alameda,Coveralls (Hospitals or EMS),August,187096.0
2,Alameda,Examination Gloves,August,102643800.0
3,Alameda,Face Shields (Disposable),August,5439884.0
4,Alameda,Goggles,August,369600.0


In [13]:
# Output the final cleaned file to its csv.
# Create the target folder first so the write does not fail when it is missing
# (for example on a fresh checkout).
output_path = Path('CleanCSV/PPE_data_clean.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): with the default index=True the row index is written as an
# unnamed first column; pass index=False if downstream readers do not expect it.
ppedf.to_csv(output_path)