In [16]:
# Import dependencies
import pandas as pd

import numpy as np
import matplotlib as plt

In [17]:
# Store large csv file into dataframe
filepath = 'Resources/logistics_ppe.csv'
ppe_data = pd.read_csv(filepath)
ppe_data.head()

Unnamed: 0,county,product_family,quantity_filled,shipping_zip_postal_code,as_of_date
0,Non-Governmental Entity,Surgical Masks,,92064,2020-08-12
1,Non-Governmental Entity,Hand Sanitizers,,92064,2020-08-12
2,San Diego,Test Kits,,92123,2020-08-12
3,San Diego,Coveralls (Hospitals or EMS),,92123,2020-08-12
4,San Diego,Coveralls (Hospitals or EMS),,92123,2020-08-12


In [18]:
# Rename data columns
ppedf = pd.DataFrame()

ppedf = ppe_data.rename(columns = {'county':'County', 
                                      'product_family':'Product',
                                     'quantity_filled': 'Amount Fulfilled',
                                     'shipping_zip_postal_code': 'Postal',
                                     'as_of_date': 'Date'
                                     })
ppedf.head()

Unnamed: 0,County,Product,Amount Fulfilled,Postal,Date
0,Non-Governmental Entity,Surgical Masks,,92064,2020-08-12
1,Non-Governmental Entity,Hand Sanitizers,,92064,2020-08-12
2,San Diego,Test Kits,,92123,2020-08-12
3,San Diego,Coveralls (Hospitals or EMS),,92123,2020-08-12
4,San Diego,Coveralls (Hospitals or EMS),,92123,2020-08-12


In [19]:
# Drop Unnecessary rows/columns

# Drop Postal column
ppedf.drop(['Postal'], axis = 1, inplace = True)

# Drop NaN rows
ppedf = ppedf.dropna()

# Drop rows with 0 fulfilled equipment
ppedf = ppedf[ppedf['Amount Fulfilled'] > 0]
ppedf

Unnamed: 0,County,Product,Amount Fulfilled,Date
28,Fresno,N-95 Respirators,17400.0,2020-08-12
29,Los Angeles,Hand Sanitizers,216.0,2020-08-12
30,Los Angeles,Surgical Masks,2000.0,2020-08-12
31,Los Angeles,Cloth Masks,22500.0,2020-08-12
32,Los Angeles,Face Shields (Disposable),200.0,2020-08-12
...,...,...,...,...
3398257,San Mateo,N-95 Respirators,35000.0,2020-09-26
3399269,Kern,Examination Gloves,20000.0,2020-09-26
3399270,Kern,Examination Gloves,10000.0,2020-09-26
3399271,Kern,Surgical Masks,20000.0,2020-09-26


In [20]:
# Change Date column to be by month

# Change Date to datetime format
ppedf['Date'] = pd.to_datetime(ppedf['Date'])

# Extract Month String from datetime object
ppedf['Date'] = ppedf['Date'].dt.strftime('%B')
ppedf

Unnamed: 0,County,Product,Amount Fulfilled,Date
28,Fresno,N-95 Respirators,17400.0,August
29,Los Angeles,Hand Sanitizers,216.0,August
30,Los Angeles,Surgical Masks,2000.0,August
31,Los Angeles,Cloth Masks,22500.0,August
32,Los Angeles,Face Shields (Disposable),200.0,August
...,...,...,...,...
3398257,San Mateo,N-95 Respirators,35000.0,September
3399269,Kern,Examination Gloves,20000.0,September
3399270,Kern,Examination Gloves,10000.0,September
3399271,Kern,Surgical Masks,20000.0,September


In [21]:
agg_func = {'Amount Fulfilled':'sum'}
ppedf = ppedf.groupby(['County','Product','Date']).aggregate(agg_func).reset_index()
ppedf

Unnamed: 0,County,Product,Date,Amount Fulfilled
0,Alameda,Cloth Masks,August,32805900.0
1,Alameda,Cloth Masks,July,8400300.0
2,Alameda,Cloth Masks,June,2872860.0
3,Alameda,Cloth Masks,October,24112800.0
4,Alameda,Cloth Masks,September,43823900.0
...,...,...,...,...
5263,Yuba,Viral Testing Media,August,290592.0
5264,Yuba,Viral Testing Media,July,195884.0
5265,Yuba,Viral Testing Media,June,83500.0
5266,Yuba,Viral Testing Media,October,169536.0


In [22]:
# use groupby, combine rows that have the same County, Product, and Month, and Sum the amount fulfilled

# agg_func = {'Amount Fulfilled':'sum'}
# ppedf.groupby(['County', 'Product', 'Date'])['Amount Fulfilled'].sum().to_frame().reset_index()
# ppedf

In [23]:
# Sort the values alphabetically by County and Date
ppedf = ppedf.sort_values(['County', 'Date'])
ppedf

Unnamed: 0,County,Product,Date,Amount Fulfilled
0,Alameda,Cloth Masks,August,32805900.0
5,Alameda,Coveralls (Hospitals or EMS),August,187096.0
10,Alameda,Examination Gloves,August,102643800.0
15,Alameda,Face Shields (Disposable),August,5439884.0
20,Alameda,Goggles,August,369600.0
...,...,...,...,...
5247,Yuba,Surgical Masks,September,20286000.0
5252,Yuba,Surgical or Examination Gowns,September,1875240.0
5257,Yuba,Swabs,September,900060.0
5262,Yuba,Test Kits,September,96000.0


In [24]:
# Check value counts of County column to see which counties to exclude
ppedf['County'].value_counts()

State Agency             119
San Diego                116
Riverside                113
Tulare                   113
Orange                   113
                        ... 
Sierra                    52
State Agency or Other     50
Tribal                    24
Governmental Entity       14
Other                      6
Name: County, Length: 64, dtype: int64

In [25]:
# Drop data that does not belong to any county
countyToDrop = ppedf[(ppedf['County'] == 'Governmental Entity') | 
                     (ppedf['County'] == 'State Agency') |
                    (ppedf['County'] == 'Non-Governmental Entity') |
                    (ppedf['County'] == 'State Agency or Other') |
                    (ppedf['County'] == 'Other')
                    
                    ].index
ppedf.drop(countyToDrop, inplace = True)
ppedf['County'].value_counts()

San Diego          116
Tulare             113
Orange             113
Riverside          113
Stanislaus         112
Inyo               112
Los Angeles        109
Alameda            105
Sacramento         104
Contra Costa       104
Santa Clara        103
Monterey            99
Fresno              98
San Joaquin         98
Santa Cruz          98
San Mateo           98
Santa Barbara       97
Imperial            96
San Bernardino      93
San Francisco       92
Solano              91
Sonoma              91
Marin               89
Humboldt            89
San Luis Obispo     87
Merced              87
Yolo                87
Butte               86
Placer              86
Ventura             86
San Benito          84
Mendocino           84
Kings               84
Mono                84
Calaveras           84
Lake                83
Madera              82
Nevada              82
Napa                82
Kern                80
Del Norte           78
Amador              78
El Dorado           76
Yuba       

In [26]:
# Reset the index
ppedf = ppedf.reset_index()
ppedf

Unnamed: 0,index,County,Product,Date,Amount Fulfilled
0,0,Alameda,Cloth Masks,August,32805900.0
1,5,Alameda,Coveralls (Hospitals or EMS),August,187096.0
2,10,Alameda,Examination Gloves,August,102643800.0
3,15,Alameda,Face Shields (Disposable),August,5439884.0
4,20,Alameda,Goggles,August,369600.0
...,...,...,...,...,...
5004,5247,Yuba,Surgical Masks,September,20286000.0
5005,5252,Yuba,Surgical or Examination Gowns,September,1875240.0
5006,5257,Yuba,Swabs,September,900060.0
5007,5262,Yuba,Test Kits,September,96000.0


In [27]:
# Output the final cleaned file to its csv
ppedf.to_csv('CleanCSV/PPE_data_clean.csv')