In [1]:
import pandas as pd
import numpy as np
import csv

In [2]:
cali_layoffs = pd.read_csv('california_warn_raw.csv')
cali_population = pd.read_csv('county_population.csv', delimiter='\t', thousands=',')

- California WARN data: https://edd.ca.gov/Jobs_and_Training/Layoff_Services_WARN.htm
- population data from: https://www.census.gov/data/tables/time-series/demo/popest/2010s-counties-total.html#par_textimage_242301767

In [3]:
cali_layoffs.head()

Unnamed: 0,Notice Date,Effective Date,Received Date,Company,City,County,Employees,Layoff/Closure
0,10/26/2020,12/28/2020,11/10/2020,DAMAC Products LLC,La Mirada,Los Angeles County,79.0,Closure Permanent
1,10/27/2020,12/31/2020,11/10/2020,Gulfstream,Long Beach,Los Angeles County,608.0,Closure Permanent
2,10/26/2020,12/16/2020,11/10/2020,Renaissance Newport Beach Hotel,Newport Beach,Orange County,104.0,Closure Permanent
3,10/01/2020,10/05/2020,11/10/2020,Howmet Aerospace,Rancho Cucamonga,San Bernardino County,66.0,Layoff Temporary
4,10/26/2020,07/07/2020,11/10/2020,"Newport Diversified, Inc.",El Cajon,San Diego County,91.0,Layoff Temporary


In [4]:
cali_layoffs.tail()

Unnamed: 0,Notice Date,Effective Date,Received Date,Company,City,County,Employees,Layoff/Closure
9502,06/25/2015,08/24/2015,06/30/2015,"Intuit, Inc.",San Francisco,No County Name,86.0,Layoff Permanent
9503,06/25/2015,08/24/2015,06/30/2015,"Intuit, Inc.",Santa Monica,No County Name,49.0,Closure Permanent
9504,06/25/2015,08/24/2015,06/30/2015,"Intuit, Inc.",Venice,No County Name,11.0,Closure Permanent
9505,06/29/2015,08/28/2015,06/30/2015,"Safeway, Inc.",Pleasanton,No County Name,18.0,Layoff Unknown at this time
9506,06/30/2015,07/22/2015,06/30/2015,Medtronic Ablation Frontiers LLC,Carlsbad,No County Name,50.0,Closure Permanent


## California Layoffs

For the purposes of this analysis, we are selecting 2020 records by isolating the year on the `Notice date` column.
By choosing `Notice date` we make sure that these layoffs were issued after the start of shelter in place for 2020, as some layoff notices can be issues much earlier the layoff date. Here, we are also doing a bit of data cleaning to standardize county names.

In [5]:
cali_layoffs['Year'] = cali_layoffs['Notice Date'].str[-4:]
cali_layoffs['Layoff/Closure'] = cali_layoffs['Layoff/Closure'].str.replace('at thistime', 'at this time')
cali_layoffs['County'] = cali_layoffs['County'].str.strip()
cali_layoffs.loc[~(cali_layoffs['County'].str.endswith('County')),'County'] = cali_layoffs['County'] + " County"
cali_layoffs['County'] = cali_layoffs['County'].str.replace('No County Name County', 'No Name')

## Cleaning Layoff Types

Type Unknown and unknown at this time will be combined.

In [6]:
cali_layoffs['Layoff/Closure'].unique()

array(['Closure Permanent', 'Layoff Temporary', 'Closure Temporary',
       'Layoff Permanent', 'Layoff Type Unknown', 'Closure Type Unknown',
       'Closure Unknown at this time', 'Layoff Unknown at this time'],
      dtype=object)

In [7]:
cali_layoffs['Layoff/Closure'] = cali_layoffs['Layoff/Closure'].str.strip()
cali_layoffs['Layoff/Closure'] = cali_layoffs['Layoff/Closure'].str.lower()
cali_layoffs['Layoff/Closure'] = cali_layoffs['Layoff/Closure'].str.replace('unknown at this time', 'type unknown')
cali_layoffs['Layoff/Closure'].unique()

array(['closure permanent', 'layoff temporary', 'closure temporary',
       'layoff permanent', 'layoff type unknown', 'closure type unknown'],
      dtype=object)

In [8]:
cali_2020 = cali_layoffs[cali_layoffs['Year'] == '2020']
cali_2020['Employees'] = cali_2020['Employees'].astype('Int64')
cali_2020.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cali_2020['Employees'] = cali_2020['Employees'].astype('Int64')


Unnamed: 0,Notice Date,Effective Date,Received Date,Company,City,County,Employees,Layoff/Closure,Year
5636,01/03/2020,03/06/2020,01/09/2020,"Medtronic, Inc.",Goleta,Santa Barbara County,33,layoff permanent,2020
5637,01/06/2020,03/07/2020,01/08/2020,"Edmunds.com, Inc.",Santa Monica,Los Angeles County,122,layoff permanent,2020
5638,01/06/2020,03/09/2020,01/06/2020,SunSelect Produce (California) Inc.,Tehachapi,Kern County,236,closure permanent,2020
5639,01/03/2020,03/04/2020,01/03/2020,"Aramark Services, Inc.",Anaheim,Orange County,9,closure permanent,2020
5640,01/03/2020,03/04/2020,01/03/2020,"Aramark Services, Inc.",Rancho SantaMargarita,Orange County,13,closure permanent,2020


In [9]:
cali_layoffs.tail()

Unnamed: 0,Notice Date,Effective Date,Received Date,Company,City,County,Employees,Layoff/Closure,Year
9502,06/25/2015,08/24/2015,06/30/2015,"Intuit, Inc.",San Francisco,No Name,86.0,layoff permanent,2015
9503,06/25/2015,08/24/2015,06/30/2015,"Intuit, Inc.",Santa Monica,No Name,49.0,closure permanent,2015
9504,06/25/2015,08/24/2015,06/30/2015,"Intuit, Inc.",Venice,No Name,11.0,closure permanent,2015
9505,06/29/2015,08/28/2015,06/30/2015,"Safeway, Inc.",Pleasanton,No Name,18.0,layoff type unknown,2015
9506,06/30/2015,07/22/2015,06/30/2015,Medtronic Ablation Frontiers LLC,Carlsbad,No Name,50.0,closure permanent,2015


In [10]:
cali_layoffs['County'].unique()

array(['Los Angeles County', 'Orange County', 'San Bernardino County',
       'San Diego County', 'Sacramento County', 'Alameda County',
       'Santa Clara County', 'San Francisco County', 'Stanislaus County',
       'Contra Costa County', 'Riverside County', 'Solano County',
       'Ventura County', 'Monterey County', 'Placer County',
       'San Mateo County', 'Santa Barbara County', 'Fresno County',
       'Kings County', 'Kern County', 'Santa Cruz County',
       'Tehama County', 'Mendocino County', 'San Joaquin County',
       'Butte County', 'Sonoma County', 'Napa County', 'Calaveras County',
       'San Luis Obispo County', 'Siskiyou County', 'Yolo County',
       'Yuba County', 'Sutter County', 'Tulare County',
       'El Dorado County', 'Marin County', 'Mariposa County',
       'Imperial County', 'Shasta County', 'Madera County',
       'Del Norte County', 'Mono County', 'Merced County',
       'Humboldt County', 'San Benito County', 'Nevada County',
       'Tuolumne County',

In [11]:
years = ['2020', '2019']

## Extracting Only Necessary Records

Now that the data has been somewhat standardized, we will be selecting the records that have a notice date from the year 2020 or 2019. This will allow for comparisons between a pandemic and non-pandemic year in terms of layoff notices. 

In [12]:
cali_2020_2019 = cali_layoffs[cali_layoffs["Year"].isin(years)]
cali_2020_2019

Unnamed: 0,Notice Date,Effective Date,Received Date,Company,City,County,Employees,Layoff/Closure,Year
0,10/26/2020,12/28/2020,11/10/2020,DAMAC Products LLC,La Mirada,Los Angeles County,79.0,closure permanent,2020
1,10/27/2020,12/31/2020,11/10/2020,Gulfstream,Long Beach,Los Angeles County,608.0,closure permanent,2020
2,10/26/2020,12/16/2020,11/10/2020,Renaissance Newport Beach Hotel,Newport Beach,Orange County,104.0,closure permanent,2020
3,10/01/2020,10/05/2020,11/10/2020,Howmet Aerospace,Rancho Cucamonga,San Bernardino County,66.0,layoff temporary,2020
4,10/26/2020,07/07/2020,11/10/2020,"Newport Diversified, Inc.",El Cajon,San Diego County,91.0,layoff temporary,2020
...,...,...,...,...,...,...,...,...,...
6740,06/28/2019,08/28/2019,06/28/2019,"California Comfort Systems USA, Inc.",North Highlands,Sacramento County,15.0,closure permanent,2019
6741,06/28/2019,08/28/2019,06/28/2019,"California Comfort Systems USA, Inc.",Ontario,San Bernardino County,16.0,closure permanent,2019
6742,06/28/2019,08/28/2019,06/28/2019,"California Comfort Systems USA, Inc.",San Diego,San Diego County,84.0,closure permanent,2019
6743,06/28/2019,08/30/2019,06/28/2019,"Coloredge, Inc.",Burbank,Los Angeles County,32.0,closure permanent,2019


## More Data Cleaning

A few records have the incorrect county, based on the information in the city column. These cities include: Sacramento, Hayward, Indian Wells, Cerritos, Vista, Roseville, City of Industry, Del Mar, Walnut Creek, and Los Gatos.

In [13]:
dict_cities = {
    'Sacramento':'Sacramento County',
    'San Francisco':'San Francisco County',
    'Hayward':'Alameda County',
    'Indian Wells':'Riverside County',
    'Cerritos':'Los Angeles County',
    'Vista':'San Diego County',
    'Roseville':'Placer County',
    'City of Industry':'Los Angeles County',
    'Del Mar':'San Diego County',
    'Walnut Creek':'Contra Costa County',
    'Los Gatos':'Santa Clara County'
}

In [14]:
# Make a copy of this column instead

wrong_county = cali_2020_2019[(cali_2020_2019['City'] == 'Sacramento') & (cali_2020_2019['County'] == 'Del Norte County')]
cali_2020_2019.loc[1793,'County'] = 'Sacramento County'
wrong_county = cali_2020_2019[(cali_2020_2019['City'] == 'Hayward') & (cali_2020_2019['County'] == 'Contra Costa County')]
cali_2020_2019.loc[5749,'County'] = 'Alameda County'
wrong_county = cali_2020_2019[(cali_2020_2019['City'] == 'Indian Wells') & (cali_2020_2019['County'] == 'San Benito County')]
cali_2020_2019.loc[5246,'County'] = 'Riverside County'
wrong_county = cali_2020_2019[(cali_2020_2019['City'] == 'Cerritos') & (cali_2020_2019['County'] == 'Orange County')]
cali_2020_2019.loc[3168,'County'] = 'Los Angeles County'
wrong_county = cali_2020_2019[(cali_2020_2019['City'] == 'Vista') & (cali_2020_2019['County'] == 'San Francisco County')]
cali_2020_2019.loc[5202,'County'] = 'San Diego County'
wrong_county = cali_2020_2019[(cali_2020_2019['City'] == 'Roseville') & (cali_2020_2019['County'] == 'Sacramento County')]
cali_2020_2019.loc[4748,'County'] = 'Placer County'
wrong_county = cali_2020_2019[(cali_2020_2019['City'] == 'City of Industry') & (cali_2020_2019['County'] == 'Orange County')]
cali_2020_2019.loc[1772,'County'] = 'Los Angeles County'
wrong_county = cali_2020_2019[(cali_2020_2019['City'] == 'Del Mar') & (cali_2020_2019['County'] == 'Orange County')]
cali_2020_2019.loc[5190,'County'] = 'San Diego County'
wrong_county = cali_2020_2019[(cali_2020_2019['City'] == 'Walnut Creek') & (cali_2020_2019['County'] == 'Alameda County')]
cali_2020_2019.loc[4433,'County'] = 'Contra Costa County'
wrong_county = cali_2020_2019[(cali_2020_2019['City'] == 'Los Gatos') & (cali_2020_2019['County'] == 'Santa Cruz County')]
cali_2020_2019.loc[2648,'County'] = 'Santa Clara County'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


## Next Steps
- [x] clean layoffs types
- [x] make sure this is the most up to date data or get the most up to date data
- [x] calculate per capita instead of percentage
- [ ] clean company names in open refine

## Exporting Standardized Data

Now that we have 2020 and 2019 data in an isolated dataframe, we can export it and clean the company names in Open Refine.
The data with clean company names will be re-read into the notebook on the line after the export. 

In [15]:
cali_2020_2019.to_csv('tobecleaned_california_warn.csv')

In [16]:
open_refine_clean = pd.read_csv('cleaned_california_warn.csv')

In [17]:
open_refine_clean.head()

Unnamed: 0,Column,Notice Date,Effective Date,Received Date,Company,Company 2,City,County,Employees,Layoff/Closure,Year
0,0,10/26/2020,12/28/2020,11/10/2020,DAMAC Products LLC,DAMAC Products LLC,La Mirada,Los Angeles County,79.0,closure permanent,2020
1,1,10/27/2020,12/31/2020,11/10/2020,Gulfstream,Gulfstream,Long Beach,Los Angeles County,608.0,closure permanent,2020
2,2,10/26/2020,12/16/2020,11/10/2020,Renaissance Newport Beach Hotel,Renaissance Hotel,Newport Beach,Orange County,104.0,closure permanent,2020
3,3,10/01/2020,10/05/2020,11/10/2020,Howmet Aerospace,Howmet Aerospace,Rancho Cucamonga,San Bernardino County,66.0,layoff temporary,2020
4,4,10/26/2020,07/07/2020,11/10/2020,"Newport Diversified, Inc.","Newport Diversified, Inc.",El Cajon,San Diego County,91.0,layoff temporary,2020


In [18]:
open_refine_clean.drop(columns="Column", inplace=True)
open_refine_clean.head()

Unnamed: 0,Notice Date,Effective Date,Received Date,Company,Company 2,City,County,Employees,Layoff/Closure,Year
0,10/26/2020,12/28/2020,11/10/2020,DAMAC Products LLC,DAMAC Products LLC,La Mirada,Los Angeles County,79.0,closure permanent,2020
1,10/27/2020,12/31/2020,11/10/2020,Gulfstream,Gulfstream,Long Beach,Los Angeles County,608.0,closure permanent,2020
2,10/26/2020,12/16/2020,11/10/2020,Renaissance Newport Beach Hotel,Renaissance Hotel,Newport Beach,Orange County,104.0,closure permanent,2020
3,10/01/2020,10/05/2020,11/10/2020,Howmet Aerospace,Howmet Aerospace,Rancho Cucamonga,San Bernardino County,66.0,layoff temporary,2020
4,10/26/2020,07/07/2020,11/10/2020,"Newport Diversified, Inc.","Newport Diversified, Inc.",El Cajon,San Diego County,91.0,layoff temporary,2020


## Adding Population Data

The data has been grabbed from the census link above. These figures will be used to calculate per capita layoff numbers per county.

In [19]:
cali_population.rename(columns={'California':'Counties','39,512,223':'Population'}, inplace=True)
cali_population['County Names'] = cali_population['Counties'].str[1:]
cali_population['County Names'] = cali_population['County Names'].str.split(',').str[0]
cali_population.head(4)

Unnamed: 0,Counties,Population,County Names
0,".Alameda County, California",1671329,Alameda County
1,".Alpine County, California",1129,Alpine County
2,".Amador County, California",39752,Amador County
3,".Butte County, California",219186,Butte County


In [64]:
cali_merge = pd.merge(open_refine_clean, cali_population, how="left", left_on="County", right_on="County Names")
cali_merge.drop(columns='Counties', inplace=True)
cali_merge['City 2'] = cali_merge['City'].str.lower()
cali_merge.head()

Unnamed: 0,Notice Date,Effective Date,Received Date,Company,Company 2,City,County,Employees,Layoff/Closure,Year,Population,County Names,City 2
0,10/26/2020,12/28/2020,11/10/2020,DAMAC Products LLC,DAMAC Products LLC,La Mirada,Los Angeles County,79.0,closure permanent,2020,10039107,Los Angeles County,la mirada
1,10/27/2020,12/31/2020,11/10/2020,Gulfstream,Gulfstream,Long Beach,Los Angeles County,608.0,closure permanent,2020,10039107,Los Angeles County,long beach
2,10/26/2020,12/16/2020,11/10/2020,Renaissance Newport Beach Hotel,Renaissance Hotel,Newport Beach,Orange County,104.0,closure permanent,2020,3175692,Orange County,newport beach
3,10/01/2020,10/05/2020,11/10/2020,Howmet Aerospace,Howmet Aerospace,Rancho Cucamonga,San Bernardino County,66.0,layoff temporary,2020,2180085,San Bernardino County,rancho cucamonga
4,10/26/2020,07/07/2020,11/10/2020,"Newport Diversified, Inc.","Newport Diversified, Inc.",El Cajon,San Diego County,91.0,layoff temporary,2020,3338330,San Diego County,el cajon


In [67]:
pd.options.display.max_rows = 999
dupes = cali_merge[cali_merge[['Company 2', 'City 2', 'County', 'Employees', 'Year']].duplicated(keep=False)].sort_values(by='Company 2', ascending=True)
dupes.head(10)

Unnamed: 0,Notice Date,Effective Date,Received Date,Company,Company 2,City,County,Employees,Layoff/Closure,Year,Population,County Names,City 2
810,07/23/2020,07/23/2020,08/18/2020,"24 HOUR FITNESS, USA, INC.",24 Hour Fitness USA Inc.,Carlsbad,San Diego County,39.0,layoff temporary,2020,3338330,San Diego County,carlsbad
917,08/05/2020,09/09/2020,08/10/2020,"24 Hour Fitness, USA, Inc",24 Hour Fitness USA Inc.,Carlsbad,San Diego County,39.0,layoff type unknown,2020,3338330,San Diego County,carlsbad
3990,04/03/2020,04/03/2020,04/17/2020,"4LEAF, Inc.","4LEAF, Inc",Pleasanton,Alameda County,53.0,layoff temporary,2020,1671329,Alameda County,pleasanton
3989,04/03/2020,04/03/2020,04/17/2020,"4LEAF, Inc","4LEAF, Inc",Pleasanton,Alameda County,53.0,layoff temporary,2020,1671329,Alameda County,pleasanton
5019,03/24/2020,03/27/2020,04/07/2020,ABM AVIATION,"ABM Aviation, Inc.",Los Angeles,Los Angeles County,50.0,layoff temporary,2020,10039107,Los Angeles County,los angeles
161,10/06/2020,10/09/2020,10/08/2020,ABM Aviation,"ABM Aviation, Inc.",Los Angeles,Los Angeles County,50.0,layoff permanent,2020,10039107,Los Angeles County,los angeles
1394,06/15/2020,06/30/2020,06/25/2020,AEG LA Youth Soccer Academy LLC,AEG LA Youth Soccer Academy LLC,Carson,Los Angeles County,3.0,layoff temporary,2020,10039107,Los Angeles County,carson
1543,06/08/2020,06/30/2020,06/15/2020,AEG LA Youth Soccer Academy LLC,AEG LA Youth Soccer Academy LLC,Carson,Los Angeles County,3.0,layoff permanent,2020,10039107,Los Angeles County,carson
3397,04/11/2020,04/14/2020,04/24/2020,AIO Acquisition Inc.,AIO Acquisition Inc.,Ontario,San Bernardino County,5.0,layoff permanent,2020,2180085,San Bernardino County,ontario
72,05/22/2020,05/22/2020,10/26/2020,AIO Acquisition Inc.,AIO Acquisition Inc.,Ontario,San Bernardino County,5.0,layoff permanent,2020,2180085,San Bernardino County,ontario


In [78]:
cali_merge[cali_merge[['Company 2', 'City 2', 'County', 'Employees', 'Year']].duplicated() == True]

Unnamed: 0,Notice Date,Effective Date,Received Date,Company,Company 2,City,County,Employees,Layoff/Closure,Year,Population,County Names,City 2
16,10/01/2020,12/11/2020,11/09/2020,Ace Beverage LLC,Ace Beverage LLC,Los Angeles,Los Angeles County,200.0,closure permanent,2020,10039107,Los Angeles County,los angeles
86,10/15/2020,02/22/2021,10/22/2020,"Martiz Holdings, Inc.","Maritz Holdings, Inc.",Torrance,Los Angeles County,2.0,layoff permanent,2020,10039107,Los Angeles County,torrance
163,10/01/2020,10/01/2020,10/08/2020,Electro Rent,Electro Rent,Canoga Park,Los Angeles County,25.0,layoff temporary,2020,10039107,Los Angeles County,canoga park
168,09/30/2020,09/28/2020,10/07/2020,Centric Brands Inc.,Centric Brands Inc.,Los Angeles,Los Angeles County,1.0,layoff permanent,2020,10039107,Los Angeles County,los angeles
228,06/26/2020,08/09/2020,10/02/2020,nVent Thermal,nVent Thermal LLC,Redwood City,San Mateo County,62.0,layoff temporary,2020,766573,San Mateo County,redwood city
361,06/09/2020,06/15/2020,09/26/2020,JC Resorts LLC Surf and Sand Resort,JC RESORTS,Laguna Beach,Orange County,40.0,layoff permanent,2020,3175692,Orange County,laguna beach
368,06/09/2020,04/30/2020,09/26/2020,Golden Valley Health Center,Golden Valley Health Center,Modesto,Stanislaus County,1.0,closure temporary,2020,550660,Stanislaus County,modesto
369,06/09/2020,05/04/2020,09/26/2020,Golden Valley Health Centers,Golden Valley Health Center,Modesto,Stanislaus County,1.0,layoff temporary,2020,550660,Stanislaus County,modesto
370,06/09/2020,06/01/2020,09/26/2020,Golden Valley Health Centers,Golden Valley Health Center,Ceres,Stanislaus County,1.0,layoff temporary,2020,550660,Stanislaus County,ceres
452,06/03/2020,06/05/2020,09/20/2020,Wyndham Vacation Ownership - Vino Napa Bello R...,Wyndham Vacation Ownership,Napa,Napa County,2.0,layoff permanent,2020,137744,Napa County,napa


In [66]:
# employees_sum = cali_merge[['County','Employees']]
# layoffs = employees_sum.groupby('County').agg('sum')
# layoffs.reset_index()

In [59]:
# cali_info_and_sums = pd.merge(cali_merge, layoffs, how="left", left_on="County", right_on="County")
# cali_info_and_sums.rename(columns={'Employees_y':'Total Emp. Laid Off in County', 'Employees_x':'Employees'}, inplace=True)
# cali_info_and_sums['Per Capita Laid Off'] = (cali_info_and_sums['Total Emp. Laid Off in County']/cali_info_and_sums['Population']) * 100000
# cali_info_and_sums['City 2'] = cali_info_and_sums['City'].str.lower()
# cali_info_and_sums.head()

In [23]:
#possible_dupes = If the company name, city and county name, layoff # + year # are the same, 
#but the layoff type is different, flag it, add it to a different dataframe.

In [60]:
# pd.options.display.max_rows = 999
# dupes = cali_info_and_sums[cali_info_and_sums[['Company 2', 'City 2', 'County', 'Employees', 'Year']].duplicated(keep=False)].sort_values(by='Company 2', ascending=True)
# dupes.head(20)

In [77]:
## This dataframe will hold the data that is non duplicated.

# cali_no_dupes = cali_info_and_sums[~cali_info_and_sums.isin(dupes)].dropna()

In [76]:
## Here we group by key columns in the dupes dataframe to get one single entry of the dupes and then add it back to cali_no_dupes as a new dataframe

# dupes = cali_info_and_sums[cali_info_and_sums[['Company 2', 'City 2', 'County', 'Employees', 'Year']].duplicated(keep=False)].sort_values(by='Company 2', ascending=True)
# dupes

6369