In [1]:
import pandas as pd
import numpy as np
import csv

In [2]:
cali_layoffs = pd.read_csv('../data/cleaning/california_warn_raw_recent.csv')
cali_population = pd.read_csv('../data/cleaning/county_population.csv', delimiter='\t', thousands=',')

In [3]:
cali_layoffs.head()

Unnamed: 0,Notice Date,Effective Date,Received Date,Company,City,County,Employees,Layoff/Closure
0,12/04/2020,12/06/2020,12/30/2020,Blackhawk Country Club,Danville,Contra Costa County,3.0,Layoff Temporary
1,12/07/2020,12/09/2020,12/30/2020,"Fullerton Baekjeong, LLC",Buena Park,Orange County,67.0,Closure Permanent
2,12/07/2020,12/09/2020,12/30/2020,"Irvine Baekjeong, LLC",Irvine,Orange County,68.0,Closure Permanent
3,12/09/2020,12/10/2020,12/30/2020,OCMC Inc. dba Orange County Mining Co.,Santa Ana,Orange County,59.0,Closure Temporary
4,12/07/2020,12/07/2020,12/30/2020,"Parker Palm Springs, LLC",Palm Springs,Riverside County,79.0,Layoff Temporary


### Ambiguous Cities

In doing some preliminary exploration of the data, we learned that some cities have been assigned the incorrect county name. The code below finds the cities with more than one county assigned. We use the results produced to go back and clean those records in the WARN Layoffs analysis notebook.

### Clean County Names

In [4]:
cali_layoffs['County'] = cali_layoffs['County'].str.strip()
cali_layoffs['County Orig'] = cali_layoffs['County']
cali_layoffs.loc[~(cali_layoffs['County'].str.endswith('County')),'County'] = cali_layoffs['County'] + " County"
cali_layoffs['County'] = cali_layoffs['County'].str.replace('No County Name County', 'No Name')
cali_layoffs['Year'] = cali_layoffs['Notice Date'].str[-4:]
cali_layoffs['Year'] = cali_layoffs['Year'].astype(np.int64)
cali_layoffs.head()

Unnamed: 0,Notice Date,Effective Date,Received Date,Company,City,County,Employees,Layoff/Closure,County Orig,Year
0,12/04/2020,12/06/2020,12/30/2020,Blackhawk Country Club,Danville,Contra Costa County,3.0,Layoff Temporary,Contra Costa County,2020
1,12/07/2020,12/09/2020,12/30/2020,"Fullerton Baekjeong, LLC",Buena Park,Orange County,67.0,Closure Permanent,Orange County,2020
2,12/07/2020,12/09/2020,12/30/2020,"Irvine Baekjeong, LLC",Irvine,Orange County,68.0,Closure Permanent,Orange County,2020
3,12/09/2020,12/10/2020,12/30/2020,OCMC Inc. dba Orange County Mining Co.,Santa Ana,Orange County,59.0,Closure Temporary,Orange County,2020
4,12/07/2020,12/07/2020,12/30/2020,"Parker Palm Springs, LLC",Palm Springs,Riverside County,79.0,Layoff Temporary,Riverside County,2020


### Cleaning Layoff Types

Type Unknown and unknown at this time will be combined.

In [5]:
cali_layoffs['Layoff/Closure'].unique()

array(['Layoff Temporary', 'Closure Permanent', 'Closure Temporary',
       'Layoff Permanent', 'Layoff Type Unknown', 'Closure Type Unknown',
       'Closure Unknown at thistime', 'Layoff Unknown at thistime',
       'Layoff Unknown at this time', 'Closure Unknown at this time'],
      dtype=object)

In [6]:
cali_layoffs['Layoff/Closure clean'] = cali_layoffs['Layoff/Closure'].str.replace('at thistime', 'at this time')
cali_layoffs['Layoff/Closure clean'] = cali_layoffs['Layoff/Closure clean'].str.strip()
cali_layoffs['Layoff/Closure clean'] = cali_layoffs['Layoff/Closure clean'].str.lower()
cali_layoffs['Layoff/Closure clean'] = cali_layoffs['Layoff/Closure clean'].str.replace('unknown at this time', 'type unknown')
cali_layoffs['Layoff/Closure clean'] = cali_layoffs['Layoff/Closure clean'].str.replace('type unknown', 'type uncategorized')
cali_layoffs['Layoff/Closure clean'].unique()

array(['layoff temporary', 'closure permanent', 'closure temporary',
       'layoff permanent', 'layoff type uncategorized',
       'closure type uncategorized'], dtype=object)

In [7]:
cali_dict = cali_layoffs.to_dict(orient='records')
cali_dict[0:2]

[{'Notice Date': '12/04/2020',
  'Effective Date': '12/06/2020',
  'Received Date': '12/30/2020',
  'Company': 'Blackhawk Country Club',
  'City': 'Danville',
  'County': 'Contra Costa County',
  'Employees': 3.0,
  'Layoff/Closure': 'Layoff Temporary',
  'County Orig': 'Contra Costa County',
  'Year': 2020,
  'Layoff/Closure clean': 'layoff temporary'},
 {'Notice Date': '12/07/2020',
  'Effective Date': '12/09/2020',
  'Received Date': '12/30/2020',
  'Company': 'Fullerton Baekjeong, LLC',
  'City': 'Buena Park',
  'County': 'Orange County',
  'Employees': 67.0,
  'Layoff/Closure': 'Closure Permanent',
  'County Orig': 'Orange County',
  'Year': 2020,
  'Layoff/Closure clean': 'closure permanent'}]

In [8]:
cali_layoffs['County'].unique()

array(['Contra Costa County', 'Orange County', 'Riverside County',
       'Santa Clara County', 'San Diego County', 'San Mateo County',
       'Sacramento County', 'San Luis Obispo County', 'Sonoma County',
       'San Francisco County', 'Ventura County', 'Inyo County',
       'Los Angeles County', 'San Joaquin County', 'Stanislaus County',
       'Alameda County', 'San Bernardino County', 'Napa County',
       'Santa Barbara County', 'Marin County', 'Fresno County',
       'Santa Cruz County', 'Solano County', 'Butte County',
       'Glenn County', 'Tulare County', 'Kings County', 'Monterey County',
       'Shasta County', 'Placer County', 'Kern County', 'Tehama County',
       'Mendocino County', 'Calaveras County', 'Siskiyou County',
       'Yolo County', 'Yuba County', 'Sutter County', 'El Dorado County',
       'Mariposa County', 'Imperial County', 'Madera County',
       'Del Norte County', 'Mono County', 'Merced County',
       'Humboldt County', 'San Benito County', 'Nevada Cou

In [9]:
rows_with_no_county = [r for r in cali_dict if r["County"] == "No Name"]
len(rows_with_no_county)

2117

In [10]:
rows_with_no_county[0]

{'Notice Date': '06/30/2016',
 'Effective Date': '08/31/2016',
 'Received Date': '07/01/2016',
 'Company': 'DCS Facility Services',
 'City': 'Antelope',
 'County': 'No Name',
 'Employees': 55.0,
 'Layoff/Closure': 'Closure Permanent',
 'County Orig': 'No County Name',
 'Year': 2016,
 'Layoff/Closure clean': 'closure permanent'}

In [11]:
cities = [r["City"] for r in rows_with_no_county]

In [12]:
len(cities)

2117

In [13]:
cities = set(cities)

In [14]:
len(cities)

355

In [15]:
rows_with_county = [r for r in cali_dict if r["County"] != "No Name"]

In [16]:
len(rows_with_county)

7633

In [17]:
rows_with_county[334]

{'Notice Date': '08/03/2020',
 'Effective Date': '07/31/2020',
 'Received Date': '10/23/2020',
 'Company': 'Galpin Motors, Inc.',
 'City': 'North Hills',
 'County': 'Los Angeles County',
 'Employees': 47.0,
 'Layoff/Closure': 'Layoff Permanent',
 'County Orig': 'Los Angeles County',
 'Year': 2020,
 'Layoff/Closure clean': 'layoff permanent'}

In [18]:
city_to_counties = {}
for row in rows_with_county:
    city = row["City"]
    county = row["County"]
    if city not in city_to_counties:
        city_to_counties[city] = set()
    city_to_counties[city].add(county)

In [19]:
unambiguous_cities = [pair for pair in city_to_counties.items() if len(pair[1]) == 1]

In [20]:
len(unambiguous_cities)

525

In [21]:
unambiguous_cities = {pair[0]: list(pair[1])[0] for pair in city_to_counties.items() if len(pair[1]) == 1}

In [22]:
for row in rows_with_no_county:
    if row["City"] in unambiguous_cities:
        row["County"] = unambiguous_cities[row["City"]]

In [23]:
rows_with_no_county[:2]

[{'Notice Date': '06/30/2016',
  'Effective Date': '08/31/2016',
  'Received Date': '07/01/2016',
  'Company': 'DCS Facility Services',
  'City': 'Antelope',
  'County': 'No Name',
  'Employees': 55.0,
  'Layoff/Closure': 'Closure Permanent',
  'County Orig': 'No County Name',
  'Year': 2016,
  'Layoff/Closure clean': 'closure permanent'},
 {'Notice Date': '06/30/2016',
  'Effective Date': '08/31/2016',
  'Received Date': '07/01/2016',
  'Company': 'DCS Facility Services',
  'City': 'Bakersfield',
  'County': 'Kern County',
  'Employees': 22.0,
  'Layoff/Closure': 'Closure Permanent',
  'County Orig': 'No County Name',
  'Year': 2016,
  'Layoff/Closure clean': 'closure permanent'}]

In [24]:
ambiguous_cities = [pair for pair in city_to_counties.items() if len(pair[1]) > 1]

In [25]:
ambiguous_cities

[('Sacramento', {'Del Norte County', 'Sacramento County'}),
 ('San Francisco', {'San Francisco County', 'San Mateo County'}),
 ('Roseville', {'Placer County', 'Sacramento County'}),
 ('Walnut Creek', {'Alameda County', 'Contra Costa County'}),
 ('Los Angeles', {'Alameda County', 'Los Angeles County'}),
 ('Berkeley', {'Alameda County', 'Contra Costa County'}),
 ('Cerritos', {'Los Angeles County', 'Orange County'}),
 ('Hayward', {'Alameda County', 'Contra Costa County'}),
 ('Indian Wells', {'Riverside County', 'San Benito County'}),
 ('Vista', {'San Diego County', 'San Francisco County'}),
 ('City of Industry', {'Los Angeles County', 'Orange County'}),
 ('Del Mar', {'Orange County', 'San Diego County'}),
 ('Los Gatos', {'Santa Clara County', 'Santa Cruz County'}),
 ('Rancho', {'Los Angeles County', 'San Bernardino County'})]

### Manual research narrowed down which cities actually belonged to which counties
resulting in our updated `ambiguous_cities` dictionary. `Rancho` is the only city on the list where the two counties actually have cities with the same name. It will be left off the list. 

***The dictionary below may need further manual adjusting if this notebook is being run with updated data.***

In [26]:
dict_cities = {
    'Sacramento':'Sacramento County',
    'San Francisco':'San Francisco County',
    'Hayward':'Alameda County',
    'Indian Wells':'Riverside County',
    'Cerritos':'Los Angeles County',
    'Vista':'San Diego County',
    'Roseville':'Placer County',
    'City of Industry':'Los Angeles County',
    'Del Mar':'San Diego County',
    'Walnut Creek':'Contra Costa County',
    'Los Gatos':'Santa Clara County',
    'Los Angeles':'Los Angeles County',
    'Berkeley': 'Alameda County'
}

### Clean County Names
Below is our code to correct County Names based on City Names. 
We are taking the dictionary above, looping through the city and county names, which are the keys and values, replacing the incorrect County Name with the correct one for each city's record.

In [27]:
for city, county in dict_cities.items():
    cali_layoffs.loc[cali_layoffs['City'] == city, ['County']] = county

In [28]:
los_gatos_clean = cali_layoffs[cali_layoffs['City'] == 'Los Gatos']
los_gatos_clean.head(8)

Unnamed: 0,Notice Date,Effective Date,Received Date,Company,City,County,Employees,Layoff/Closure,County Orig,Year,Layoff/Closure clean
1594,06/23/2020,08/20/2020,06/29/2020,Boston Scientific Corporation,Los Gatos,Santa Clara County,60.0,Closure Permanent,Santa Clara County,2020,closure permanent
1686,06/09/2020,06/07/2020,06/24/2020,Bay Club Courtside,Los Gatos,Santa Clara County,294.0,Layoff Permanent,Santa Clara County,2020,layoff permanent
2592,03/30/2020,03/17/2020,05/20/2020,BACK OF THE HOUSE DBA SUPER DUPER,Los Gatos,Santa Clara County,30.0,Layoff Temporary,Santa Clara County,2020,layoff temporary
2891,05/07/2020,03/17/2020,05/13/2020,Diocese of Monterey,Los Gatos,Santa Clara County,1.0,Layoff Temporary,Santa Cruz County,2020,layoff temporary
3232,04/03/2020,03/31/2020,05/01/2020,Insight Health Corp.,Los Gatos,Santa Clara County,6.0,Layoff Temporary,Santa Clara County,2020,layoff temporary
4126,04/03/2020,04/12/2020,04/18/2020,Addison-Penzak Community Center of SiliconValley,Los Gatos,Santa Clara County,130.0,Layoff Temporary,Santa Clara County,2020,layoff temporary
4730,03/30/2020,03/16/2020,04/13/2020,Kronos LLC,Los Gatos,Santa Clara County,65.0,Layoff Unknown at thistime,Santa Clara County,2020,layoff type uncategorized
5175,03/25/2020,03/31/2020,04/09/2020,Testarossa Winery,Los Gatos,Santa Clara County,134.0,Layoff Temporary,Santa Clara County,2020,layoff temporary


### Adding Population Data

The data has been grabbed from the census link above. These figures will be used to calculate per capita layoff numbers per county.

In [29]:
cali_population.rename(columns={'California':'Counties','39,512,223':'Population'}, inplace=True)
cali_population['County Names'] = cali_population['Counties'].str[1:]
cali_population['County Names'] = cali_population['County Names'].str.split(',').str[0]
cali_population.head(4)

Unnamed: 0,Counties,Population,County Names
0,".Alameda County, California",1671329,Alameda County
1,".Alpine County, California",1129,Alpine County
2,".Amador County, California",39752,Amador County
3,".Butte County, California",219186,Butte County


In [30]:
cali_merge = pd.merge(cali_layoffs, cali_population, how="left", left_on="County", right_on="County Names")
cali_merge.drop(columns='Counties', inplace=True)
cali_merge['City 2'] = cali_merge['City'].str.lower()
cali_merge.head()

Unnamed: 0,Notice Date,Effective Date,Received Date,Company,City,County,Employees,Layoff/Closure,County Orig,Year,Layoff/Closure clean,Population,County Names,City 2
0,12/04/2020,12/06/2020,12/30/2020,Blackhawk Country Club,Danville,Contra Costa County,3.0,Layoff Temporary,Contra Costa County,2020,layoff temporary,1153526.0,Contra Costa County,danville
1,12/07/2020,12/09/2020,12/30/2020,"Fullerton Baekjeong, LLC",Buena Park,Orange County,67.0,Closure Permanent,Orange County,2020,closure permanent,3175692.0,Orange County,buena park
2,12/07/2020,12/09/2020,12/30/2020,"Irvine Baekjeong, LLC",Irvine,Orange County,68.0,Closure Permanent,Orange County,2020,closure permanent,3175692.0,Orange County,irvine
3,12/09/2020,12/10/2020,12/30/2020,OCMC Inc. dba Orange County Mining Co.,Santa Ana,Orange County,59.0,Closure Temporary,Orange County,2020,closure temporary,3175692.0,Orange County,santa ana
4,12/07/2020,12/07/2020,12/30/2020,"Parker Palm Springs, LLC",Palm Springs,Riverside County,79.0,Layoff Temporary,Riverside County,2020,layoff temporary,2470546.0,Riverside County,palm springs


### Extracting Only Necessary Records

Now that the data has been standardized, we will be selecting the records that have a notice date from the year 2020 or 2019. This will allow for comparisons between a pandemic and non-pandemic year in terms of layoff notices. 

In [31]:
years = ['2020', '2019']

cali_2020_2019 = cali_merge[cali_merge["Year"].isin(years)]
cali_2020_2019.drop(columns='County Names', inplace=True)
cali_2020_2019.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,Notice Date,Effective Date,Received Date,Company,City,County,Employees,Layoff/Closure,County Orig,Year,Layoff/Closure clean,Population,City 2
0,12/04/2020,12/06/2020,12/30/2020,Blackhawk Country Club,Danville,Contra Costa County,3.0,Layoff Temporary,Contra Costa County,2020,layoff temporary,1153526.0,danville
1,12/07/2020,12/09/2020,12/30/2020,"Fullerton Baekjeong, LLC",Buena Park,Orange County,67.0,Closure Permanent,Orange County,2020,closure permanent,3175692.0,buena park
2,12/07/2020,12/09/2020,12/30/2020,"Irvine Baekjeong, LLC",Irvine,Orange County,68.0,Closure Permanent,Orange County,2020,closure permanent,3175692.0,irvine
3,12/09/2020,12/10/2020,12/30/2020,OCMC Inc. dba Orange County Mining Co.,Santa Ana,Orange County,59.0,Closure Temporary,Orange County,2020,closure temporary,3175692.0,santa ana
4,12/07/2020,12/07/2020,12/30/2020,"Parker Palm Springs, LLC",Palm Springs,Riverside County,79.0,Layoff Temporary,Riverside County,2020,layoff temporary,2470546.0,palm springs


In [32]:
len(cali_2020_2019)

6612

In [33]:
cali_2020_2019.to_csv('../data/harmonizing/clean_warn_data.csv', index=False) # old version of `clean_warn_data.csv` without the deduping