In [1]:
import pandas as pd
import numpy as np
import csv

In [2]:
cali_layoffs = pd.read_csv('../data/cleaning/california_warn_raw_recent.csv')
cali_population = pd.read_csv('../data/cleaning/county_population.csv', delimiter='\t', thousands=',')

In [3]:
cali_layoffs.head()

Unnamed: 0,Notice Date,Effective Date,Received Date,Company,City,County,Employees,Layoff/Closure
0,06/09/2020,06/07/2020,07/01/2020,Bay Club Redondo Beach,Redondo Beach,Los Angeles County,102.0,Layoff Permanent
1,06/09/2020,06/07/2020,07/01/2020,Bay Club Rolling Hills,Rolling Hills Estates,Los Angeles County,64.0,Layoff Permanent
2,06/09/2020,06/07/2020,07/01/2020,Bay Club Santa Monica,Santa Monica,Los Angeles County,82.0,Layoff Permanent
3,06/19/2020,08/21/2020,07/01/2020,"Weber Metals, Inc",Paramount,Los Angeles County,169.0,Layoff Permanent
4,06/09/2020,06/07/2020,07/01/2020,StoneTree Golf Club,Novato,Marin County,32.0,Layoff Permanent


### Ambiguous Cities

In doing some preliminary exploration of the data, we learned that some cities have been assigned the incorrect county name. The code below finds the cities with more than one county assigned. We use the results produced to go back and clean those records in the WARN Layoffs analysis notebook.

### Clean County, Company and City Columns

In [4]:
cali_layoffs['Company'] = cali_layoffs['Company'].str.replace('\\r', '')
cali_layoffs['Company'] = cali_layoffs['Company'].str.replace('\\n', '')
cali_layoffs['City'] = cali_layoffs['City'].str.replace('\\r', '')
cali_layoffs['City'] = cali_layoffs['City'].str.replace('\\n', '')
cali_layoffs['Layoff/Closure'] = cali_layoffs['Layoff/Closure'].str.replace('\\r', '')
cali_layoffs['Layoff/Closure'] = cali_layoffs['Layoff/Closure'].str.replace('\\n', '')

In [5]:
cali_layoffs['County'] = cali_layoffs['County'].str.strip()
cali_layoffs['County Orig'] = cali_layoffs['County']
cali_layoffs['County'].fillna('Not Available', inplace=True)

In [6]:
cali_layoffs.loc[~(cali_layoffs['County'].str.endswith('County')) & (cali_layoffs['County'] != 'Not Available'),'County'] += " County"
cali_layoffs['County'].unique()

array(['Los Angeles County', 'Marin County', 'Orange County',
       'Sacramento County', 'San Bernardino County',
       'San Francisco County', 'San Mateo County', 'Santa Clara County',
       'Ventura County', 'Kern County', 'Stanislaus County',
       'Contra Costa County', 'Alameda County', 'San Diego County',
       'Kings County', 'San Joaquin County', 'Sonoma County',
       'Tulare County', 'Napa County', 'Monterey County', 'Placer County',
       'Mono County', 'Riverside County', 'Fresno County',
       'Mendocino County', 'Del Norte County', 'Yolo County',
       'San Luis Obispo County', 'Madera County', 'Shasta County',
       'Solano County', 'Santa Barbara County', 'Imperial County',
       'Mariposa County', 'Santa Cruz County', 'El Dorado County',
       'Butte County', 'Sutter County', 'Yuba County', 'Siskiyou County',
       'Calaveras County', 'Tehama County', 'Glenn County', 'Inyo County',
       'Not Available', 'San Benito County', 'Merced County',
       'Nevad

In [7]:
cali_layoffs['Year'] = cali_layoffs['Notice Date'].str[-4:]
cali_layoffs['Year'] = cali_layoffs['Year'].astype(np.int64)
cali_layoffs.head()

Unnamed: 0,Notice Date,Effective Date,Received Date,Company,City,County,Employees,Layoff/Closure,County Orig,Year
0,06/09/2020,06/07/2020,07/01/2020,Bay Club Redondo Beach,Redondo Beach,Los Angeles County,102.0,Layoff Permanent,Los Angeles County,2020
1,06/09/2020,06/07/2020,07/01/2020,Bay Club Rolling Hills,Rolling Hills Estates,Los Angeles County,64.0,Layoff Permanent,Los Angeles County,2020
2,06/09/2020,06/07/2020,07/01/2020,Bay Club Santa Monica,Santa Monica,Los Angeles County,82.0,Layoff Permanent,Los Angeles County,2020
3,06/19/2020,08/21/2020,07/01/2020,"Weber Metals, Inc",Paramount,Los Angeles County,169.0,Layoff Permanent,Los Angeles County,2020
4,06/09/2020,06/07/2020,07/01/2020,StoneTree Golf Club,Novato,Marin County,32.0,Layoff Permanent,Marin County,2020


In [8]:
cali_layoffs['County'].unique()

array(['Los Angeles County', 'Marin County', 'Orange County',
       'Sacramento County', 'San Bernardino County',
       'San Francisco County', 'San Mateo County', 'Santa Clara County',
       'Ventura County', 'Kern County', 'Stanislaus County',
       'Contra Costa County', 'Alameda County', 'San Diego County',
       'Kings County', 'San Joaquin County', 'Sonoma County',
       'Tulare County', 'Napa County', 'Monterey County', 'Placer County',
       'Mono County', 'Riverside County', 'Fresno County',
       'Mendocino County', 'Del Norte County', 'Yolo County',
       'San Luis Obispo County', 'Madera County', 'Shasta County',
       'Solano County', 'Santa Barbara County', 'Imperial County',
       'Mariposa County', 'Santa Cruz County', 'El Dorado County',
       'Butte County', 'Sutter County', 'Yuba County', 'Siskiyou County',
       'Calaveras County', 'Tehama County', 'Glenn County', 'Inyo County',
       'Not Available', 'San Benito County', 'Merced County',
       'Nevad

### Cleaning Layoff Types

Type Unknown and unknown at this time will be combined.

In [9]:
cali_layoffs['Layoff/Closure'].unique()

array(['Layoff Permanent', 'Layoff Type Unknown', 'Layoff Temporary',
       'Closure Temporary', 'Closure Permanent', 'Closure Type Unknown',
       'Layoff Unknown at this time', 'Closure Unknown at this time',
       'Closure Unknown at thistime', 'Layoff Unknown at thistime'],
      dtype=object)

In [10]:
cali_layoffs['Layoff/Closure clean'] = cali_layoffs['Layoff/Closure'].str.replace('at thistime', 'at this time')
cali_layoffs['Layoff/Closure clean'] = cali_layoffs['Layoff/Closure clean'].str.strip()
cali_layoffs['Layoff/Closure clean'] = cali_layoffs['Layoff/Closure clean'].str.lower()
cali_layoffs['Layoff/Closure clean'] = cali_layoffs['Layoff/Closure clean'].str.replace('unknown at this time', 'type unknown')
cali_layoffs['Layoff/Closure clean'] = cali_layoffs['Layoff/Closure clean'].str.replace('type unknown', 'type uncategorized')
cali_layoffs['Layoff/Closure clean'].unique()

array(['layoff permanent', 'layoff type uncategorized',
       'layoff temporary', 'closure temporary', 'closure permanent',
       'closure type uncategorized'], dtype=object)

In [11]:
cali_dict = cali_layoffs.to_dict(orient='records')
cali_dict[0:2]

[{'Notice Date': '06/09/2020',
  'Effective Date': '06/07/2020',
  'Received Date': '07/01/2020',
  'Company': 'Bay Club Redondo Beach',
  'City': 'Redondo Beach',
  'County': 'Los Angeles County',
  'Employees': 102.0,
  'Layoff/Closure': 'Layoff Permanent',
  'County Orig': 'Los Angeles County',
  'Year': 2020,
  'Layoff/Closure clean': 'layoff permanent'},
 {'Notice Date': '06/09/2020',
  'Effective Date': '06/07/2020',
  'Received Date': '07/01/2020',
  'Company': 'Bay Club Rolling Hills',
  'City': 'Rolling Hills Estates',
  'County': 'Los Angeles County',
  'Employees': 64.0,
  'Layoff/Closure': 'Layoff Permanent',
  'County Orig': 'Los Angeles County',
  'Year': 2020,
  'Layoff/Closure clean': 'layoff permanent'}]

In [12]:
cali_layoffs['County'].unique()

array(['Los Angeles County', 'Marin County', 'Orange County',
       'Sacramento County', 'San Bernardino County',
       'San Francisco County', 'San Mateo County', 'Santa Clara County',
       'Ventura County', 'Kern County', 'Stanislaus County',
       'Contra Costa County', 'Alameda County', 'San Diego County',
       'Kings County', 'San Joaquin County', 'Sonoma County',
       'Tulare County', 'Napa County', 'Monterey County', 'Placer County',
       'Mono County', 'Riverside County', 'Fresno County',
       'Mendocino County', 'Del Norte County', 'Yolo County',
       'San Luis Obispo County', 'Madera County', 'Shasta County',
       'Solano County', 'Santa Barbara County', 'Imperial County',
       'Mariposa County', 'Santa Cruz County', 'El Dorado County',
       'Butte County', 'Sutter County', 'Yuba County', 'Siskiyou County',
       'Calaveras County', 'Tehama County', 'Glenn County', 'Inyo County',
       'Not Available', 'San Benito County', 'Merced County',
       'Nevad

In [13]:
rows_with_no_county = [r for r in cali_dict if r["County"] == "Not Available"]
len(rows_with_no_county)

2117

In [14]:
rows_with_no_county[0]

{'Notice Date': '06/22/2015',
 'Effective Date': '03/25/2016',
 'Received Date': '07/01/2015',
 'Company': 'Maxim Integrated Product',
 'City': 'San Jose',
 'County': 'Not Available',
 'Employees': 150.0,
 'Layoff/Closure': 'Closure Permanent',
 'County Orig': nan,
 'Year': 2015,
 'Layoff/Closure clean': 'closure permanent'}

In [15]:
cities = [r["City"] for r in rows_with_no_county]

In [16]:
len(cities)

2117

In [17]:
cities = set(cities)

In [18]:
len(cities)

355

In [19]:
rows_with_county = [r for r in cali_dict if r["County"] != "Not Available"]

In [20]:
len(rows_with_county)

7747

In [21]:
rows_with_county[334]

{'Notice Date': '07/03/2020',
 'Effective Date': '07/03/2020',
 'Received Date': '08/03/2020',
 'Company': 'Wyndham San Diego Bayside',
 'City': 'San Diego',
 'County': 'San Diego County',
 'Employees': 88.0,
 'Layoff/Closure': 'Layoff Temporary',
 'County Orig': 'San Diego County',
 'Year': 2020,
 'Layoff/Closure clean': 'layoff temporary'}

In [22]:
city_to_counties = {}
for row in rows_with_county:
    city = row["City"]
    county = row["County"]
    if city not in city_to_counties:
        city_to_counties[city] = set()
    city_to_counties[city].add(county)

In [23]:
unambiguous_cities = [pair for pair in city_to_counties.items() if len(pair[1]) == 1]

In [24]:
len(unambiguous_cities)

525

In [25]:
unambiguous_cities = {pair[0]: list(pair[1])[0] for pair in city_to_counties.items() if len(pair[1]) == 1}

In [26]:
for row in rows_with_no_county:
    if row["City"] in unambiguous_cities:
        row["County"] = unambiguous_cities[row["City"]]

In [27]:
rows_with_no_county[:2]

[{'Notice Date': '06/22/2015',
  'Effective Date': '03/25/2016',
  'Received Date': '07/01/2015',
  'Company': 'Maxim Integrated Product',
  'City': 'San Jose',
  'County': 'Santa Clara County',
  'Employees': 150.0,
  'Layoff/Closure': 'Closure Permanent',
  'County Orig': nan,
  'Year': 2015,
  'Layoff/Closure clean': 'closure permanent'},
 {'Notice Date': '06/30/2015',
  'Effective Date': '08/29/2015',
  'Received Date': '07/01/2015',
  'Company': 'McGraw-Hill Education',
  'City': 'Monterey',
  'County': 'Monterey County',
  'Employees': 137.0,
  'Layoff/Closure': 'Layoff Unknown at this time',
  'County Orig': nan,
  'Year': 2015,
  'Layoff/Closure clean': 'layoff type uncategorized'}]

In [28]:
ambiguous_cities = [pair for pair in city_to_counties.items() if len(pair[1]) > 1]

In [29]:
ambiguous_cities

[('Sacramento', {'Del Norte County', 'Sacramento County'}),
 ('San Francisco', {'San Francisco County', 'San Mateo County'}),
 ('Walnut Creek', {'Alameda County', 'Contra Costa County'}),
 ('Los Angeles', {'Alameda County', 'Los Angeles County'}),
 ('Hayward', {'Alameda County', 'Contra Costa County'}),
 ('City of Industry', {'Los Angeles County', 'Orange County'}),
 ('Roseville', {'Placer County', 'Sacramento County'}),
 ('Berkeley', {'Alameda County', 'Contra Costa County'}),
 ('Cerritos', {'Los Angeles County', 'Orange County'}),
 ('Vista', {'San Diego County', 'San Francisco County'}),
 ('Del Mar', {'Orange County', 'San Diego County'}),
 ('Indian Wells', {'Riverside County', 'San Benito County'}),
 ('Rancho', {'Los Angeles County', 'San Bernardino County'}),
 ('Los Gatos', {'Santa Clara County', 'Santa Cruz County'})]

### Manual research narrowed down which cities actually belonged to which counties
resulting in our updated `ambiguous_cities` dictionary. `Rancho` is the only city on the list where the two counties actually have cities with the same name. It will be left off the list. 

***The dictionary below may need further manual adjusting if this notebook is being run with updated data.***

In [30]:
dict_cities = {
    'Sacramento':'Sacramento County',
    'San Francisco':'San Francisco County',
    'Hayward':'Alameda County',
    'Indian Wells':'Riverside County',
    'Cerritos':'Los Angeles County',
    'Vista':'San Diego County',
    'Roseville':'Placer County',
    'City of Industry':'Los Angeles County',
    'Del Mar':'San Diego County',
    'Walnut Creek':'Contra Costa County',
    'Los Gatos':'Santa Clara County',
    'Los Angeles':'Los Angeles County',
    'Berkeley': 'Alameda County'
}

### Clean County Names
Below is our code to correct County Names based on City Names. 
We are taking the dictionary above, looping through the city and county names, which are the keys and values, replacing the incorrect County Name with the correct one for each city's record.

In [31]:
for city, county in dict_cities.items():
    cali_layoffs.loc[cali_layoffs['City'] == city, ['County']] = county

In [32]:
los_gatos_clean = cali_layoffs[cali_layoffs['City'] == 'Los Gatos']
los_gatos_clean.head(8)

Unnamed: 0,Notice Date,Effective Date,Received Date,Company,City,County,Employees,Layoff/Closure,County Orig,Year,Layoff/Closure clean
3060,04/26/2019,06/30/2019,04/26/2019,Covia Communities,Los Gatos,Santa Clara County,66.0,Closure Permanent,Santa Clara,2019,closure permanent
3173,06/03/2019,10/01/2019,06/04/2019,"Roche Molecular Systems, Inc.",Los Gatos,Santa Clara County,17.0,Closure Permanent,Santa Clara,2019,closure permanent
3327,09/01/2016,11/01/2016,09/02/2016,Valley Medical Oncology Consultants,Los Gatos,Santa Clara County,29.0,Layoff Permanent,,2016,layoff permanent
4043,10/01/2014,12/31/2014,10/16/2014,Prometheus Real Estate Group,Los Gatos,Santa Clara County,8.0,Layoff Permanent,,2014,layoff permanent
4406,03/27/2015,03/16/2015,03/27/2015,"Cryptic Studios, Inc.",Los Gatos,Santa Clara County,27.0,Layoff Permanent,,2015,layoff permanent
4658,06/23/2020,08/20/2020,06/29/2020,Boston Scientific Corporation,Los Gatos,Santa Clara County,60.0,Closure Permanent,Santa Clara County,2020,closure permanent
4750,06/09/2020,06/07/2020,06/24/2020,Bay Club Courtside,Los Gatos,Santa Clara County,294.0,Layoff Permanent,Santa Clara County,2020,layoff permanent
5656,03/30/2020,03/17/2020,05/20/2020,BACK OF THE HOUSE DBA SUPER DUPER,Los Gatos,Santa Clara County,30.0,Layoff Temporary,Santa Clara County,2020,layoff temporary


### Adding Population Data

The data has been grabbed from the census link above. These figures will be used to calculate per capita layoff numbers per county.

In [33]:
cali_population.rename(columns={'California':'Counties','39,512,223':'Population'}, inplace=True)
cali_population['County Names'] = cali_population['Counties'].str[1:]
cali_population['County Names'] = cali_population['County Names'].str.split(',').str[0]
cali_population.head(4)

Unnamed: 0,Counties,Population,County Names
0,".Alameda County, California",1671329,Alameda County
1,".Alpine County, California",1129,Alpine County
2,".Amador County, California",39752,Amador County
3,".Butte County, California",219186,Butte County


In [34]:
cali_merge = pd.merge(cali_layoffs, cali_population, how="left", left_on="County", right_on="County Names")
cali_merge.drop(columns='Counties', inplace=True)
cali_merge['City 2'] = cali_merge['City'].str.lower()
cali_merge.head()

Unnamed: 0,Notice Date,Effective Date,Received Date,Company,City,County,Employees,Layoff/Closure,County Orig,Year,Layoff/Closure clean,Population,County Names,City 2
0,06/09/2020,06/07/2020,07/01/2020,Bay Club Redondo Beach,Redondo Beach,Los Angeles County,102.0,Layoff Permanent,Los Angeles County,2020,layoff permanent,10039107.0,Los Angeles County,redondo beach
1,06/09/2020,06/07/2020,07/01/2020,Bay Club Rolling Hills,Rolling Hills Estates,Los Angeles County,64.0,Layoff Permanent,Los Angeles County,2020,layoff permanent,10039107.0,Los Angeles County,rolling hills estates
2,06/09/2020,06/07/2020,07/01/2020,Bay Club Santa Monica,Santa Monica,Los Angeles County,82.0,Layoff Permanent,Los Angeles County,2020,layoff permanent,10039107.0,Los Angeles County,santa monica
3,06/19/2020,08/21/2020,07/01/2020,"Weber Metals, Inc",Paramount,Los Angeles County,169.0,Layoff Permanent,Los Angeles County,2020,layoff permanent,10039107.0,Los Angeles County,paramount
4,06/09/2020,06/07/2020,07/01/2020,StoneTree Golf Club,Novato,Marin County,32.0,Layoff Permanent,Marin County,2020,layoff permanent,258826.0,Marin County,novato


### Extracting Only Necessary Records

Now that the data has been standardized, we will be selecting the records that have a notice date from the year 2020 or 2019. This will allow for comparisons between a pandemic and non-pandemic year in terms of layoff notices. 

In [35]:
years = ['2020', '2019']

cali_2020_2019 = cali_merge[cali_merge["Year"].isin(years)]
cali_2020_2019.drop(columns='County Names', inplace=True)
cali_2020_2019.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,Notice Date,Effective Date,Received Date,Company,City,County,Employees,Layoff/Closure,County Orig,Year,Layoff/Closure clean,Population,City 2
0,06/09/2020,06/07/2020,07/01/2020,Bay Club Redondo Beach,Redondo Beach,Los Angeles County,102.0,Layoff Permanent,Los Angeles County,2020,layoff permanent,10039107.0,redondo beach
1,06/09/2020,06/07/2020,07/01/2020,Bay Club Rolling Hills,Rolling Hills Estates,Los Angeles County,64.0,Layoff Permanent,Los Angeles County,2020,layoff permanent,10039107.0,rolling hills estates
2,06/09/2020,06/07/2020,07/01/2020,Bay Club Santa Monica,Santa Monica,Los Angeles County,82.0,Layoff Permanent,Los Angeles County,2020,layoff permanent,10039107.0,santa monica
3,06/19/2020,08/21/2020,07/01/2020,"Weber Metals, Inc",Paramount,Los Angeles County,169.0,Layoff Permanent,Los Angeles County,2020,layoff permanent,10039107.0,paramount
4,06/09/2020,06/07/2020,07/01/2020,StoneTree Golf Club,Novato,Marin County,32.0,Layoff Permanent,Marin County,2020,layoff permanent,258826.0,novato


In [36]:
len(cali_2020_2019)

6714

In [37]:
cali_2020_2019.to_csv('../data/harmonizing/clean_warn_data.csv', index=False)