# Data Cleaning

## Import Libraries and Job Data

In [56]:
import pandas as pd
import re

In [99]:
# Read the job data CSV from the working directory and preview the frame.
df = pd.read_csv(filepath_or_buffer='glassdoor_jobs.csv')
df

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
0,Applied Scientist,$127K - $203K (Employer est.),About the team\nZEXP is hiring an Applied Scie...,3.9,Zillow\n3.9,Remote,-1,5001 to 10000 Employees,2005,Company - Public,Real Estate,Real Estate,$1 to $5 billion (USD),-1
1,"Data Scientist, Marketing & Online (Remote)",$127K - $203K (Employer est.),Position Purpose:\nThe Data Scientist is respo...,3.8,The Home Depot\n3.8,"Atlanta, GA",-1,10000+ Employees,1978,Company - Public,Home Furniture & Housewares Stores,Retail & Wholesale,$10+ billion (USD),-1
2,Data Scientist I,$127K - $203K (Employer est.),"YOUR LIFE'S MISSION: POSSIBLE\nYou have goals,...",4.1,Navy Federal Credit Union\n4.1,"Vienna, VA",-1,10000+ Employees,1933,Self-employed,Banking & Lending,Financial Services,Unknown / Non-Applicable,-1
3,Data Science - Sanofi mRNA Virtual Career Fair...,$127K - $203K (Employer est.),We would like to invite you to Sanofi’s mRNA V...,3.9,Sanofi\n3.9,"Waltham, MA",-1,10000+ Employees,1973,Company - Public,Biotech & Pharmaceuticals,Pharmaceutical & Biotechnology,$10+ billion (USD),-1
4,Computer Vision / AI Data Scientist,$127K - $203K (Employer est.),About care.ai\ncare.ai is the leading provider...,-1.0,care.ai,Remote,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,"Principal Data Scientist, Online Marketing (Re...",$127K (Employer est.),The Home Depot is able to offer virtual employ...,3.8,The Home Depot\n3.8,"Atlanta, GA",-1,10000+ Employees,1978,Company - Public,Home Furniture & Housewares Stores,Retail & Wholesale,$10+ billion (USD),-1
996,"Data Scientist, Audience Data Science - Market...",$127K (Employer est.),"JOB SUMMARY: Design, develop and evaluate adva...",4.0,Universal Orlando\n4.0,"Orlando, FL",-1,10000+ Employees,1987,Subsidiary or Business Segment,Hotels & Resorts,Hotels & Travel Accommodation,$1 to $5 billion (USD),-1
997,AI/ML Health Data Scientist – Senior Consultant,$127K (Employer est.),Overview\n\nGuidehouse is a leading global pro...,3.7,Guidehouse\n3.7,"Washington Harbor, WA",-1,1001 to 5000 Employees,2018,Company - Private,Business Consulting,Management & Consulting,Unknown / Non-Applicable,-1
998,Lead Data Scientist - Forecasting (Full-Time R...,$127K (Employer est.),"Location: 7000 Target Pkwy N, Brooklyn Park, M...",3.6,Target\n3.6,"Brooklyn Park, MN",-1,10000+ Employees,1962,Company - Public,General Merchandise & Superstores,Retail & Wholesale,$10+ billion (USD),-1


In [None]:
# salary parsing
# Job title and seniority
# Fix state LA
# JD length
# hourly wage -> annual
# remove new line

## Rename Columns
- Lowercase column names
- Rename columns (spaces become underscores) so they are easier to analyze and understand

In [103]:
# Snapshot the current headers, then lowercase them and swap spaces for underscores.
col_name = df.columns.tolist()
df.columns = list(map(lambda header: header.lower().replace(' ', '_'), col_name))
df.columns.tolist()

['job_title',
 'salary_estimate',
 'job_description',
 'rating',
 'company_name',
 'location',
 'headquarters',
 'size',
 'founded',
 'type_of_ownership',
 'industry',
 'sector',
 'revenue',
 'competitors']

## Clean the Dataset

### Clean the *`salary_estimate`*
We will keep only the numbers and the range separator `-` (e.g. `$127K - $203K (Employer est.)` becomes `127-203`)

In [105]:
# Count how often each distinct salary_estimate value occurs.
df.loc[:, 'salary_estimate'].value_counts()

$118K - $189K (Glassdoor est.)    360
$127K (Employer est.)             340
$135K - $175K (Employer est.)      60
$127K - $203K (Employer est.)      30
$96K - $131K (Glassdoor est.)      30
$77K - $116K (Glassdoor est.)      30
$113K - $177K (Glassdoor est.)     30
$104K - $167K (Glassdoor est.)     30
$86K - $132K (Glassdoor est.)      30
$92K - $131K (Glassdoor est.)      30
$97K - $155K (Glassdoor est.)      30
Name: salary_estimate, dtype: int64

In [106]:
# clean salary (only keep numbers and -)
# Step 1: drop the trailing "(... est.)" source note when present. Removing it
# (instead of extracting what precedes it) means values without the note --
# e.g. ones already cleaned by an earlier run of this cell -- are kept rather
# than turned into NaN, so the cell is safe to re-run.
# Raw strings keep `\(` a regex escape instead of an invalid Python string escape.
salary = df['salary_estimate'].str.replace(r'\s*\(.*$', '', regex=True)
# Step 2: strip the currency sign, the thousands marker and spaces.
salary = salary.str.replace(r'[$K ]', '', regex=True)
df['salary_estimate'] = salary

# show results
df['salary_estimate'].value_counts()

### Clean the *`company_name`*
We will remove the newline and the rating that follows it

In [111]:
# Inspect the company_name column.
df.loc[:, 'company_name']

0                         Zillow\n3.9
1                 The Home Depot\n3.8
2      Navy Federal Credit Union\n4.1
3                         Sanofi\n3.9
4                             care.ai
                    ...              
995               The Home Depot\n3.8
996            Universal Orlando\n4.0
997                   Guidehouse\n3.7
998                       Target\n3.6
999                       Target\n3.6
Name: company_name, Length: 1000, dtype: object

In [112]:
# remove new line and rating if rating exists
# Keep only the text before the first newline. Unlike slicing off a fixed four
# characters, this does not depend on the width of the rating suffix, leaves
# names without a suffix untouched, and gives the same result when the cell is
# run again.
df['company_name'] = df['company_name'].str.split('\n').str[0]

# show results
df['company_name']

### Clean the *`location`*
We will only keep the state abbreviations

In [143]:
# Count how often each distinct location value occurs.
df.loc[:, 'location'].value_counts()

Remote                   176
Atlanta, GA              143
Hartford, CT              75
Washington Harbor, WA     58
Brooklyn Park, MN         53
                        ... 
Urbandale, IA              1
Tysons Corner, VA          1
Waltham, MA                1
Arizona                    1
Tennessee City, TN         1
Name: location, Length: 67, dtype: int64

In [145]:
# keep the abbreviations
# Split on a bare comma and strip the whitespace afterwards, so a value such as
# "City,ST" (no space after the comma) no longer raises IndexError. The piece
# after the last comma is kept; values without a comma (e.g. "Remote") pass
# through unchanged, and missing values stay missing instead of crashing.
df['location'] = df['location'].str.split(',').str[-1].str.strip()

# show results
df['location'].value_counts()

Remote            176
GA                145
CA                116
WA                 81
CT                 75
VA                 75
MN                 53
TX                 39
NY                 39
CO                 38
FL                 33
NJ                 31
IL                 31
MA                 16
DC                 14
AZ                  9
New York State      9
NV                  4
OH                  3
MO                  2
United States       2
MD                  2
DE                  2
IA                  1
TN                  1
SC                  1
NC                  1
Arizona             1
Name: location, dtype: int64

Fix errors:
- *`New York State`* &rarr; *`NY`*
- *`United States`* &rarr; *`Remote`*
- *`Arizona`* &rarr; *`AZ`*

In [148]:
# Mapping from the spelled-out location labels to the codes used by the other rows.
fix = dict([
    ('New York State', 'NY'),
    ('United States', 'Remote'),
    ('Arizona', 'AZ'),
])

df['location'] = df['location'].replace(to_replace=fix)

# show results
df['location'].value_counts()

Remote    178
GA        145
CA        116
WA         81
CT         75
VA         75
MN         53
NY         48
TX         39
CO         38
FL         33
IL         31
NJ         31
MA         16
DC         14
AZ         10
NV          4
OH          3
DE          2
MD          2
MO          2
TN          1
SC          1
IA          1
NC          1
Name: location, dtype: int64

### Clean *`headquarters`*
Drop the whole column because it contains no information (every value is `-1`)

In [150]:
# Count how often each distinct headquarters value occurs.
df.loc[:, 'headquarters'].value_counts()

-1    1000
Name: headquarters, dtype: int64

In [154]:
# Drop the headquarters column. Rebinding `df` instead of using inplace=True,
# together with errors='ignore', lets this cell be re-run after the column is
# already gone without raising KeyError.
df = df.drop(columns=['headquarters'], errors='ignore')

# show results
list(df.columns)

['job_title',
 'salary_estimate',
 'job_description',
 'rating',
 'company_name',
 'location',
 'size',
 'founded',
 'type_of_ownership',
 'industry',
 'sector',
 'revenue',
 'competitors']