# Data Cleaning

## Import Libraries and Job Data

In [1]:
import pandas as pd
import re

In [2]:
# Read the raw job postings from the CSV in the working directory;
# the bare `df` on the last line renders a preview of the frame.
df = pd.read_csv('glassdoor_jobs.csv')
df

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
0,Applied Scientist,$127K - $203K (Employer est.),About the team\nZEXP is hiring an Applied Scie...,3.9,Zillow\n3.9,Remote,-1,5001 to 10000 Employees,2005,Company - Public,Real Estate,Real Estate,$1 to $5 billion (USD),-1
1,"Data Scientist, Marketing & Online (Remote)",$127K - $203K (Employer est.),Position Purpose:\nThe Data Scientist is respo...,3.8,The Home Depot\n3.8,"Atlanta, GA",-1,10000+ Employees,1978,Company - Public,Home Furniture & Housewares Stores,Retail & Wholesale,$10+ billion (USD),-1
2,Data Scientist I,$127K - $203K (Employer est.),"YOUR LIFE'S MISSION: POSSIBLE\nYou have goals,...",4.1,Navy Federal Credit Union\n4.1,"Vienna, VA",-1,10000+ Employees,1933,Self-employed,Banking & Lending,Financial Services,Unknown / Non-Applicable,-1
3,Data Science - Sanofi mRNA Virtual Career Fair...,$127K - $203K (Employer est.),We would like to invite you to Sanofi’s mRNA V...,3.9,Sanofi\n3.9,"Waltham, MA",-1,10000+ Employees,1973,Company - Public,Biotech & Pharmaceuticals,Pharmaceutical & Biotechnology,$10+ billion (USD),-1
4,Computer Vision / AI Data Scientist,$127K - $203K (Employer est.),About care.ai\ncare.ai is the leading provider...,-1.0,care.ai,Remote,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,"Principal Data Scientist, Online Marketing (Re...",$127K (Employer est.),The Home Depot is able to offer virtual employ...,3.8,The Home Depot\n3.8,"Atlanta, GA",-1,10000+ Employees,1978,Company - Public,Home Furniture & Housewares Stores,Retail & Wholesale,$10+ billion (USD),-1
996,"Data Scientist, Audience Data Science - Market...",$127K (Employer est.),"JOB SUMMARY: Design, develop and evaluate adva...",4.0,Universal Orlando\n4.0,"Orlando, FL",-1,10000+ Employees,1987,Subsidiary or Business Segment,Hotels & Resorts,Hotels & Travel Accommodation,$1 to $5 billion (USD),-1
997,AI/ML Health Data Scientist – Senior Consultant,$127K (Employer est.),Overview\n\nGuidehouse is a leading global pro...,3.7,Guidehouse\n3.7,"Washington Harbor, WA",-1,1001 to 5000 Employees,2018,Company - Private,Business Consulting,Management & Consulting,Unknown / Non-Applicable,-1
998,Lead Data Scientist - Forecasting (Full-Time R...,$127K (Employer est.),"Location: 7000 Target Pkwy N, Brooklyn Park, M...",3.6,Target\n3.6,"Brooklyn Park, MN",-1,10000+ Employees,1962,Company - Public,General Merchandise & Superstores,Retail & Wholesale,$10+ billion (USD),-1


## Rename Columns
- Lowercase column names
- Rename columns so they are easier to analyze and understand

In [3]:
# Normalise the headers to snake_case: lowercase, spaces turned into underscores.
col_name = df.columns.tolist()
df.columns = pd.Index(col_name).str.lower().str.replace(' ', '_', regex=False)
list(df.columns)

['job_title',
 'salary_estimate',
 'job_description',
 'rating',
 'company_name',
 'location',
 'headquarters',
 'size',
 'founded',
 'type_of_ownership',
 'industry',
 'sector',
 'revenue',
 'competitors']

## Clean the Dataset

### *`job_title`*
- From *`job_title`*, create a new feature *`seniority`* with 3 values : `na`, `junior`, and `senior`
- Simplify *`job_title`* to 6 values : `na`, `data analyst`, `data scientist`, `data engineer`, `machine learning engineer`, and `manager`

In [4]:
def seniority(title):
    """Classify a job title as 'senior', 'junior' or 'na'.

    Parameters
    ----------
    title : str
        Raw job title; matching is case-insensitive.

    Returns
    -------
    str
        'senior' if the title contains the whole word "sr" (with or without
        a trailing dot) or any of "senior", "lead", "manager", "principal",
        "director"; otherwise 'junior' if it contains the whole word "jr" or
        "junior"; otherwise 'na'. Senior keywords take precedence.
    """
    title = title.lower()
    senior_keywords = ('senior', 'lead', 'manager', 'principal', 'director')
    # The two-letter abbreviations are matched as whole words only: a plain
    # substring test would also fire inside unrelated words (e.g. "israel").
    if re.search(r'\bsr\b', title) or any(word in title for word in senior_keywords):
        return 'senior'
    if re.search(r'\bjr\b', title) or 'junior' in title:
        return 'junior'
    return 'na'
    
   
def title_simplify(title):
    """Collapse a raw job title into one of six coarse role labels.

    The rules are tried in order and the first one with a keyword that occurs
    in the lower-cased title wins; titles matching no rule map to 'na'.
    """
    lowered = title.lower()
    ordered_rules = (
        (('analyst',), 'data analyst'),
        (('data scientist',), 'data scientist'),
        (('data engineer',), 'data engineer'),
        (('machine learning',), 'machine learning engineer'),
        (('lead', 'manager', 'principal', 'director'), 'manager'),
    )
    for keywords, label in ordered_rules:
        if any(keyword in lowered for keyword in keywords):
            return label
    return 'na'


# Derive both features from the raw titles before job_title is overwritten:
# seniority takes the values 'na', 'junior' and 'senior'
seniority_levels = df['job_title'].apply(seniority)
# job_title is reduced to 'na', 'data analyst', 'data scientist', 'data engineer',
# 'machine learning engineer' and 'manager'
simplified_titles = df['job_title'].apply(title_simplify)

df['seniority'] = seniority_levels
df['job_title'] = simplified_titles

# check results
print(df['job_title'].value_counts(), '\n', df['seniority'].value_counts(), sep='\n')

data scientist               743
machine learning engineer     97
na                            95
data engineer                 32
data analyst                  29
manager                        4
Name: job_title, dtype: int64


na        595
senior    376
junior     29
Name: seniority, dtype: int64


### *`salary_estimate`*
- Only keep the format `xxx-xxx`
- From *`salary_estimate`*, create new features *`salary_min`*, *`salary_max`*, and *`salary_avg`* (and turn into int type)

In [5]:
# List the distinct salary_estimate strings and how often each occurs.
df['salary_estimate'].value_counts()

$118K - $189K (Glassdoor est.)    360
$127K (Employer est.)             340
$135K - $175K (Employer est.)      60
$127K - $203K (Employer est.)      30
$96K - $131K (Glassdoor est.)      30
$77K - $116K (Glassdoor est.)      30
$113K - $177K (Glassdoor est.)     30
$104K - $167K (Glassdoor est.)     30
$86K - $132K (Glassdoor est.)      30
$92K - $131K (Glassdoor est.)      30
$97K - $155K (Glassdoor est.)      30
Name: salary_estimate, dtype: int64

In [6]:
# clean salary: keep only the figures and the dash,
# e.g. '$127K - $203K (Employer est.)' -> '127-203'.
# Raw strings are used so that '\(' is not an invalid string escape.
salary = df['salary_estimate'].str.extract(r'(.*) \(')[0]
salary = salary.str.replace(r'[$K ]+', '', regex=True)
# a lone figure becomes a degenerate range (127 -> 127-127), whatever the
# figure is, so every value can be split on '-' afterwards
salary = salary.str.replace(r'^(\d+)$', r'\1-\1', regex=True)

df['salary_estimate'] = salary

# check results
df['salary_estimate'].value_counts()

118-189    360
127-127    340
135-175     60
127-203     30
96-131      30
77-116      30
113-177     30
104-167     30
86-132      30
92-131      30
97-155      30
Name: salary_estimate, dtype: int64

In [7]:
# split 'min-max' into two integer columns held in a separate frame
pattern = '(?P<salary_min>.*)-(?P<salary_max>.*)'
new_salary = df['salary_estimate'].str.extract(pattern).astype('int')
# append the new columns to df
df = pd.concat([df, new_salary], axis=1)
# salary_avg is the midpoint of the range
df['salary_avg'] = df[['salary_min', 'salary_max']].sum(axis=1) / 2
# salary_estimate is fully represented by the new columns, so remove it
df = df.drop(columns=['salary_estimate'])

# check results
df

Unnamed: 0,job_title,job_description,rating,company_name,location,headquarters,size,founded,type_of_ownership,industry,sector,revenue,competitors,seniority,salary_min,salary_max,salary_avg
0,na,About the team\nZEXP is hiring an Applied Scie...,3.9,Zillow\n3.9,Remote,-1,5001 to 10000 Employees,2005,Company - Public,Real Estate,Real Estate,$1 to $5 billion (USD),-1,na,127,203,165.0
1,data scientist,Position Purpose:\nThe Data Scientist is respo...,3.8,The Home Depot\n3.8,"Atlanta, GA",-1,10000+ Employees,1978,Company - Public,Home Furniture & Housewares Stores,Retail & Wholesale,$10+ billion (USD),-1,na,127,203,165.0
2,data scientist,"YOUR LIFE'S MISSION: POSSIBLE\nYou have goals,...",4.1,Navy Federal Credit Union\n4.1,"Vienna, VA",-1,10000+ Employees,1933,Self-employed,Banking & Lending,Financial Services,Unknown / Non-Applicable,-1,na,127,203,165.0
3,na,We would like to invite you to Sanofi’s mRNA V...,3.9,Sanofi\n3.9,"Waltham, MA",-1,10000+ Employees,1973,Company - Public,Biotech & Pharmaceuticals,Pharmaceutical & Biotechnology,$10+ billion (USD),-1,na,127,203,165.0
4,data scientist,About care.ai\ncare.ai is the leading provider...,-1.0,care.ai,Remote,-1,-1,-1,-1,-1,-1,-1,-1,na,127,203,165.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,data scientist,The Home Depot is able to offer virtual employ...,3.8,The Home Depot\n3.8,"Atlanta, GA",-1,10000+ Employees,1978,Company - Public,Home Furniture & Housewares Stores,Retail & Wholesale,$10+ billion (USD),-1,senior,127,127,127.0
996,data scientist,"JOB SUMMARY: Design, develop and evaluate adva...",4.0,Universal Orlando\n4.0,"Orlando, FL",-1,10000+ Employees,1987,Subsidiary or Business Segment,Hotels & Resorts,Hotels & Travel Accommodation,$1 to $5 billion (USD),-1,na,127,127,127.0
997,data scientist,Overview\n\nGuidehouse is a leading global pro...,3.7,Guidehouse\n3.7,"Washington Harbor, WA",-1,1001 to 5000 Employees,2018,Company - Private,Business Consulting,Management & Consulting,Unknown / Non-Applicable,-1,senior,127,127,127.0
998,data scientist,"Location: 7000 Target Pkwy N, Brooklyn Park, M...",3.6,Target\n3.6,"Brooklyn Park, MN",-1,10000+ Employees,1962,Company - Public,General Merchandise & Superstores,Retail & Wholesale,$10+ billion (USD),-1,senior,127,127,127.0


### *`job_description`*
- From *`job_description`*, create new feature *`description_len`* to count the length of the job description

In [8]:
# description_len holds the number of characters in each job description
df['description_len'] = [len(text) for text in df['job_description']]

# check results
df['description_len']

0      1727
1      1508
2       550
3       873
4       580
       ... 
995    1787
996     967
997     910
998    1138
999    1011
Name: description_len, Length: 1000, dtype: int64

### *`skills`* (fix glassdoor scraper later)

In [9]:
#df['is_excel'] = df['skills'].apply(lambda x: 1 if re.search('\Wexcel\W', x.lower()) else 0)
#df['is_sql'] = df['skills'].apply(lambda x: 1 if 'sql' in x.lower() else 0)
#df['is_python'] = df['skills'].apply(lambda x: 1 if 'python' in x.lower() else 0)
#df['is_r'] = df['skills'].apply(lambda x: 1 if re.search('\Wr\W', x.lower()) else 0)
#df['is_tableau'] = df['skills'].apply(lambda x: 1 if 'tableau' in x.lower() else 0)
#df['is_spark'] = df['skills'].apply(lambda x: 1 if 'spark' in x.lower() else 0)
#df['is_aws'] = df['skills'].apply(lambda x: 1 if 'aws' in x.lower() else 0)

### *`company_name`*
- Remove the new line and rating

In [10]:
# Preview the current company_name values.
df['company_name']

0                         Zillow\n3.9
1                 The Home Depot\n3.8
2      Navy Federal Credit Union\n4.1
3                         Sanofi\n3.9
4                             care.ai
                    ...              
995               The Home Depot\n3.8
996            Universal Orlando\n4.0
997                   Guidehouse\n3.7
998                       Target\n3.6
999                       Target\n3.6
Name: company_name, Length: 1000, dtype: object

In [11]:
# Strip a trailing "\n<rating>" suffix from the company name when present.
# Anchoring on the newline (instead of blindly slicing off the last four
# characters) leaves names without a suffix untouched and makes the cell
# safe to re-run.
df['company_name'] = df['company_name'].str.replace(r'\n-?\d+(?:\.\d+)?$', '', regex=True)

# check results
df['company_name']

0                         Zillow
1                 The Home Depot
2      Navy Federal Credit Union
3                         Sanofi
4                        care.ai
                 ...            
995               The Home Depot
996            Universal Orlando
997                   Guidehouse
998                       Target
999                       Target
Name: company_name, Length: 1000, dtype: object

### *`location`*
- Keep `Remote` and the 2-letter state abbreviations
- Fix values which are not mentioned above

In [12]:
# Count how often each raw location string occurs.
df['location'].value_counts()

Remote                   176
Atlanta, GA              143
Hartford, CT              75
Washington Harbor, WA     58
Brooklyn Park, MN         53
                        ... 
Urbandale, IA              1
Tysons Corner, VA          1
Waltham, MA                1
Arizona                    1
Tennessee City, TN         1
Name: location, Length: 67, dtype: int64

In [13]:
# Keep only the part after the last comma ("Atlanta, GA" -> "GA"); values
# without a comma ("Remote") pass through unchanged. Splitting on the bare
# comma and stripping whitespace also copes with "City,ST" and with
# multi-part values such as "City, County, ST".
df['location'] = df['location'].str.rsplit(',', n=1).str[-1].str.strip()

# check results
df['location'].value_counts()

Remote            176
GA                145
CA                116
WA                 81
CT                 75
VA                 75
MN                 53
TX                 39
NY                 39
CO                 38
FL                 33
NJ                 31
IL                 31
MA                 16
DC                 14
AZ                  9
New York State      9
NV                  4
OH                  3
MO                  2
United States       2
MD                  2
DE                  2
IA                  1
TN                  1
SC                  1
NC                  1
Arizona             1
Name: location, dtype: int64

Fix errors:
- `New York State` &rarr; `NY`
- *`United States`* &rarr; *`Remote`*
- *`Arizona`* &rarr; *`AZ`*

In [14]:
# Map the few spelled-out labels onto the codes used by the other rows;
# any value that is not a key of `fix` is kept as it is.
fix = {
    'New York State': 'NY',
    'United States': 'Remote',
    'Arizona': 'AZ',
}

df['location'] = df['location'].map(lambda place: fix.get(place, place))

# check results
df['location'].value_counts()

Remote    178
GA        145
CA        116
WA         81
CT         75
VA         75
MN         53
NY         48
TX         39
CO         38
FL         33
IL         31
NJ         31
MA         16
DC         14
AZ         10
NV          4
OH          3
DE          2
MD          2
MO          2
TN          1
SC          1
IA          1
NC          1
Name: location, dtype: int64

### *`headquarters`*
 - Drop the whole column because there is no information

In [15]:
# Print the value distribution first, then remove the column.
print(df['headquarters'].value_counts())
df = df.drop(columns=['headquarters'])

# check results
'headquarters' in df.columns

-1    1000
Name: headquarters, dtype: int64


False

### *`size`*
- `-1` &rarr; `Unknown`

In [16]:
# Count how often each company-size label occurs.
df['size'].value_counts()

10000+ Employees           547
1001 to 5000 Employees      97
51 to 200 Employees         91
-1                          85
5001 to 10000 Employees     74
201 to 500 Employees        45
1 to 50 Employees           43
Unknown                     12
501 to 1000 Employees        6
Name: size, dtype: int64

In [17]:
# Replace the '-1' placeholder with 'Unknown'. Series.replace compares whole
# cell values, so a label that merely contains the characters "-1" is never
# altered (a substring .str.replace would rewrite it).
df['size'] = df['size'].replace('-1', 'Unknown')

# check results
df['size'].value_counts()

10000+ Employees           547
Unknown                     97
1001 to 5000 Employees      97
51 to 200 Employees         91
5001 to 10000 Employees     74
201 to 500 Employees        45
1 to 50 Employees           43
501 to 1000 Employees        6
Name: size, dtype: int64

### *`founded`*
- Create *`age`* as `2022 - founded` (keep `-1` when the founding year is unknown), then drop *`founded`*

In [18]:
# Preview the founded column.
df['founded']

0      2005
1      1978
2      1933
3      1973
4        -1
       ... 
995    1978
996    1987
997    2018
998    1962
999    1962
Name: founded, Length: 1000, dtype: int64

In [19]:
# age = 2022 - founding year; rows whose founded value is the -1 placeholder
# keep -1 instead of getting a meaningless age
df['age'] = df['founded'].where(df['founded'] == -1, 2022 - df['founded'])
df = df.drop(columns=['founded'])
# check results: the column dropped in this cell is 'founded'
print('founded' in df.columns)
print('\n')
df['age']

False




0      17
1      44
2      89
3      49
4      -1
       ..
995    44
996    35
997     4
998    60
999    60
Name: age, Length: 1000, dtype: int64

### *`revenue`*
- `-1` &rarr; `Unknown / Non-Applicable`

In [20]:
# Count how often each revenue label occurs.
df['revenue'].value_counts()

$10+ billion (USD)                  426
Unknown / Non-Applicable            232
$1 to $5 billion (USD)               87
-1                                   85
$25 to $100 million (USD)            53
$5 to $10 billion (USD)              33
$5 to $25 million (USD)              31
$100 to $500 million (USD)           25
$1 to $5 million (USD)               18
$500 million to $1 billion (USD)      9
Less than $1 million (USD)            1
Name: revenue, dtype: int64

In [21]:
# Fold the '-1' placeholder into the existing 'Unknown / Non-Applicable'
# label. Series.replace compares whole cell values, so a label that merely
# contains the characters "-1" is never altered.
df['revenue'] = df['revenue'].replace('-1', 'Unknown / Non-Applicable')

# check results
df['revenue'].value_counts()

$10+ billion (USD)                  426
Unknown / Non-Applicable            317
$1 to $5 billion (USD)               87
$25 to $100 million (USD)            53
$5 to $10 billion (USD)              33
$5 to $25 million (USD)              31
$100 to $500 million (USD)           25
$1 to $5 million (USD)               18
$500 million to $1 billion (USD)      9
Less than $1 million (USD)            1
Name: revenue, dtype: int64

### *`competitors`*
- Drop the whole column because there is no information

In [22]:
# Print the value distribution first, then remove the column.
print(df['competitors'].value_counts())
df = df.drop(columns=['competitors'])

# check results
'competitors' in df.columns

-1    1000
Name: competitors, dtype: int64


False

In [None]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
df.to_csv('glassdoor_jobs_cleaned.csv', index=False)