In [282]:
import pandas as pd
import numpy as np
import re
from datetime import date
import matplotlib.pyplot as plt

In [283]:
# read in the scraped data file
df = pd.read_csv("glassdoor_jobs.csv", names = ['index', 'job_title', 'salary', 'job_description',  'company_rating', 'company_name', 'location', 'headquarters', 'company_size', 'founded', 'ownership_type', 'industry', 'sector', 'revenue', 'competitors' ])

# drop entries with no salary data
df = df[df['salary']!= "-1"]

# drop the first row and first column from the dataset
df = df.iloc[1:, 1:]

df.head()


Unnamed: 0,job_title,salary,job_description,company_rating,company_name,location,headquarters,company_size,founded,ownership_type,industry,sector,revenue,competitors
1,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research\n3.8,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,Company - Private,Aerospace & Defense,Aerospace & Defense,$50 to $100 million (USD),-1
2,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System\n3.4,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,Other Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD),-1
3,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4\n4.8,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,Company - Private,Security Services,Business Services,$100 to $500 million (USD),-1
4,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL\n3.8,"Richland, WA","Richland, WA",1001 to 5000 employees,1965,Government,Energy,"Oil, Gas, Energy & Utilities",$500 million to $1 billion (USD),"Oak Ridge National Laboratory, National Renewa..."
5,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee"


In [284]:
# view the data type of each column of the dataframe
df.dtypes

job_title          object
salary             object
job_description    object
company_rating     object
company_name       object
location           object
headquarters       object
company_size       object
founded            object
ownership_type     object
industry           object
sector             object
revenue            object
competitors        object
dtype: object

In [285]:
df = df.replace(to_replace = -1, value = np.nan)
df.isnull().sum()

job_title          0
salary             0
job_description    0
company_rating     0
company_name       0
location           0
headquarters       0
company_size       0
founded            0
ownership_type     0
industry           0
sector             0
revenue            0
competitors        0
dtype: int64

In [286]:
df['job_title'].value_counts()

Data Scientist                  131
Data Engineer                    53
Senior Data Scientist            34
Data Analyst                     15
Senior Data Engineer             14
                               ... 
Healthcare Data Scientist         1
R&D Sr Data Scientist             1
Associate Scientist               1
Senior Quantitative Analyst       1
Scientist 2, QC Viral Vector      1
Name: job_title, Length: 264, dtype: int64

In [287]:
df['city'] = df['location'].apply(lambda x: x.split(',')[0])
cities_5_jobs = df.city.value_counts()[0:35].keys()
df['city'] = df['city'].apply(lambda x: x if x in cities_5_jobs else 'other')

consolidated_city_dict = {'South San Francisco': 'San Francisco', 
                          'Mountain View': 'Bay Area', 'Palo Alto': 'Bay Area', 'Washington': 'DC',
                          'Cambridge': 'Boston', 'San Jose': 'Bay Area'}

df['city'] = df['city'].replace(consolidated_city_dict)
df['city'].value_counts()

other             309
Boston             71
San Francisco      57
New York           55
Chicago            32
Bay Area           28
Pittsburgh         12
Rockville          11
DC                 11
Winston-Salem      10
Springfield        10
Herndon            10
Richland           10
Indianapolis        9
San Diego           9
Austin              8
Rochester           7
Charlotte           6
Phoenix             6
Gaithersburg        6
Chantilly           6
Salt Lake City      6
Dallas              6
Huntsville          6
Marlborough         6
Nashville           5
Cincinnati          5
Seattle             5
Denver              5
Milwaukee           5
Worcester           5
Philadelphia        5
Name: city, dtype: int64

In [288]:

def role_abstractor(job_title):
    if 'data scientist' in job_title.lower():
        return 'data scientist'
    elif 'data engineer' in job_title.lower():
        return 'data engineer'
    elif 'analyst' in job_title.lower() or \
        'business intelligence' in job_title.lower() or \
        'bi' in job_title.lower():
        return 'data analyst'
    elif 'research' in job_title.lower() or 'R&D' in job_title.lower():
        return 'research scientist'
    elif 'machine learning' in job_title.lower():
        return 'machine learning engineer'
    elif 'manager' in job_title.lower():
        return 'manager'
    elif 'director' in job_title.lower():
        return 'director'
    elif 'product' in job_title.lower():
        return 'product analyst'
    elif 'analytics' in job_title.lower():
        return 'data analyst'
    elif 'software' in job_title.lower():
        return 'software engineer'
    else:
        return 'other'

df['role'] = df['job_title'].apply(role_abstractor)
df['role'].value_counts()

data scientist               279
data analyst                 143
other                        121
data engineer                119
research scientist            22
manager                       20
machine learning engineer     15
director                      13
product analyst                6
software engineer              4
Name: role, dtype: int64

In [289]:
low_sr = ['associate', 'jr', 'I', 'entry', 'junior']
med_sr = ['senior', 'II', 'III', 'lead', 'head', 'sr']
high_sr = ['staff', 'principal', 'director', 'chief']

def find_seniority(job_title):
    title_text = job_title.lower()
    if any(ele in title_text for ele in high_sr):
        return 3
    elif any(ele in title_text for ele in med_sr):
        return 2
    elif any(ele in title_text for ele in low_sr):
        return 1
    else:
        return 2
    

df['seniority'] = df['job_title'].apply(find_seniority)
df['seniority'].value_counts()
    

2    654
3     63
1     25
Name: seniority, dtype: int64

In [290]:
df['company_rating'] = df['company_rating'].apply(float)


In [291]:
def find_avg_salary(string_salary):
    char_list = ['$', 'k', 'K', ' ', 'P', 'e', 'r', 'H', 'o', 'u', 'r']
    
    
    # if weird format "Employer Provided Salary:$x-$y" (943 and 936 hourly)
    if "Employer" in string_salary[0:8]:
        # if hourly in the salary string
        if 'hour' in string_salary.lower():
            str_list = string_salary.split(':')
            salary_range = str_list[1]
            salary_range=re.sub("|".join(char_list), "", salary_range)
            salary_range = salary_range.replace('$', '')
            salary_range_split = salary_range.split('-')
            salary_range_split[0] = (int(salary_range_split[0]) * 2080)/1000
            salary_range_split[1] = (int(salary_range_split[1]) * 2080)/1000
        else:
            str_list = string_salary.split(':')
            salary_range = str_list[1]
            salary_range=re.sub("|".join(char_list), "", salary_range)
            salary_range = salary_range.replace('$', '')
            salary_range_split = salary_range.split('-')
    
    # if default salary format "$x-$y (Glassdoor est.)" (916 and 919 per hour)
    else:
        # if hourly in the salary string
        if 'hour' in string_salary.lower():
            str_list = string_salary.split('Per')
            salary_range = str_list[0]
            salary_range=re.sub("|".join(char_list), "", salary_range)
            salary_range = salary_range.replace('$', '')
            salary_range_split = salary_range.split('-')
            salary_range_split[0] = (int(salary_range_split[0]) * 2080)/1000
            salary_range_split[1] = (int(salary_range_split[1]) * 2080)/1000
            
        else:
            
            str_list = string_salary.split('(')
            salary_range = str_list[0]
            salary_range=re.sub("|".join(char_list), "", salary_range)
            salary_range = salary_range.replace('$', '')
            salary_range_split = salary_range.split('-')

    avg_salary = (int(salary_range_split[0]) + int(salary_range_split[1])) * 0.5
    return avg_salary



In [292]:
df['avg_salary'] = df['salary'].apply(find_avg_salary)
df.head()

Unnamed: 0,job_title,salary,job_description,company_rating,company_name,location,headquarters,company_size,founded,ownership_type,industry,sector,revenue,competitors,city,role,seniority,avg_salary
1,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research\n3.8,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,Company - Private,Aerospace & Defense,Aerospace & Defense,$50 to $100 million (USD),-1,other,data scientist,2,72.0
2,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System\n3.4,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,Other Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD),-1,other,data scientist,2,87.5
3,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4\n4.8,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,Company - Private,Security Services,Business Services,$100 to $500 million (USD),-1,other,data scientist,2,85.0
4,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL\n3.8,"Richland, WA","Richland, WA",1001 to 5000 employees,1965,Government,Energy,"Oil, Gas, Energy & Utilities",$500 million to $1 billion (USD),"Oak Ridge National Laboratory, National Renewa...",Richland,data scientist,2,76.5
5,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee",New York,data scientist,2,114.5


In [293]:
def get_company_name(string_name):
    name_list = string_name.split('\n')
    return name_list[0]

def get_state (str):
    str_list = str.split(',')
    return str_list[1].strip(' ')

df['company_name'] = df['company_name'].apply(get_company_name)

df['state'] = df['location'].apply(get_state)
df.head()

Unnamed: 0,job_title,salary,job_description,company_rating,company_name,location,headquarters,company_size,founded,ownership_type,industry,sector,revenue,competitors,city,role,seniority,avg_salary,state
1,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,Company - Private,Aerospace & Defense,Aerospace & Defense,$50 to $100 million (USD),-1,other,data scientist,2,72.0,NM
2,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,Other Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD),-1,other,data scientist,2,87.5,MD
3,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,Company - Private,Security Services,Business Services,$100 to $500 million (USD),-1,other,data scientist,2,85.0,FL
4,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL,"Richland, WA","Richland, WA",1001 to 5000 employees,1965,Government,Energy,"Oil, Gas, Energy & Utilities",$500 million to $1 billion (USD),"Oak Ridge National Laboratory, National Renewa...",Richland,data scientist,2,76.5,WA
5,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee",New York,data scientist,2,114.5,NY


In [294]:
df['company_size'].value_counts()

def company_size_classification(text):
    if text in ['1 to 50 employees', '51 to 200 employees']:
        return 'small'
    elif text in ['201 to 500 employees', '501 to 1000 employees']:
        return 'medium'
    elif text in ['1001 to 5000 employees','5001 to 10000 employees', '10000+ employees']:
        return 'large'
    else:
        return 'N/A'
df['company_size'] = df['company_size'].apply(company_size_classification)


In [295]:
# impute missing values for company rating and company size
mean_rating = df['company_rating'].mean()
df['company_rating'] = df['company_rating'].replace(to_replace = np.nan, value = mean_rating)
mode_size = df['company_size'].mode()
df['company_size'] = df['company_size'].replace(to_replace = np.nan, value = mode_size)

df.head()

Unnamed: 0,job_title,salary,job_description,company_rating,company_name,location,headquarters,company_size,founded,ownership_type,industry,sector,revenue,competitors,city,role,seniority,avg_salary,state
1,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research,"Albuquerque, NM","Goleta, CA",medium,1973,Company - Private,Aerospace & Defense,Aerospace & Defense,$50 to $100 million (USD),-1,other,data scientist,2,72.0,NM
2,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System,"Linthicum, MD","Baltimore, MD",large,1984,Other Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD),-1,other,data scientist,2,87.5,MD
3,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4,"Clearwater, FL","Clearwater, FL",medium,2010,Company - Private,Security Services,Business Services,$100 to $500 million (USD),-1,other,data scientist,2,85.0,FL
4,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL,"Richland, WA","Richland, WA",large,1965,Government,Energy,"Oil, Gas, Energy & Utilities",$500 million to $1 billion (USD),"Oak Ridge National Laboratory, National Renewa...",Richland,data scientist,2,76.5,WA
5,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions,"New York, NY","New York, NY",small,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee",New York,data scientist,2,114.5,NY


In [296]:
df['sector'].value_counts()

def sector_aggregator(sector):
    if sector in [np.nan, 'Non-Profit', 'Transportation & Logistics',
                  'Real Estate', 'Travel & Tourism', 'Media', 'Telecommunications',
                  'Arts, Entertainment & Recreation', 'Customer Services', 'Construction',
                  'Repair & Maintenance', 'Mining & Metals', 'Agriculture & Forestry',
                  'Accounting & Legal']:
        return 'other'
    else:
        return sector
    
df['sector'] = df['sector'].apply(sector_aggregator)
df.head()

Unnamed: 0,job_title,salary,job_description,company_rating,company_name,location,headquarters,company_size,founded,ownership_type,industry,sector,revenue,competitors,city,role,seniority,avg_salary,state
1,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research,"Albuquerque, NM","Goleta, CA",medium,1973,Company - Private,Aerospace & Defense,Aerospace & Defense,$50 to $100 million (USD),-1,other,data scientist,2,72.0,NM
2,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System,"Linthicum, MD","Baltimore, MD",large,1984,Other Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD),-1,other,data scientist,2,87.5,MD
3,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4,"Clearwater, FL","Clearwater, FL",medium,2010,Company - Private,Security Services,Business Services,$100 to $500 million (USD),-1,other,data scientist,2,85.0,FL
4,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL,"Richland, WA","Richland, WA",large,1965,Government,Energy,"Oil, Gas, Energy & Utilities",$500 million to $1 billion (USD),"Oak Ridge National Laboratory, National Renewa...",Richland,data scientist,2,76.5,WA
5,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions,"New York, NY","New York, NY",small,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee",New York,data scientist,2,114.5,NY


In [297]:
(df['founded'].apply(int).isnull()).sum()

0

In [298]:
df['founded'] = df['founded'].apply(int)
mean_founded = df['founded'].mean()
df['founded'] = df['founded'].replace(to_replace = np.nan, value = mean_founded)
current_year = date.today().year
df['company_age'] = current_year - df['founded']
df['company_age'].head()



1    47
2    36
3    10
4    55
5    22
Name: company_age, dtype: int64

In [299]:

df_final = df[['role', 'seniority', 'avg_salary', 'state', 'company_rating', 'company_size', 'sector', 'company_age', 'city']]
df_final.head()

Unnamed: 0,role,seniority,avg_salary,state,company_rating,company_size,sector,company_age,city
1,data scientist,2,72.0,NM,3.8,medium,Aerospace & Defense,47,other
2,data scientist,2,87.5,MD,3.4,large,Health Care,36,other
3,data scientist,2,85.0,FL,4.8,medium,Business Services,10,other
4,data scientist,2,76.5,WA,3.8,large,"Oil, Gas, Energy & Utilities",55,Richland
5,data scientist,2,114.5,NY,2.9,small,Business Services,22,New York


In [300]:
df_final.to_csv("cleaned_data.csv", index = False)