In [58]:
import pandas as pd
import numpy as np
import re

In [24]:
# read in the scraped data file
df = pd.read_csv("glassdoor_jobs.csv", names = ['index', 'job_title', 'salary', 'job_description',  'company_rating', 'company_name', 'location', 'headquarters', 'company_size', 'founded', 'ownership_type', 'industry', 'sector', 'revenue', 'competitors' ])

# drop entries with no salary data
df = df[df['salary']!= "-1"]

# drop the first row and first column from the dataset
df = df.iloc[1:, 1:]

df.head()


Unnamed: 0,job_title,salary,job_description,company_rating,company_name,location,headquarters,company_size,founded,ownership_type,industry,sector,revenue,competitors
1,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research\n3.8,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,Company - Private,Aerospace & Defense,Aerospace & Defense,$50 to $100 million (USD),-1
2,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System\n3.4,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,Other Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD),-1
3,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4\n4.8,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,Company - Private,Security Services,Business Services,$100 to $500 million (USD),-1
4,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL\n3.8,"Richland, WA","Richland, WA",1001 to 5000 employees,1965,Government,Energy,"Oil, Gas, Energy & Utilities",$500 million to $1 billion (USD),"Oak Ridge National Laboratory, National Renewa..."
5,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee"


In [25]:
# view the data type of each column of the dataframe
df.dtypes

job_title          object
salary             object
job_description    object
company_rating     object
company_name       object
location           object
headquarters       object
company_size       object
founded            object
ownership_type     object
industry           object
sector             object
revenue            object
competitors        object
dtype: object

In [28]:
df['job_title'].value_counts()

Data Scientist                            131
Data Engineer                              53
Senior Data Scientist                      34
Data Analyst                               15
Senior Data Engineer                       14
                                         ... 
RESEARCH SCIENTIST - BIOLOGICAL SAFETY      1
Data Analyst / Scientist                    1
Senior Data Scientist - Algorithms          1
Data Scientist II                           1
Data Scientist - Bioinformatics             1
Name: job_title, Length: 264, dtype: int64

In [45]:

def role_abstractor(job_title):
    if 'data scientist' in job_title.lower():
        return 'data scientist'
    elif 'data engineer' in job_title.lower():
        return 'data engineer'
    elif 'analyst' in job_title.lower() or \
        'business intelligence' in job_title.lower() or \
        'bi' in job_title.lower():
        return 'data analyst'
    elif 'research' in job_title.lower() or 'R&D' in job_title.lower():
        return 'research scientist'
    elif 'machine learning' in job_title.lower():
        return 'machine learning engineer'
    elif 'manager' in job_title.lower():
        return 'manager'
    elif 'director' in job_title.lower():
        return 'director'
    elif 'product' in job_title.lower():
        return 'product analyst'
    elif 'analytics' in job_title.lower():
        return 'data analyst'
    elif 'software' in job_title.lower():
        return 'software engineer'
    else:
        return 'other'

df['role'] = df['job_title'].apply(role_abstractor)
df['role'].value_counts()

data scientist               279
data analyst                 143
other                        121
data engineer                119
research scientist            22
manager                       20
machine learning engineer     15
director                      13
product analyst                6
software engineer              4
Name: role, dtype: int64

In [46]:
low_sr = ['associate', 'jr', 'I', 'entry', 'junior']
med_sr = ['senior', 'II', 'III', 'lead', 'head', 'sr']
high_sr = ['staff', 'principal']

def find_seniority(job_title):
    title_text = job_title.lower()
    if any(ele in job_title for ele in high_sr):
        return 3
    elif any(ele in job_title for ele in med_sr):
        return 2
    elif any(ele in job_title for ele in low_sr):
        return 1
    else:
        return 2
    

df['seniority'] = df['job_title'].apply(find_seniority)
df['seniority'].value_counts()
    

2    665
1     77
Name: seniority, dtype: int64

In [47]:
df['company_rating'] = df['company_rating'].apply(float)


1    3.8
2    3.4
3    4.8
4    3.8
5    2.9
Name: company_rating, dtype: float64

In [104]:
def find_avg_salary(string_salary):
    char_list = ['$', 'k', 'K', ' ', 'P', 'e', 'r', 'H', 'o', 'u', 'r']
    
    
    # if weird format "Employer Provided Salary:$x-$y" (943 and 936 hourly)
    if "Employer" in string_salary[0:8]:
        # if hourly in the salary string
        if 'hour' in string_salary.lower():
            str_list = string_salary.split(':')
            salary_range = str_list[1]
            salary_range=re.sub("|".join(char_list), "", salary_range)
            salary_range = salary_range.replace('$', '')
            salary_range_split = salary_range.split('-')
            salary_range_split[0] = (int(salary_range_split[0]) * 2080)/1000
            salary_range_split[1] = (int(salary_range_split[1]) * 2080)/1000
        else:
            str_list = string_salary.split(':')
            salary_range = str_list[1]
            salary_range=re.sub("|".join(char_list), "", salary_range)
            salary_range = salary_range.replace('$', '')
            salary_range_split = salary_range.split('-')
    
    # if default salary format "$x-$y (Glassdoor est.)" (916 and 919 per hour)
    else:
        # if hourly in the salary string
        if 'hour' in string_salary.lower():
            str_list = string_salary.split('Per')
            salary_range = str_list[0]
            salary_range=re.sub("|".join(char_list), "", salary_range)
            salary_range = salary_range.replace('$', '')
            salary_range_split = salary_range.split('-')
            salary_range_split[0] = (int(salary_range_split[0]) * 2080)/1000
            salary_range_split[1] = (int(salary_range_split[1]) * 2080)/1000
            
        else:
            
            str_list = string_salary.split('(')
            salary_range = str_list[0]
            salary_range=re.sub("|".join(char_list), "", salary_range)
            salary_range = salary_range.replace('$', '')
            salary_range_split = salary_range.split('-')

    avg_salary = (int(salary_range_split[0]) + int(salary_range_split[1])) * 0.5
    return avg_salary



In [105]:
df['avg_salary'] = df['salary'].apply(find_avg_salary)
df.head()

Unnamed: 0,job_title,salary,job_description,company_rating,company_name,location,headquarters,company_size,founded,ownership_type,industry,sector,revenue,competitors,role,seniority,avg_salary
1,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research\n3.8,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,Company - Private,Aerospace & Defense,Aerospace & Defense,$50 to $100 million (USD),-1,data scientist,2,72.0
2,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System\n3.4,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,Other Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD),-1,data scientist,2,87.5
3,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4\n4.8,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,Company - Private,Security Services,Business Services,$100 to $500 million (USD),-1,data scientist,2,85.0
4,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL\n3.8,"Richland, WA","Richland, WA",1001 to 5000 employees,1965,Government,Energy,"Oil, Gas, Energy & Utilities",$500 million to $1 billion (USD),"Oak Ridge National Laboratory, National Renewa...",data scientist,2,76.5
5,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee",data scientist,2,114.5


In [110]:
def get_company_name(string_name):
    name_list = string_name.split('\n')
    return name_list[0]

def get_state (str):
    str_list = str.split(',')
    return str_list[1]

df['company_name'] = df['company_name'].apply(get_company_name)

df['state'] = df['location'].apply(get_state)
df.head()

Unnamed: 0,job_title,salary,job_description,company_rating,company_name,location,headquarters,company_size,founded,ownership_type,industry,sector,revenue,competitors,role,seniority,avg_salary,state
1,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,Company - Private,Aerospace & Defense,Aerospace & Defense,$50 to $100 million (USD),-1,data scientist,2,72.0,NM
2,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,Other Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD),-1,data scientist,2,87.5,MD
3,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,Company - Private,Security Services,Business Services,$100 to $500 million (USD),-1,data scientist,2,85.0,FL
4,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL,"Richland, WA","Richland, WA",1001 to 5000 employees,1965,Government,Energy,"Oil, Gas, Energy & Utilities",$500 million to $1 billion (USD),"Oak Ridge National Laboratory, National Renewa...",data scientist,2,76.5,WA
5,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee",data scientist,2,114.5,NY


In [103]:
df.iloc[44,1]

'$110K-$150K(Employer est.)'