In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

In [2]:
glassdoorjobs = pd.read_csv('Glassdoor_Job_Postings.csv')
df = glassdoorjobs.copy()

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   company                      899 non-null    object 
 1   job_title                    900 non-null    object 
 2   company_rating               656 non-null    float64
 3   job_description              888 non-null    object 
 4   location                     900 non-null    object 
 5   salary_avg_estimate          636 non-null    object 
 6   salary_estimate_payperiod    636 non-null    object 
 7   company_size                 774 non-null    object 
 8   company_founded              774 non-null    object 
 9   employment_type              774 non-null    object 
 10  industry                     774 non-null    object 
 11  sector                       774 non-null    object 
 12  revenue                      774 non-null    object 
 13  career_opportunities

In [4]:
df.head(5)

Unnamed: 0,company,job_title,company_rating,job_description,location,salary_avg_estimate,salary_estimate_payperiod,company_size,company_founded,employment_type,industry,sector,revenue,career_opportunities_rating,comp_and_benefits_rating,culture_and_values_rating,senior_management_rating,work_life_balance_rating
0,ABB,Junior Data Analyst,4.0,Junior Data Analyst\nTake your next career ste...,Bengaluru,"₹3,25,236",/yr (est.),10000+ Employees,1883,Company - Public,Electronics Manufacturing,Manufacturing,$10+ billion (USD),3.7,3.6,4.0,3.5,3.9
1,Philips,Data Scientist - AI/ML,4.0,Job Title\nData Scientist - AI/ML\nJob Descrip...,Bengaluru,,,10000+ Employees,1891,Company - Public,Healthcare Services & Hospitals,Healthcare,$10+ billion (USD),3.8,3.7,4.0,3.5,4.0
2,HSBC,Data Science GSC’s,3.9,Job description\nGraduate/ Post-graduate degre...,Bengaluru,,,10000+ Employees,1865,Company - Public,Banking & Lending,Finance,$10+ billion (USD),3.6,3.6,3.8,3.4,3.7
3,Facctum Solutions,Data Analyst,,Job Description\nExperience: 0 - 2 years in da...,Karnataka,,,1 to 50 Employees,--,Company - Private,--,--,Unknown / Non-Applicable,,,,,
4,JPMorgan Chase & Co,Data and Analytics - Associate,4.0,JOB DESCRIPTION\n\nYou are a strategic thinker...,India,,,10000+ Employees,1799,Company - Public,Banking & Lending,Finance,$10+ billion (USD),4.0,3.9,3.9,3.6,3.7


## Find nulls and 'Unknown' records

In [5]:
# to find records that are odd - worked through every column
df['work_life_balance_rating'].unique()
#df['company'].value_counts()

array([3.9, 4. , 3.7, nan, 4.6, 3.5, 3.8, 3.1, 3.2, 4.1, 2. , 3. , 3.4,
       2.9, 4.3, 4.2, 1. , 3.6, 3.3, 4.4, 2.5, 2.2, 4.5, 2.8, 4.8, 4.9,
       4.7, 2.4, 5. , 2.3, 2.7, 2.6, 1.8])

In [6]:
# Looking at percentage of nulls
nulls = df.isna().sum()
nulls[nulls > 0] / df.shape[0] * 100

company                         0.111111
company_rating                 27.111111
job_description                 1.333333
salary_avg_estimate            29.333333
salary_estimate_payperiod      29.333333
company_size                   14.000000
company_founded                14.000000
employment_type                14.000000
industry                       14.000000
sector                         14.000000
revenue                        14.000000
career_opportunities_rating    18.777778
comp_and_benefits_rating       18.777778
culture_and_values_rating      18.777778
senior_management_rating       18.777778
work_life_balance_rating       18.777778
dtype: float64

In [7]:
# replace nulls
def replaceNulls(column, value):
    df[column] = df[column].fillna(value)

replaceNulls('company', 'Unknown')
replaceNulls('company_rating', -99) # fill with -99 to stand out from other ratings, indicating there is no rating
replaceNulls('job_description', 'Unknown')
replaceNulls('salary_avg_estimate', 'Unknown')
replaceNulls('salary_estimate_payperiod', 'Unknown')
replaceNulls('company_size', 'Unknown')
replaceNulls('company_founded', '0000')
replaceNulls('employment_type', 'Unknown')
replaceNulls('industry', 'Unknown')
replaceNulls('sector', 'Unknown')
replaceNulls('revenue', 'Unknown / Non-Applicable') # to match what's already there in that column
replaceNulls('career_opportunities_rating', -99)
replaceNulls('comp_and_benefits_rating', -99)
replaceNulls('culture_and_values_rating', -99)
replaceNulls('senior_management_rating', -99)
replaceNulls('work_life_balance_rating', -99)

In [8]:
df.isna().sum()

company                        0
job_title                      0
company_rating                 0
job_description                0
location                       0
salary_avg_estimate            0
salary_estimate_payperiod      0
company_size                   0
company_founded                0
employment_type                0
industry                       0
sector                         0
revenue                        0
career_opportunities_rating    0
comp_and_benefits_rating       0
culture_and_values_rating      0
senior_management_rating       0
work_life_balance_rating       0
dtype: int64

### Replace any anomalies
During EDA, I noticed that some records had '--' instead of NaN or 'Unknown'

In [9]:
# to find columns with the '--'
def findValue(df, value):
    for column in df.columns:
        if (df[column].unique() == value).any():
            print(f'{column}')

findValue(df, '--')

company_founded
industry
sector


In [10]:
# change those records appropriately
df['company_founded'] = df['company_founded'].replace('--', '0000')
df['industry'] = df['industry'].replace('--', 'Unknown')
df['sector'] = df['sector'].replace('--', 'Unknown')

In [11]:
df

Unnamed: 0,company,job_title,company_rating,job_description,location,salary_avg_estimate,salary_estimate_payperiod,company_size,company_founded,employment_type,industry,sector,revenue,career_opportunities_rating,comp_and_benefits_rating,culture_and_values_rating,senior_management_rating,work_life_balance_rating
0,ABB,Junior Data Analyst,4.0,Junior Data Analyst\nTake your next career ste...,Bengaluru,"₹3,25,236",/yr (est.),10000+ Employees,1883,Company - Public,Electronics Manufacturing,Manufacturing,$10+ billion (USD),3.7,3.6,4.0,3.5,3.9
1,Philips,Data Scientist - AI/ML,4.0,Job Title\nData Scientist - AI/ML\nJob Descrip...,Bengaluru,Unknown,Unknown,10000+ Employees,1891,Company - Public,Healthcare Services & Hospitals,Healthcare,$10+ billion (USD),3.8,3.7,4.0,3.5,4.0
2,HSBC,Data Science GSC’s,3.9,Job description\nGraduate/ Post-graduate degre...,Bengaluru,Unknown,Unknown,10000+ Employees,1865,Company - Public,Banking & Lending,Finance,$10+ billion (USD),3.6,3.6,3.8,3.4,3.7
3,Facctum Solutions,Data Analyst,-99.0,Job Description\nExperience: 0 - 2 years in da...,Karnataka,Unknown,Unknown,1 to 50 Employees,0000,Company - Private,Unknown,Unknown,Unknown / Non-Applicable,-99.0,-99.0,-99.0,-99.0,-99.0
4,JPMorgan Chase & Co,Data and Analytics - Associate,4.0,JOB DESCRIPTION\n\nYou are a strategic thinker...,India,Unknown,Unknown,10000+ Employees,1799,Company - Public,Banking & Lending,Finance,$10+ billion (USD),4.0,3.9,3.9,3.6,3.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,Kpro Solutions,Analytics & Data Science,4.0,Experience & Qualification:\n\nMinimum of 9 ye...,Bengaluru,"₹4,83,915",/yr (est.),1 to 50 Employees,0000,Company - Private,Unknown,Unknown,Unknown / Non-Applicable,4.0,4.0,4.0,4.0,4.0
896,Athena Global Technologies,Data Engineer,-99.0,Immediate requirement for Data Engineer\nExper...,Hyderābād,"₹6,78,949",/yr (est.),51 to 200 Employees,0000,Company - Private,Unknown,Unknown,Unknown / Non-Applicable,4.2,3.5,4.0,3.9,3.9
897,Wesco,India-Bangalore: Data Engineer,3.7,This person will work independently or with a ...,Bengaluru,"₹6,51,920",/yr (est.),10000+ Employees,1922,Company - Public,Wholesale,Retail & Wholesale,$10+ billion (USD),3.5,3.4,3.6,3.3,3.8
898,Course5,Data Scientist,4.2,If you meet our position requirements and can ...,Bengaluru,"₹4,24,426",/yr (est.),1001 to 5000 Employees,2000,Company - Private,Business Consulting,Management & Consulting,$100 to $500 million (USD),4.3,4.1,4.3,4.1,4.2


### Cast to appropriate data types

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   company                      900 non-null    object 
 1   job_title                    900 non-null    object 
 2   company_rating               900 non-null    float64
 3   job_description              900 non-null    object 
 4   location                     900 non-null    object 
 5   salary_avg_estimate          900 non-null    object 
 6   salary_estimate_payperiod    900 non-null    object 
 7   company_size                 900 non-null    object 
 8   company_founded              900 non-null    object 
 9   employment_type              900 non-null    object 
 10  industry                     900 non-null    object 
 11  sector                       900 non-null    object 
 12  revenue                      900 non-null    object 
 13  career_opportunities

In [13]:
df['company_founded'] = df['company_founded'].astype(int)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   company                      900 non-null    object 
 1   job_title                    900 non-null    object 
 2   company_rating               900 non-null    float64
 3   job_description              900 non-null    object 
 4   location                     900 non-null    object 
 5   salary_avg_estimate          900 non-null    object 
 6   salary_estimate_payperiod    900 non-null    object 
 7   company_size                 900 non-null    object 
 8   company_founded              900 non-null    int32  
 9   employment_type              900 non-null    object 
 10  industry                     900 non-null    object 
 11  sector                       900 non-null    object 
 12  revenue                      900 non-null    object 
 13  career_opportunities

### Check for duplicates and delete them

In [15]:
df[df.duplicated()]

Unnamed: 0,company,job_title,company_rating,job_description,location,salary_avg_estimate,salary_estimate_payperiod,company_size,company_founded,employment_type,industry,sector,revenue,career_opportunities_rating,comp_and_benefits_rating,culture_and_values_rating,senior_management_rating,work_life_balance_rating
481,"Medpace, Inc.",Data Coordinator - Core Laboratory,3.4,Job Summary :\nOur corporate activities are gr...,Thāne,"₹5,89,237",/yr (est.),5001 to 10000 Employees,1992,Company - Public,Biotech & Pharmaceuticals,Pharmaceutical & Biotechnology,$2 to $5 billion (USD),3.3,3.0,3.2,3.0,3.2
570,"Medpace, Inc.",Data Coordinator - Core Laboratory,3.4,Job Summary :\nOur corporate activities are gr...,Thāne,"₹5,89,237",/yr (est.),5001 to 10000 Employees,1992,Company - Public,Biotech & Pharmaceuticals,Pharmaceutical & Biotechnology,$2 to $5 billion (USD),3.3,3.0,3.2,3.0,3.2
600,Sandvik,Data Entry Operator,4.2,"Sandvik Mining & Rock Solutions, the leading g...",Udaipur,"₹4,15,692",/yr (est.),10000+ Employees,1862,Company - Public,Machinery Manufacturing,Manufacturing,$10+ billion (USD),3.8,3.9,4.0,3.5,3.9
780,Macgence Technologies (OPC) PVT LTD,Internship (Data Annotation),4.8,Selected intern's day-to-day responsibilities ...,Remote,"₹5,000",/mo (est.),Unknown,0,Company - Public,Unknown,Unknown,Unknown / Non-Applicable,5.0,5.0,5.0,5.0,5.0
782,Empower,Sr Analyst Data Science,3.9,Grow your career with a growing organization\n...,Bengaluru,"₹3,94,308",/yr (est.),10000+ Employees,1907,Company - Private,Investment & Asset Management,Finance,$2 to $5 billion (USD),3.9,3.6,3.9,3.6,4.0


In [16]:
df[df['company'] == 'Medpace, Inc.']
df = df.drop(index=[570, 481])
df.shape

(898, 18)

In [17]:
df[df['company'] == 'Sandvik']
df = df.drop(index=[480, 600])
df.shape

(896, 18)

In [18]:
df[df['company'] == 'Macgence Technologies (OPC) PVT LTD']
df = df.drop(index=780)
df

Unnamed: 0,company,job_title,company_rating,job_description,location,salary_avg_estimate,salary_estimate_payperiod,company_size,company_founded,employment_type,industry,sector,revenue,career_opportunities_rating,comp_and_benefits_rating,culture_and_values_rating,senior_management_rating,work_life_balance_rating
0,ABB,Junior Data Analyst,4.0,Junior Data Analyst\nTake your next career ste...,Bengaluru,"₹3,25,236",/yr (est.),10000+ Employees,1883,Company - Public,Electronics Manufacturing,Manufacturing,$10+ billion (USD),3.7,3.6,4.0,3.5,3.9
1,Philips,Data Scientist - AI/ML,4.0,Job Title\nData Scientist - AI/ML\nJob Descrip...,Bengaluru,Unknown,Unknown,10000+ Employees,1891,Company - Public,Healthcare Services & Hospitals,Healthcare,$10+ billion (USD),3.8,3.7,4.0,3.5,4.0
2,HSBC,Data Science GSC’s,3.9,Job description\nGraduate/ Post-graduate degre...,Bengaluru,Unknown,Unknown,10000+ Employees,1865,Company - Public,Banking & Lending,Finance,$10+ billion (USD),3.6,3.6,3.8,3.4,3.7
3,Facctum Solutions,Data Analyst,-99.0,Job Description\nExperience: 0 - 2 years in da...,Karnataka,Unknown,Unknown,1 to 50 Employees,0,Company - Private,Unknown,Unknown,Unknown / Non-Applicable,-99.0,-99.0,-99.0,-99.0,-99.0
4,JPMorgan Chase & Co,Data and Analytics - Associate,4.0,JOB DESCRIPTION\n\nYou are a strategic thinker...,India,Unknown,Unknown,10000+ Employees,1799,Company - Public,Banking & Lending,Finance,$10+ billion (USD),4.0,3.9,3.9,3.6,3.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,Kpro Solutions,Analytics & Data Science,4.0,Experience & Qualification:\n\nMinimum of 9 ye...,Bengaluru,"₹4,83,915",/yr (est.),1 to 50 Employees,0,Company - Private,Unknown,Unknown,Unknown / Non-Applicable,4.0,4.0,4.0,4.0,4.0
896,Athena Global Technologies,Data Engineer,-99.0,Immediate requirement for Data Engineer\nExper...,Hyderābād,"₹6,78,949",/yr (est.),51 to 200 Employees,0,Company - Private,Unknown,Unknown,Unknown / Non-Applicable,4.2,3.5,4.0,3.9,3.9
897,Wesco,India-Bangalore: Data Engineer,3.7,This person will work independently or with a ...,Bengaluru,"₹6,51,920",/yr (est.),10000+ Employees,1922,Company - Public,Wholesale,Retail & Wholesale,$10+ billion (USD),3.5,3.4,3.6,3.3,3.8
898,Course5,Data Scientist,4.2,If you meet our position requirements and can ...,Bengaluru,"₹4,24,426",/yr (est.),1001 to 5000 Employees,2000,Company - Private,Business Consulting,Management & Consulting,$100 to $500 million (USD),4.3,4.1,4.3,4.1,4.2


In [19]:
df[df['company'] == 'Empower']
df = df.drop(index=782)
df.shape

(894, 18)

Now the data cleaned up! Nulls are filled, duplicates are removed and any anomalies were replaced appropriately for consistency with unknown values.