In [2]:
import pandas as pd

In [3]:
# Load the Glassdoor job listings CSV and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='glassdoor_ds_jobs.csv')
data.head(n=5)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
0,Intern - Data Science Engineer,-1,Our vision is to transform how the world uses ...,3.7,Micron\n3.7,Hyderābād,10000+ Employees,1978,Company - Public,Computer Hardware Development,Information Technology,$10+ billion (USD),-1
1,Assistant Data Science Engineer,Employer Provided Salary:₹7L - ₹8L,"Established in 2007, Educate Girls’ is a non-p...",3.9,Educate Girls\n3.9,Mumbai,1001 to 5000 Employees,-1,Company - Private,-1,-1,Unknown / Non-Applicable,-1
2,Data Scientist,₹6L - ₹8L (Glassdoor Est.),About Us\n\nRamSoft is a global leader in Web ...,3.8,RamSoft\n3.8,Bengaluru,51 to 200 Employees,1994,Company - Private,Software Development,Information Technology,Unknown / Non-Applicable,-1
3,Data Science Internship,-1,Selected intern's day-to-day responsibilities ...,4.3,UpTricks\n4.3,Pune,51 to 200 Employees,2013,Self-employed,Information Technology Support Services,Information Technology,$5 to $25 million (USD),-1
4,Data Analytics,₹4L - ₹7L (Glassdoor Est.),Job Description\n\n\n\n\nDomain\n\n\nData Anal...,3.7,Tata Technologies Europe\n3.7,Pune,10000+ Employees,1989,Company - Private,Research & Development,Management & Consulting,$100 to $500 million (USD),-1


##### Data Cleaning
- ✅ Salary Parsing
- ✅ Company Name text only
- ~~Location Field~~
- ✅ Age of Company
- Parsing of Job Description for skill keywords (Python, etc.)

In [4]:
# Drop listings whose salary is the '-1' placeholder.
# .copy() makes the filtered result an independent frame, so the column
# assignment below does not raise SettingWithCopyWarning.
data = data[data['Salary Estimate'] != '-1'].copy()

# adding new column for Employer provided salary
# 1 when the salary text carries the 'Employer Provided Salary:' prefix
# (case-insensitive), else 0. na=False maps missing text to 0 instead of failing.
data['Employer Provided Salary'] = (
    data['Salary Estimate']
    .str.contains('employer provided salary:', case=False, regex=False, na=False)
    .astype(int)
)
data.head()

AttributeError: module 'numpy' has no attribute 'matrix'

In [None]:
# removing text from Salary Estimate
def text_replacements(original: str, replacements: dict) -> str:
    """Apply a series of literal substring replacements to ``original``.

    Args:
        original: The text to clean.
        replacements: Mapping of ``substring -> replacement``. Entries are
            applied one after another in the mapping's iteration order, so a
            later entry sees the result of the earlier ones.

    Returns:
        The text after every replacement has been applied.
    """
    # Loop names deliberately avoid `str`, which would shadow the builtin type
    # used in this function's own annotations.
    for old, new in replacements.items():
        original = original.replace(old, new)
    return original

In [None]:
# Every listed fragment is removed (replaced by the empty string), leaving
# only the amount text such as '7L - 8L'.
replacements = dict.fromkeys(
    ('Employer Provided Salary:', ' (Glassdoor Est.)', '₹'),
    '',
)
# Series.apply forwards the extra positional argument to text_replacements.
data['Salary Estimate'] = data['Salary Estimate'].apply(text_replacements, args=(replacements,))

In [None]:
# Preview the first five rows to check the stripped 'Salary Estimate' text.
data.head(n=5)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,Employer Provided Salary
1,Assistant Data Science Engineer,7L - 8L,"Established in 2007, Educate Girls’ is a non-p...",3.9,Educate Girls\n3.9,Mumbai,1001 to 5000 Employees,-1,Company - Private,-1,-1,Unknown / Non-Applicable,-1,1
2,Data Scientist,6L - 8L,About Us\n\nRamSoft is a global leader in Web ...,3.8,RamSoft\n3.8,Bengaluru,51 to 200 Employees,1994,Company - Private,Software Development,Information Technology,Unknown / Non-Applicable,-1,0
4,Data Analytics,4L - 7L,Job Description\n\n\n\n\nDomain\n\n\nData Anal...,3.7,Tata Technologies Europe\n3.7,Pune,10000+ Employees,1989,Company - Private,Research & Development,Management & Consulting,$100 to $500 million (USD),-1,0
5,Data Science Intern,12T - 21T,"ABOUT COTO\ncoto is a strong, growing deep tec...",4.3,Coto\n4.3,Mumbai,51 to 200 Employees,2021,Company - Private,Software Development,Information Technology,Unknown / Non-Applicable,-1,1
6,A&A -Data Scientist - AI POD,5L - 9L,Data Scientist- Staff\nDo you have a passion f...,4.1,Deloitte\n4.1,Bengaluru,10000+ Employees,1850,Company - Private,Accounting & Tax,Finance,Unknown / Non-Applicable,-1,0


In [None]:
# Frequency of each distinct salary string, most common first; used to see
# which formats (ranges, single amounts, unit suffixes) need parsing.
data['Salary Estimate'].value_counts(sort=True, ascending=False)

4L - 8L      3
5L - 9L      3
5L - 8L      3
6L - 8L      3
6L - 9L      3
4L - 7L      2
5L - 7L      2
18L - 25L    1
2L           1
8L - 9L      1
7L - 7L      1
4L - 5L      1
1L           1
29L          1
7L - 8L      1
2L - 3L      1
1L - 10L     1
6L - 7L      1
4L - 10L     1
71L - 1Cr    1
4L           1
12T - 21T    1
9L           1
Name: Salary Estimate, dtype: int64

In [None]:
def get_avg_salary_in_lakhs(salary_str: str) -> float:
    """Return the salary, in lakhs, described by a cleaned salary string.

    Args:
        salary_str: Either a single amount such as ``'29L'`` or a range such
            as ``'7L - 8L'``. Each amount may carry a unit marker understood
            by ``get_number_in_lakhs``.

    Returns:
        For a range, the mean of its first two amounts; otherwise the single
        amount. Always expressed in lakhs.

    Raises:
        ValueError: If an amount is not a valid number once its unit marker
            is removed (this includes an empty side of a range).
    """
    if '-' in salary_str:
        # Split once and reuse the pieces; only the first two are considered.
        bounds = salary_str.split('-')
        low = get_number_in_lakhs(bounds[0])
        high = get_number_in_lakhs(bounds[1])
        return (low + high) / 2
    return get_number_in_lakhs(salary_str)


def get_number_in_lakhs(num: str) -> float:
    """Convert one amount with an optional unit marker into lakhs.

    Markers are checked in this order:
      * ``'L'``  -- value is used as is;
      * ``'T'``  -- value is divided by 100 (presumably thousands);
      * ``'Cr'`` -- value is multiplied by 100 (presumably crores);
      * none     -- value is used as is.

    Args:
        num: Amount text such as ``'7L'``, ``' 21T'`` or ``'1Cr'``.
            Surrounding whitespace is tolerated.

    Returns:
        The amount in lakhs as a float.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    # float() rather than int() so decimal amounts such as '7.5L' parse
    # instead of raising.
    if 'L' in num:
        return float(num.replace('L', ''))
    if 'T' in num:
        # Divide rather than multiply by 0.01 to avoid an extra rounding step.
        return float(num.replace('T', '')) / 100
    if 'Cr' in num:
        return float(num.replace('Cr', '')) * 100
    return float(num)

In [None]:
# Parse every cleaned salary string into a single figure in lakhs.
data['avg_salary_in_lakh'] = data['Salary Estimate'].apply(get_avg_salary_in_lakhs)

In [None]:
# Preview the first five rows, now including 'avg_salary_in_lakh'.
data.head(n=5)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,Employer Provided Salary,avg_salary_in_lakh
1,Assistant Data Science Engineer,7L - 8L,"Established in 2007, Educate Girls’ is a non-p...",3.9,Educate Girls\n3.9,Mumbai,1001 to 5000 Employees,-1,Company - Private,-1,-1,Unknown / Non-Applicable,-1,1,7.5
2,Data Scientist,6L - 8L,About Us\n\nRamSoft is a global leader in Web ...,3.8,RamSoft\n3.8,Bengaluru,51 to 200 Employees,1994,Company - Private,Software Development,Information Technology,Unknown / Non-Applicable,-1,0,7.0
4,Data Analytics,4L - 7L,Job Description\n\n\n\n\nDomain\n\n\nData Anal...,3.7,Tata Technologies Europe\n3.7,Pune,10000+ Employees,1989,Company - Private,Research & Development,Management & Consulting,$100 to $500 million (USD),-1,0,5.5
5,Data Science Intern,12T - 21T,"ABOUT COTO\ncoto is a strong, growing deep tec...",4.3,Coto\n4.3,Mumbai,51 to 200 Employees,2021,Company - Private,Software Development,Information Technology,Unknown / Non-Applicable,-1,1,0.165
6,A&A -Data Scientist - AI POD,5L - 9L,Data Scientist- Staff\nDo you have a passion f...,4.1,Deloitte\n4.1,Bengaluru,10000+ Employees,1850,Company - Private,Accounting & Tax,Finance,Unknown / Non-Applicable,-1,0,7.0


In [None]:
# Company name text only
# The raw field looks like '<name>\n<rating>' (e.g. 'Micron\n3.7'); keep only
# the text before the first newline. Unlike slicing off a fixed four
# characters, this is safe to re-run, does not depend on the width of the
# rating, and leaves names without a rating suffix untouched.
data['Company Name'] = data['Company Name'].str.split('\n').str[0]

In [None]:
# Preview the first five rows to check the cleaned 'Company Name' values.
data.head(n=5)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,Employer Provided Salary,avg_salary_in_lakh
1,Assistant Data Science Engineer,7L - 8L,"Established in 2007, Educate Girls’ is a non-p...",3.9,Educate Girls,Mumbai,1001 to 5000 Employees,-1,Company - Private,-1,-1,Unknown / Non-Applicable,-1,1,7.5
2,Data Scientist,6L - 8L,About Us\n\nRamSoft is a global leader in Web ...,3.8,RamSoft,Bengaluru,51 to 200 Employees,1994,Company - Private,Software Development,Information Technology,Unknown / Non-Applicable,-1,0,7.0
4,Data Analytics,4L - 7L,Job Description\n\n\n\n\nDomain\n\n\nData Anal...,3.7,Tata Technologies Europe,Pune,10000+ Employees,1989,Company - Private,Research & Development,Management & Consulting,$100 to $500 million (USD),-1,0,5.5
5,Data Science Intern,12T - 21T,"ABOUT COTO\ncoto is a strong, growing deep tec...",4.3,Coto,Mumbai,51 to 200 Employees,2021,Company - Private,Software Development,Information Technology,Unknown / Non-Applicable,-1,1,0.165
6,A&A -Data Scientist - AI POD,5L - 9L,Data Scientist- Staff\nDo you have a passion f...,4.1,Deloitte,Bengaluru,10000+ Employees,1850,Company - Private,Accounting & Tax,Finance,Unknown / Non-Applicable,-1,0,7.0


In [None]:
# Age of the Company
# Year the ages are measured against; named so it can be updated in one place.
REFERENCE_YEAR = 2023
# Rows with Founded < 1 (e.g. the -1 placeholder) keep their value unchanged;
# every other row gets REFERENCE_YEAR - Founded.
data['company_age'] = data['Founded'].where(
    data['Founded'] < 1,
    REFERENCE_YEAR - data['Founded'],
)

In [None]:
# import spacy
# import en_core_web_md
# nlp = spacy.load("en_core_web_md")

# data['jd_keywords'] = data['Job Description'].apply(lambda x: nlp(x).ents)

In [None]:
# checking for keywords in job description
# Output column -> case-insensitive regex searched for in the description.
# 'excel' and 'aws' use word boundaries so that words such as 'excellent' or
# 'laws' are not counted; '\bexcel\b' still matches 'ms-excel'.
# 'python' and 'spark' stay plain substrings (so e.g. 'pyspark' still counts).
KEYWORD_PATTERNS = {
    'python_req': r'python',
    'excel': r'\bexcel\b',
    'aws': r'\baws\b',
    'spark': r'spark',
}

for column, pattern in KEYWORD_PATTERNS.items():
    # 1 if the pattern occurs in the description, else 0.
    # na=False treats a missing description as "keyword absent".
    data[column] = (
        data['Job Description']
        .str.contains(pattern, case=False, regex=True, na=False)
        .astype(int)
    )



In [None]:
# Preview the first five rows, now including 'company_age' and the keyword flags.
data.head(n=5)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,Employer Provided Salary,avg_salary_in_lakh,company_age,python_req,excel,aws,spark
1,Assistant Data Science Engineer,7L - 8L,"Established in 2007, Educate Girls’ is a non-p...",3.9,Educate Girls,Mumbai,1001 to 5000 Employees,-1,Company - Private,-1,-1,Unknown / Non-Applicable,-1,1,7.5,-1,0,0,0,0
2,Data Scientist,6L - 8L,About Us\n\nRamSoft is a global leader in Web ...,3.8,RamSoft,Bengaluru,51 to 200 Employees,1994,Company - Private,Software Development,Information Technology,Unknown / Non-Applicable,-1,0,7.0,29,0,0,0,0
4,Data Analytics,4L - 7L,Job Description\n\n\n\n\nDomain\n\n\nData Anal...,3.7,Tata Technologies Europe,Pune,10000+ Employees,1989,Company - Private,Research & Development,Management & Consulting,$100 to $500 million (USD),-1,0,5.5,34,0,0,0,0
5,Data Science Intern,12T - 21T,"ABOUT COTO\ncoto is a strong, growing deep tec...",4.3,Coto,Mumbai,51 to 200 Employees,2021,Company - Private,Software Development,Information Technology,Unknown / Non-Applicable,-1,1,0.165,2,0,0,0,0
6,A&A -Data Scientist - AI POD,5L - 9L,Data Scientist- Staff\nDo you have a passion f...,4.1,Deloitte,Bengaluru,10000+ Employees,1850,Company - Private,Accounting & Tax,Finance,Unknown / Non-Applicable,-1,0,7.0,173,0,0,0,0


In [None]:
# Number of listings whose description mentions Python (1) versus not (0).
data['python_req'].value_counts(sort=True, ascending=False)

0    28
1     7
Name: python_req, dtype: int64