In [1]:
import pandas as pd

# Location of the raw salary dataset; a relative path, so it is resolved
# against the directory the notebook kernel is running in.
relative_path = '../raw/datasetsalary2024.csv'

# Load the whole CSV into a DataFrame using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=relative_path)

# Plain-text preview of the first five rows.
print(df.head(n=5))

   work_year experience_level employment_type                  job_title  \
0       2024               SE              FT                AI Engineer   
1       2024               SE              FT                AI Engineer   
2       2024               SE              FT              Data Engineer   
3       2024               SE              FT              Data Engineer   
4       2024               SE              FT  Machine Learning Engineer   

   salary salary_currency  salary_in_usd employee_residence  remote_ratio  \
0  202730             USD         202730                 US             0   
1   92118             USD          92118                 US             0   
2  130500             USD         130500                 US             0   
3   96000             USD          96000                 US             0   
4  190000             USD         190000                 US             0   

  company_location company_size  
0               US            M  
1           

In [4]:
# Flag each row with 1 when the employee's residence equals the company's
# location and 0 otherwise.
#
# The comparison is done column-against-column instead of calling a Python
# lambda once per row via df.apply(axis=1): the result is the same, but the
# work happens inside pandas rather than in a per-row Python loop.
# A missing value on either side compares as "not equal" and therefore yields 0,
# exactly as the row-wise version did.
df['employee_company_location_match'] = (
    df['employee_residence'] == df['company_location']
).astype('int64')  # explicit int64 so the dtype does not depend on the platform's default int

print(df.head())

   work_year experience_level employment_type                  job_title  \
0       2024               SE              FT                AI Engineer   
1       2024               SE              FT                AI Engineer   
2       2024               SE              FT              Data Engineer   
3       2024               SE              FT              Data Engineer   
4       2024               SE              FT  Machine Learning Engineer   

   salary salary_currency  salary_in_usd employee_residence  remote_ratio  \
0  202730             USD         202730                 US             0   
1   92118             USD          92118                 US             0   
2  130500             USD         130500                 US             0   
3   96000             USD          96000                 US             0   
4  190000             USD         190000                 US             0   

  company_location company_size  employee_company_location_match  
0            

In [13]:
def categorize_job_function(job_title):
    """
    Assign a job title to a broad functional category.

    Matching is case-insensitive. Rules are checked in the order below and the
    first one that matches wins:

    1. 'data' appears anywhere in the title          -> 'Data Related Jobs'
    2. 'ai' appears as a standalone word             -> 'AI Related Jobs'
    3. 'machine learning' appears anywhere, or a
       word starts with 'ml' (e.g. 'ML', 'MLOps')    -> 'Machine Learning Related Jobs'
    4. 'business intelligence' appears anywhere, or
       'bi' appears as a standalone word             -> 'Business Intelligence Related Jobs'
    5. nothing matched                               -> 'Other Functions'

    The two- and three-letter acronyms are compared against whole words rather
    than raw substrings. A plain substring test would classify, for example,
    'Retail Analyst' as AI (re-t-AI-l), 'HTML Developer' as ML and
    'Site Reliability Engineer' as BI (relia-BI-lity).

    Parameters:
    - job_title: The job title to categorize (a string).

    Returns:
    - One of the five category labels listed above.
    """
    title = job_title.lower()

    # Break the title into words on every non-alphanumeric character, so that
    # 'AI/ML Engineer' becomes ['ai', 'ml', 'engineer'].
    words = ''.join(ch if ch.isalnum() else ' ' for ch in title).split()

    if 'data' in title:
        return 'Data Related Jobs'
    if 'ai' in words:
        return 'AI Related Jobs'
    # Prefix match keeps compound forms such as 'MLOps' in the ML bucket.
    if 'machine learning' in title or any(word.startswith('ml') for word in words):
        return 'Machine Learning Related Jobs'
    if 'business intelligence' in title or 'bi' in words:
        return 'Business Intelligence Related Jobs'
    return 'Other Functions'
# Derive the job_function column by passing every job title through the categorizer
df['job_function'] = df['job_title'].map(categorize_job_function)

# Preview the first five rows to confirm the new column is present
print(df.head(n=5))

   work_year experience_level employment_type                  job_title  \
0       2024               SE              FT                AI Engineer   
1       2024               SE              FT                AI Engineer   
2       2024               SE              FT              Data Engineer   
3       2024               SE              FT              Data Engineer   
4       2024               SE              FT  Machine Learning Engineer   

   salary salary_currency  salary_in_usd employee_residence  remote_ratio  \
0  202730             USD         202730                 US             0   
1   92118             USD          92118                 US             0   
2  130500             USD         130500                 US             0   
3   96000             USD          96000                 US             0   
4  190000             USD         190000                 US             0   

  company_location company_size  employee_company_location_match  \
0           

In [14]:
def categorize_job_seniority(job_title):
    """
    Categorizes job titles based on the presence of certain strings within the job title.
    
    Parameters:
    - job_title: The job title to categorize.
    
    Returns:
    - A category based on whether checked substring is in the job title.
    """
    if 'engineer' in job_title.lower() or 'analyst' in job_title.lower():
        return 'Engineer or Analyst'
    elif 'architect' in job_title.lower():
        return  'Architect'
    elif 'admin' in job_title.lower() or 'lead' in job_title.lower():
        return 'Administrator or Lead'
    elif 'manage' in job_title.lower():
        return 'Manager'
    else:
        return 'Other Seniority'
# Derive the job_seniority column by passing every job title through the categorizer
df['job_seniority'] = df['job_title'].map(categorize_job_seniority)

# Preview the first five rows to confirm the new column is present
print(df.head(n=5))

   work_year experience_level employment_type                  job_title  \
0       2024               SE              FT                AI Engineer   
1       2024               SE              FT                AI Engineer   
2       2024               SE              FT              Data Engineer   
3       2024               SE              FT              Data Engineer   
4       2024               SE              FT  Machine Learning Engineer   

   salary salary_currency  salary_in_usd employee_residence  remote_ratio  \
0  202730             USD         202730                 US             0   
1   92118             USD          92118                 US             0   
2  130500             USD         130500                 US             0   
3   96000             USD          96000                 US             0   
4  190000             USD         190000                 US             0   

  company_location company_size  employee_company_location_match  \
0           

In [22]:
# Tally the rows per job function, per seniority bucket, and per
# residence/company-location match flag (most frequent value first).
job_function_counts = df['job_function'].value_counts()
job_seniority_counts = df['job_seniority'].value_counts()
location_match_counts = df['employee_company_location_match'].value_counts()

# Each report is a heading line followed by its counts on the next line;
# the leading "\n" on the later headings leaves a blank line between reports.
print("Job Function Counts:", job_function_counts, sep="\n")
print("\nJob Seniority Counts:", job_seniority_counts, sep="\n")
print("\nEmployee-Company Location Match Counts:", location_match_counts, sep="\n")

Job Function Counts:
job_function
Data Related Jobs                     11322
Machine Learning Related Jobs          2202
Other Functions                        2020
Business Intelligence Related Jobs      771
AI Related Jobs                         219
Name: count, dtype: int64

Job Seniority Counts:
job_seniority
Engineer or Analyst      9814
Other Seniority          5654
Manager                   498
Architect                 469
Administrator or Lead      99
Name: count, dtype: int64

Employee-Company Location Match Counts:
employee_company_location_match
1    16404
0      130
Name: count, dtype: int64


In [21]:
# Persist the enriched DataFrame as a CSV in the current working directory.
# index=False keeps the row index out of the file so only the data columns are written.
df.to_csv(path_or_buf='./datasetsalary2024_modified.csv', index=False)