In [6]:
# compiled code with kernel: py312

# Importing Libraries
import pandas as pd
import ast
import matplotlib.pyplot as plt
from datasets import load_dataset

# Loading Data
# Fetch the job-postings dataset and turn its 'train' split into a DataFrame.
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Data Cleanup
# Convert the posting date column to pandas datetime values.
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

# Copying DF
# Later conversions are applied to the copy so `df` stays as loaded.
df_copy = df.copy()

# Salaries with NaN values removed
# The previous expression here was evaluated and thrown away, so nothing was
# actually dropped. The filtered Series is now bound to a name instead.
# `df_copy` itself is deliberately left unfiltered: a later cell reads
# label 1 from it, and that row has no salary.
salary_non_null = df_copy['salary_year_avg'].dropna()

# Function to return non-NaN values
def clean_list(skill_list):
    """Convert the string form of a skills list into a real Python list.

    Parameters
    ----------
    skill_list : str or any other value
        A string holding a Python literal such as "['sql', 'python']",
        or a value that needs no parsing (NaN/None for missing entries,
        or a list that has already been converted).

    Returns
    -------
    object
        The value produced by ``ast.literal_eval`` for string input;
        any non-string input is returned unchanged.

    Raises
    ------
    ValueError, SyntaxError
        If a string is passed that is not a valid Python literal.
    """
    # Only strings need parsing. Checking the type (rather than pd.notna)
    # keeps the function safe on already-parsed lists: pd.notna on a list
    # returns an array, and using that in `if` raises "truth value is
    # ambiguous" for lists with more than one element.
    if isinstance(skill_list, str):
        return ast.literal_eval(skill_list)
    return skill_list
    
# Inline (lambda) form of the same conversion: parse each non-missing entry
# with ast.literal_eval and hand missing entries back untouched
df_copy['job_skills'] = df_copy['job_skills'].apply(
    lambda raw: ast.literal_eval(raw) if pd.notna(raw) else raw
)


In [7]:
# Before conversion: the entry at label 1 of the original frame is a str

type(df.loc[1, 'job_skills'])

str

In [9]:
# After conversion: the same entry in the converted copy is a list

type(df_copy.loc[1, 'job_skills'])

list

## Part 2: Calculate projected salary
* Senior roles: assume a 5% salary increase
* All other roles: assume a 3% salary increase

In [10]:
# Print the built-in documentation for DataFrame.apply (reference lookup only).
help(df.apply)

Help on method apply in module pandas.core.frame:

apply(func: 'AggFuncType', axis: 'Axis' = 0, raw: 'bool' = False, result_type: "Literal['expand', 'reduce', 'broadcast'] | None" = None, args=(), by_row: "Literal[False, 'compat']" = 'compat', engine: "Literal['python', 'numba']" = 'python', engine_kwargs: 'dict[str, bool] | None' = None, **kwargs) method of pandas.core.frame.DataFrame instance
    Apply a function along an axis of the DataFrame.

    Objects passed to the function are Series objects whose index is
    either the DataFrame's index (``axis=0``) or the DataFrame's columns
    (``axis=1``). By default (``result_type=None``), the final return type
    is inferred from the return type of the applied function. Otherwise,
    it depends on the `result_type` argument.

    Parameters
    ----------
    func : function
        Function to apply to each column or row.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Axis along which the function is applied:

        

In [27]:
# Column-wise apply: the lambda receives one salary value at a time

# Independent copy holding only the rows that have a yearly salary
df_salary = df.loc[df['salary_year_avg'].notna()].copy()

df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(
    lambda amount: amount * 1.03
)
df_salary.loc[:, ['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [20]:
# Row-wise apply using a named function (defined below)

# Independent copy holding only the rows that have a yearly salary
df_salary = df.loc[df['salary_year_avg'].notna()].copy()

def projected_salary(row):
    """Return the row's yearly salary scaled by its projected increase.

    Rows whose 'job_title_short' contains "Senior" are multiplied by 1.05;
    every other row is multiplied by 1.03.

    Parameters
    ----------
    row : mapping-like
        Must provide 'job_title_short' and 'salary_year_avg'.

    Returns
    -------
    The product of the chosen multiplier and row['salary_year_avg'].
    """
    multiplier = 1.05 if "Senior" in row['job_title_short'] else 1.03
    return multiplier * row['salary_year_avg']

# axis='columns' (same as axis=1) passes each row to projected_salary as a Series
df_salary['salary_year_inflated'] = df_salary.apply(projected_salary, axis='columns')

df_salary.loc[:, ['job_title_short', 'salary_year_avg', 'salary_year_inflated']].head(30)
                                                    


Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,109500.0,112785.0
77,Data Engineer,140000.0,144200.0
92,Data Engineer,120000.0,123600.0
100,Data Scientist,228222.0,235068.66
109,Data Analyst,89000.0,91670.0
116,Data Scientist,114000.0,117420.0
146,Data Engineer,129500.0,133385.0
180,Data Analyst,90250.0,92957.5
212,Data Scientist,157500.0,162225.0
257,Data Scientist,103128.0,106221.84


In [26]:
# The same projection written inline as a lambda instead of a named function:
# 1.05 for titles containing 'Senior', 1.03 for everything else

df_salary['salary_year_inflated'] = df_salary.apply(
    lambda row: row['salary_year_avg'] * (1.05 if 'Senior' in row['job_title_short'] else 1.03),
    axis='columns',
)

df_salary.loc[:, ['job_title_short', 'salary_year_avg', 'salary_year_inflated']].head(30)

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,109500.0,112785.0
77,Data Engineer,140000.0,144200.0
92,Data Engineer,120000.0,123600.0
100,Data Scientist,228222.0,235068.66
109,Data Analyst,89000.0,91670.0
116,Data Scientist,114000.0,117420.0
146,Data Engineer,129500.0,133385.0
180,Data Analyst,90250.0,92957.5
212,Data Scientist,157500.0,162225.0
257,Data Scientist,103128.0,106221.84
