# In [12]:
import pandas as pd


def load_data(filepath):
    """Read a CSV file into a DataFrame.

    Args:
        filepath: Anything ``pandas.read_csv`` accepts as its first argument
            (it is forwarded unchanged).

    Returns:
        The DataFrame produced by ``pandas.read_csv`` with default options.
    """
    frame = pd.read_csv(filepath)
    return frame

def clean_data(df):
    """Drop rows that carry none of the key job-post fields.

    A row is removed only when "Level", "Salary" and "Collaboration Type"
    are *all* missing; rows missing just one or two of them are kept.

    Args:
        df: DataFrame containing the three columns named above.

    Returns:
        A new DataFrame without the fully-empty rows. The input frame is not
        modified and the surviving rows keep their original index labels.
    """
    key_columns = ["Level", "Salary", "Collaboration Type"]
    return df.dropna(how="all", subset=key_columns)

def extract_salary(df):
    """Convert the textual "Salary" column to floats, in place.

    The first run of consecutive digits found in each value becomes the
    salary; values that are missing or contain no digits become NaN. Only
    that first run is used, so "from 2000 to 3000" yields 2000.0 and a
    value with a thousands separator such as "1,500" yields 1.0.

    Args:
        df: DataFrame whose "Salary" column holds strings. It is modified
            in place: the column is overwritten and the index is reset to
            0..n-1 (the old index is discarded).

    Returns:
        The same DataFrame object that was passed in.
    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence
    # that newer Python versions warn about.
    digits = df["Salary"].str.extract(r"(\d+)", expand=False)
    df["Salary"] = digits.astype(float)
    df.reset_index(drop=True, inplace=True)
    return df

def fill_missing_salaries(df):
    """Fill missing "Salary" values with the mean salary of the row's "Level".

    The per-level means are computed once from the non-missing salaries and
    then looked up for every row whose salary is NaN. Rows are matched by
    their own index labels, so the frame does not need a 0..n-1 index.

    A salary stays NaN when its "Level" is missing or when no row of that
    level has a known salary.

    Args:
        df: DataFrame with a numeric "Salary" column and a "Level" column.
            The "Salary" column is updated in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    missing = df["Salary"].isna()
    if not missing.any():
        # Nothing to fill; skip the groupby entirely (also covers empty frames).
        return df

    # One grouped mean instead of rebuilding the groupby for every missing row.
    level_means = df.groupby("Level")["Salary"].mean()
    # map() yields NaN for levels absent from level_means (including NaN levels),
    # so those salaries are simply left unfilled instead of raising KeyError.
    fallback = df["Level"].map(level_means)
    df["Salary"] = df["Salary"].fillna(fallback)
    return df

def preprocess_data(filepath):
    """Load a job-posts CSV and run every preprocessing step on it.

    The steps run in this order: ``load_data``, ``clean_data``,
    ``extract_salary``, ``fill_missing_salaries``.

    Args:
        filepath: Location of the CSV file, forwarded to ``load_data``.

    Returns:
        The DataFrame returned by the last step.
    """
    frame = load_data(filepath)
    # Each stage takes the previous stage's frame and returns the next one.
    for stage in (clean_data, extract_salary, fill_missing_salaries):
        frame = stage(frame)
    return frame

# Run the whole preprocessing pipeline on the raw job-posts CSV as soon as this
# code executes. The path is relative, so it is resolved against the current
# working directory.
df = preprocess_data("../data/raw/job_posts.csv")
