In [201]:
import pandas as pd
import numpy as np
import re

In [211]:
# Keyword vocabularies (suggested by ChatGPT) reserved for later keyword
# matching; they are meant to drive the soft-skill and technical-skill columns.
soft_skills = [
    "communication", "analytical", "leadership", "initiative", "presentation",
    "flexible", "collaboration",
    "organizational", "work independently", "interpersonal", "teamwork",
    "time management", "critical thinking",
    "problem solving", "detail oriented", "adaptability", "customer focused",
    "multitasking", "self motivated",
]

technical_skills = [
    "sql", "excel", "python", "statistics", "tableau", "power bi", "r",
    "machine learning", "java", "etl", "sas", "aws",
    "azure", "gcp", "snowflake", "databricks", "redshift", "bigquery",
    "spark", "hadoop", "docker", "kubernetes",
    "mysql", "postgres", "git", "linux", "unix", ".net", "react", "node",
    "javascript", "api", "rest", "graphql",
    "tensorflow", "pytorch", "scikit", "scikit-learn", "numpy", "pandas",
    "matlab", "c++", "c#",
]

# Regexes used to find entry-level positions. They are applied to lower-cased
# text, so every alternative is written in lower case.
#
# Word boundaries (\b) and digit look-arounds keep a keyword from matching
# inside a longer token: without them "1-2" matches "2021-2022", "level i"
# matches "level iii", "sr" matches "classroom" and "3+" matches "13+".
# Only non-capturing groups are used, so the patterns can be passed to
# Series.str.contains without triggering the "match groups" warning.

# Title keywords that suggest a junior role. "entry" on its own already covers
# "entry level" / "entry-level".
title_pattern = r"\b(?:entry|junior|jr|associates?|level (?:i|1)|trainees?|apprentices?)\b"

# Description hints: short experience ranges (not embedded in longer numbers
# such as years) or new-graduate phrasing. The phrases are anchored only at the
# start so plurals/longer forms ("recent graduates", "new graduate") still hit.
desc_pattern = (
    r"(?:(?<!\d)(?:0-1|0-2|1-2)(?!\d)"
    r"|\b(?:recent graduate|new grad|entry level|graduate program|early career))"
)

# Title keywords that veto an entry-level flag: seniority words as whole words,
# or an "N+" experience requirement that is not the tail of a larger number.
exclude_pattern = (
    r"(?:\b(?:senior|sr|leads?|leaders?|principal|managers?|directors?|architects?)\b"
    r"|(?<!\d)(?:3|5|7|10)\+)"
)

In [None]:
# Factors used to annualize pay quoted per hour or per month.
# 2080 = 40 * 52 -- presumably a full-time 40-hour week; confirm if it matters.
HOURS_PER_YEAR = 2080
MONTHS_PER_YEAR = 12

def parse_salary_row(pay, rate):
    """Turn one (pay, rate) pair into annualized salary bounds.

    Parameters
    ----------
    pay : object
        Raw pay text; only its digits (with optional decimals) are read.
        Commas are dropped, an en dash is treated as a hyphen, and if the
        upper-cased text contains "K" anywhere both bounds are multiplied
        by 1000.
    rate : object
        Pay period. Strings are compared case-insensitively: "an hour" and
        "a month" are annualized; anything else (including NaN) is treated
        as already annual.

    Returns
    -------
    pandas.Series
        Three unlabeled values: annual minimum, annual maximum and one of
        "hourly" / "monthly" / "annual". All three are NaN when ``pay`` is
        missing or contains no number.
    """
    if pd.isna(pay):
        return pd.Series([np.nan] * 3)

    cleaned = str(pay).replace("–", "-").replace(",", "").strip().upper()
    amounts = [float(token) for token in re.findall(r"\d+\.?\d*", cleaned)]
    if not amounts:
        return pd.Series([np.nan] * 3)

    # A single figure serves as both bounds; figures past the second are ignored.
    scale = 1000 if "K" in cleaned else 1
    low = amounts[0] * scale
    high = (amounts[1] if len(amounts) > 1 else amounts[0]) * scale

    period = rate.lower() if isinstance(rate, str) else rate
    if period == "an hour":
        label, factor = "hourly", HOURS_PER_YEAR
    elif period == "a month":
        label, factor = "monthly", MONTHS_PER_YEAR
    else:
        # 'a year', NaN or any unrecognized value: taken as annual already.
        label, factor = "annual", 1

    return pd.Series([low * factor, high * factor, label])


# Add annualized salary columns to `test`.
# The parser must be applied to `test` itself: `test` is a filtered copy of
# `df` whose index has been reset, so a result computed from `df` would be
# aligned on index labels and attach salaries from unrelated rows.
# NOTE: `test` is created in a later cell of this notebook, so this cell only
# works once that cell has been run.
test[["salary_min_annual", "salary_max_annual", "salary_type"]] = test.apply(
    lambda row: parse_salary_row(row["salary_pay"], row["salary_rate"]),
    axis=1,
)


In [159]:
# Source 1 -- postings taken from Google search results.
# https://www.kaggle.com/datasets/lukebarousse/data-analyst-job-postings-google-search
GSEARCH_CSV = "gsearch_jobs.csv"
df = pd.read_csv(GSEARCH_CSV)

# Source 2 -- postings that all come from Glassdoor.
# https://www.kaggle.com/datasets/rashikrahmanpritom/data-science-job-posting-on-glassdoor?select=Cleaned_DS_Jobs.csv
GLASSDOOR_CSV = "Cleaned_DS_Jobs_2020.csv"
df2 = pd.read_csv(GLASSDOOR_CSV)

# Alternative second source, currently disabled:
# https://www.kaggle.com/datasets/elahehgolrokh/data-science-job-postings-with-salaries-2025
#df2 = pd.read_csv("data_science_job_posts_2025.csv")

Let's take this DataFrame and extract entry-level/new-grad positions using keywords found in either the job title or the description.

In [160]:
# Flag a posting when its lower-cased title matches title_pattern or its
# lower-cased description matches desc_pattern. Missing text counts as a
# non-match (na=False).
title_hit = df["title"].str.lower().str.contains(title_pattern, regex=True, na=False)
desc_hit = df["description"].str.lower().str.contains(desc_pattern, regex=True, na=False)
df["is_entry_level"] = title_hit | desc_hit

In [161]:
# Clear the flag for any posting whose lower-cased title matches
# exclude_pattern; a missing title counts as a non-match (na=False).
senior_title = df["title"].str.lower().str.contains(exclude_pattern, regex=True, na=False)
df["is_entry_level"] = df["is_entry_level"] & ~senior_title

In [162]:
# Keep only the flagged rows; the helper flag column is dropped afterwards.
entry_jobs = df.loc[df["is_entry_level"]].drop(columns="is_entry_level")

In [163]:
# Parse the timestamp column of the filtered frame itself (rather than parsing
# every row of `df` and relying on index alignment to discard the rest), then
# derive the calendar year from it.
entry_jobs["date_time"] = pd.to_datetime(entry_jobs["date_time"])
entry_jobs["year"] = entry_jobs["date_time"].dt.year

In [196]:
# Columns carried forward into the trimmed working frame.
col_keep = [
    "title",
    "year",
    "via",
    "salary_pay",
    "salary_rate",
    "description",
    "description_tokens",
    "location",
    "work_from_home",
]

In [197]:
# Show the column labels of entry_jobs (the bare expression is the cell output).
entry_jobs.columns

Index(['Unnamed: 0', 'index', 'title', 'company_name', 'location', 'via',
       'description', 'extensions', 'job_id', 'thumbnail', 'posted_at',
       'schedule_type', 'work_from_home', 'salary', 'search_term', 'date_time',
       'search_location', 'commute_time', 'salary_pay', 'salary_rate',
       'salary_avg', 'salary_min', 'salary_max', 'salary_hourly',
       'salary_yearly', 'salary_standardized', 'description_tokens', 'year'],
      dtype='object')

In [198]:
# Working frame: only the columns listed in col_keep, renumbered from 0.
test = entry_jobs.loc[:, col_keep].reset_index(drop=True)

In [214]:
# Remove the leading "via " label from the source name.
# str.strip('via ') is not a prefix removal: it strips any of the characters
# 'v', 'i', 'a' and ' ' from BOTH ends, so "via Adzuna" became "Adzun".
# An anchored regex removes only the prefix; the final strip() trims
# surrounding whitespace. Re-running the cell leaves the values unchanged.
test["via"] = test["via"].str.replace(r"^\s*via\s+", "", regex=True).str.strip()

In [205]:
# Export frame: the working frame without the raw salary text columns.
entry_cleaned = test.drop(columns=["salary_pay", "salary_rate"])

In [208]:
# Write the cleaned frame to disk without the index column.
entry_cleaned.to_csv(path_or_buf="my_data.csv", index=False)

In [168]:
def _entry_level_mask(frame, title_col, desc_col):
    """Return a boolean Series marking the entry-level rows of ``frame``.

    A row is marked when its lower-cased title matches ``title_pattern`` or
    its lower-cased description matches ``desc_pattern``, unless the title
    also matches ``exclude_pattern``. Missing text counts as a non-match.
    """
    titles = frame[title_col].str.lower()
    descriptions = frame[desc_col].str.lower()
    looks_junior = (
        titles.str.contains(title_pattern, regex=True, na=False)
        | descriptions.str.contains(desc_pattern, regex=True, na=False)
    )
    looks_senior = titles.str.contains(exclude_pattern, regex=True, na=False)
    return looks_junior & ~looks_senior


# Same entry-level rule as for the first dataset, using this dataset's column names.
df2["is_entry_level"] = _entry_level_mask(df2, "Job Title", "Job Description")

In [212]:
#entry_jobs2 = df2[df2["is_entry_level"]]

In [213]:
#entry_jobs2