In [1]:
import pandas as pd
import numpy as np
import re
import sys

sys.path.append("..")
from src import (parse_salary_row, extract_skills, SOFT_SKILL_ALIASES, TECHNICAL_SKILL_ALIASES, 
title_pattern, desc_pattern, exclude_pattern, build_alias_lookup,extract_soft_skills, build_skill_regex)

## Load in datasets (from multiple sources)

In [57]:
# https://www.kaggle.com/datasets/lukebarousse/data-analyst-job-postings-google-search
# taken from google search
df = pd.read_csv("../data/gsearch_jobs.csv")

# https://www.kaggle.com/datasets/rashikrahmanpritom/data-science-job-posting-on-glassdoor?select=Cleaned_DS_Jobs.csv
# all from glassdoor
df2 = pd.read_csv("../data/Cleaned_DS_Jobs_2020.csv")

# https://www.kaggle.com/datasets/elahehgolrokh/data-science-job-postings-with-salaries-2025
#df2 = pd.read_csv("data_science_job_posts_2025.csv")

### First dataframe cleaning

let's take this dataframe and extract entry-level/new-grad positions using key words found in either the job title or the description

In [3]:
# Query out non entry level jobs through text matching
df["is_entry_level"] = (
    df["title"].str.lower().str.contains(title_pattern, regex=True, na=False)
    |
    df["description"].str.lower().str.contains(desc_pattern, regex=True, na=False)
)

df["is_entry_level"] = df["is_entry_level"] & (
    ~df["title"].str.lower().str.contains(exclude_pattern, regex=True, na=False)
)

entry_jobs = df[df["is_entry_level"]].drop("is_entry_level",axis=1).reset_index(drop=True)

  df["title"].str.lower().str.contains(title_pattern, regex=True, na=False)
  df["description"].str.lower().str.contains(desc_pattern, regex=True, na=False)
  ~df["title"].str.lower().str.contains(exclude_pattern, regex=True, na=False)


In [4]:
#extract the year
entry_jobs["date_time"] = pd.to_datetime(df["date_time"])
entry_jobs["year"] = entry_jobs["date_time"].dt.year

In [5]:
# columns we will be keeping
col_keep = ['title', "year","via","salary_pay","salary_rate","description","description_tokens","location","work_from_home"]

In [6]:
entry_jobs_cleaned = entry_jobs[col_keep].copy()

# via column = source of job post
# change NaN values to be False
entry_jobs_cleaned['via'] = entry_jobs_cleaned['via'].str.strip('via ')
entry_jobs_cleaned['work_from_home'] = entry_jobs_cleaned['work_from_home'].fillna(False)

  entry_jobs_cleaned['work_from_home'] = entry_jobs_cleaned['work_from_home'].fillna(False)


In [7]:
#cleaning salary, works for both hourly, monthly, and yearly salary text
entry_jobs_cleaned[["salary_min_annual", "salary_max_annual", "salary_type"]] = df.apply(
    lambda r: parse_salary_row(r["salary_pay"], r["salary_rate"]),
    axis=1
)

In [8]:
entry_cleaned = entry_jobs_cleaned.drop(["salary_pay","salary_rate"],axis = 1)

In [67]:
entry_cleaned

Unnamed: 0,title,year,via,description,location,work_from_home,salary_min_annual,salary_max_annual,salary_type,tech_skills
0,Data Analyst,2023,LinkedIn,Job Title: Entry Level Business Analyst / Prod...,United States,False,101000.0,143000.0,annual,[]
1,Associate Research/Data Analyst-CES - Now Hiring,2023,Snagajob,Why you'll love working for this Department:\n...,"Jefferson City, MO",False,,,,[]
2,Data Analyst,2023,BeBee,"Status:\nFull Time, Exempt\n...\nLocation:\nPh...","Ardmore, OK",False,,,,"[sql, r, python, excel, sap, tableau, cognos, ..."
3,GIS Data Analyst,2023,BeBee,Description\n\nLeidos is seeking qualified can...,United States,False,31200.0,52000.0,hourly,"[sql, python]"
4,GIS Analyst I,2023,ZipRecruiter,Overview:\n\nGIS Analysts utilize GIS data fro...,"Oklahoma City, OK",False,90000.0,110000.0,annual,"[word, excel, outlook]"
...,...,...,...,...,...,...,...,...,...,...
4911,Data Analyst Intern - Undergraduate,2024,My ArkLaMiss Jobs,You could be the one who changes everything fo...,"Columbia, MO",False,,,,[sql]
4912,Data Analyst Intern,2024,My ArkLaMiss Jobs,First Orion is actively seeking a Data Analyst...,"Benton, AR",False,,,,"[data mining, sql, nosql, java, scala, python,..."
4913,Financial Data Analyst,2024,My ArkLaMiss Jobs,The Financial Data Analyst is primarily respon...,"Edmond, OK",False,,,,"[sql, python]"
4914,Data Analyst Intern (Undergraduate),2024,My ArkLaMiss Jobs,You could be the one who changes everything fo...,"Columbia, MO",False,,,,[sql]


Let's now extract skills from the description (both technical and soft). We already have a column "description_tokens" provided for us but let's see if we can get something more comprehensive before we drop it

In [10]:
# just checking what skills they included. Could be useful to add to my running list
# entry_cleaned.description_tokens
# unique_skills = entry_cleaned.description_tokens.apply(lambda x: x[1:-1].split(','))
# skills = set(
#     skill
#     for sublist in unique_skills
#     for skill in sublist
# )

# skills

In [11]:
alias_lookup = build_alias_lookup(TECHNICAL_SKILL_ALIASES)
skill_pattern = build_skill_regex(TECHNICAL_SKILL_ALIASES)

tech_skills = entry_cleaned["description"].apply(
    lambda txt: extract_skills(txt, alias_lookup, skill_pattern)
)

In [12]:
entry_cleaned['tech_skills'] = tech_skills
entry_cleaned = entry_cleaned.drop("description_tokens", axis=1)

In [13]:
soft_skills = entry_cleaned['description'].apply(lambda txt: extract_soft_skills(txt))

In [14]:
from collections import Counter
soft_skills_count = Counter(
    skill
    for lst in soft_skills.dropna()
    for skill in lst
)

In [15]:
soft_skills_count

Counter({'communication': 2383,
         'collaboration': 1720,
         'problem solving': 1400,
         'attention to detail': 1367,
         'teamwork': 918,
         'adaptability': 748,
         'initiative': 567,
         'time management': 540,
         'self-motivation': 440,
         'process improvement': 414,
         'critical thinking': 387,
         'project management': 367,
         'learning agility': 357,
         'analytical thinking': 356,
         'decision making': 333,
         'ownership': 325,
         'business acumen': 305,
         'leadership': 261,
         'coaching': 233,
         'networking': 230,
         'presentation skills': 197,
         'customer focus': 187,
         'continuous improvement': 162,
         'work ethic': 127,
         'relationship building': 90,
         'mentoring': 76,
         'storytelling': 72,
         'resilience': 72,
         'emotional intelligence': 66,
         'conflict resolution': 62,
         'strategic thinking

_____________________

In [16]:
tech_skills.iloc[4710]#.iloc[5:20].loc[15]

['python', 'excel', 'sql', 'github']

In [17]:
entry_cleaned.location.unique()

array(['  United States   ', '  Jefferson City, MO   ',
       '  Ardmore, OK   ', '  Oklahoma City, OK   ', ' Anywhere ',
       '  Harrison, AR   ', '  Missouri   ', '  Springfield, MO   ',
       '  Wichita, KS   ', '  Kansas City, MO   ', '  California, MO   ',
       '  Edmond, OK   ', '  Russellville, AR   ', 'Anywhere',
       'Denver, CO', '  Bentonville, AR   ', '  Tulsa, OK   ',
       '  Bartlesville, OK   ', '  Cave Springs, AR   ',
       '  Elm Springs, AR   ', 'Cheyenne, WY', 'Colorado Springs, CO',
       'Franktown, CO', 'Greenwood Village, CO', 'United States',
       'Westminster, CO', 'Brighton, CO', 'Centennial, CO',
       'Commerce City, CO', 'Edwards, CO', 'Lone Tree, CO', 'Pueblo, CO',
       'Thornton, CO', 'Windsor, CO', 'Allenspark, CO', 'Boulder, CO',
       'Santa Fe, NM', 'La Junta, CO', 'Englewood, CO', '  Topeka, KS   ',
       '  Lenexa, KS   ', '  Leawood, KS   ', '  Kansas City, KS   ',
       '  Fayetteville, AR   ', 'Colorado', 'Irvine, CA', 'Colum

In [18]:
entry_cleaned.description_tokens.iloc[4710]

AttributeError: 'DataFrame' object has no attribute 'description_tokens'

In [None]:
import ast

entry_cleaned["description_tokens"] = entry_cleaned["description_tokens"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

In [None]:
entry_cleaned["description_tokens"]

In [None]:
# from collections import Counter

# tech_skill_counts = Counter(
#     skill
#     for lst in tech_skills
#     for skill in lst
# )

# token_skill_counts = Counter(
#     skill
#     for lst in entry_cleaned["description_tokens"].dropna()
#     for skill in lst
# )

In [None]:
# tech_skills.to_csv('our_cleaned.csv', index=True, header=True)

In [None]:
# sum(tech_skill_counts.values())

In [None]:
# sum(token_skill_counts.values())

In [None]:
# entry_cleaned.description_tokens.iloc[14]

In [None]:
# i = 113

# print("OLD:", entry_cleaned.loc[i, "description_tokens"])
# print("NEW:", tech_skills.iloc[i])

In [None]:
# entry_cleaned.description.iloc[98]

________________________________________

In [58]:
df2["is_entry_level"] = (
    df2["Job Title"].str.lower().str.contains(title_pattern, regex=True, na=False)
    |
    df2["Job Description"].str.lower().str.contains(desc_pattern, regex=True, na=False)
)

df2["is_entry_level"] = df2["is_entry_level"] & (
    ~df2["Job Title"].str.lower().str.contains(exclude_pattern, regex=True, na=False)
)

  df2["Job Title"].str.lower().str.contains(title_pattern, regex=True, na=False)
  df2["Job Description"].str.lower().str.contains(desc_pattern, regex=True, na=False)


In [59]:
entry_jobs2 = df2[df2["is_entry_level"]]

In [64]:
entry_jobs2['Job Description'].iloc[3]

"Puget Sound Energy is looking to grow our community with like-minded, top talented individuals like you! With our rapidly growing, award winning energy efficiency programs, our pathway to an exciting and innovative future is now.\n\nPSE's IT Application Solutions team is looking for qualified candidates to fill an open Associate Data Scientist position!\nJob Description\nPuget Sound Energy is an electric and gas utility which provides homes and businesses throughout the Northwest. In order to meet and anticipate our customers’ needs, our Data Services team is expanding its team to include machine learning and data science technologies. In order to accomplish this we have an urgent need for an experienced Associate Data Scientist to work in our analytical Community of Practice, document internal standards, and to support our functional business areas.\nJob Responsibilities\nIn addition to leading the development of our community, the Associate Data Scientist will be responsible for ass

In [None]:
#entry_jobs2

In [None]:
# test[["salary_min_annual", "salary_max_annual", "salary_type"]] = df.apply(
#     lambda r: parse_salary_row(r["salary_pay"], r["salary_rate"]),
#     axis=1
# )


In [None]:
yes = df2['Job Description'].apply(lambda txt: extract_soft_skills(txt))

In [None]:
yes2 = Counter(
    skill
    for lst in yes.dropna()
    for skill in lst
)

In [2]:
#JOB POSTINGS FROM 2019
#https://www.kaggle.com/datasets/jobspikr/data-scientist-job-postings-from-the-usa
df3 = pd.read_csv("../data/data_scientist_united_states_job_postings_jobspikr.csv")

In [3]:
df3["is_entry_level"] = (
    df3["job_title"].str.lower().str.contains(title_pattern, regex=True, na=False)
    |
    df3["job_description"].str.lower().str.contains(desc_pattern, regex=True, na=False)
)

df3["is_entry_level"] = df3["is_entry_level"] & (
    ~df3["job_title"].str.lower().str.contains(exclude_pattern, regex=True, na=False)
)

  df3["job_title"].str.lower().str.contains(title_pattern, regex=True, na=False)
  df3["job_description"].str.lower().str.contains(desc_pattern, regex=True, na=False)
  ~df3["job_title"].str.lower().str.contains(exclude_pattern, regex=True, na=False)


In [4]:
testing_df3 = df3[df3['is_entry_level']]

In [5]:
testing_df3

Unnamed: 0,crawl_timestamp,url,job_title,category,company_name,city,state,country,inferred_city,inferred_state,...,job_type,salary_offered,job_board,geo,cursor,contact_email,contact_phone_number,uniq_id,html_job_description,is_entry_level
3,2019-02-06 05:33:42 +0000,https://www.indeed.com/viewjob?jk=841edd86ead2...,"Data Scientist, Aladdin Wealth Tech, Associate...",Accounting/Finance,BlackRock,New York,NY 10055 (Midtown area),Usa,New york,New york,...,Undefined,,indeed,usa,1549432819259473,,,1c8541cd2c2c924f9391c7d3f526f64e,,True
8,2019-02-06 05:34:18 +0000,https://www.indeed.com/viewjob?jk=0fc298b9f3a8...,"Data Scientist, Aladdin Wealth Tech, Associate",Accounting/Finance,BlackRock,New York,NY 10055 (Midtown area),Usa,New york,New york,...,Undefined,,indeed,usa,1549436429066810,,,80d64b46bc7c89602f63daf06b9f1b4c,,True
42,2019-02-06 08:37:59 +0000,https://www.indeed.com/viewjob?jk=97103b53b4e3...,"Data Scientist Intern, Engineering - Software ...",Engineering/Architecture,Criteo,Palo Alto,CA 94301 (Professorville area),Usa,Palo alto,California,...,Undefined,,indeed,usa,1549447222845801,,,2775ed23918410617dbc6cad9886e455,,True
48,2019-02-07 05:29:42 +0000,https://www.dice.com/jobs/detail/Jr-Data-Scien...,Jr Data Scientist,,Hamilton Technical Personnel,New York,NY,Usa,New york,New york,...,Full Time,,dice,usa,1549519218250814,,,e8c1c747edb8003aec87a7f17f21a216,,True
65,2019-02-07 06:23:48 +0000,https://www.indeed.com/viewjob?jk=0141013576c9...,Junior Data Scientist,Computer/Internet,Cisco Careers,Research Triangle Park,NC,Usa,,North carolina,...,Undefined,,indeed,usa,1549522825712152,,,ea67f8f42a6fa593ce7acefddd8aef2e,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9898,2019-09-29 06:52:26 +0000,https://www.theladders.com/job/associate-data-...,"Associate Data Scientist in Grand Rapids, MI",Associate Data Scientist,Daymon Worldwide,Grand Rapids,MI,US,Grand rapids,Michigan,...,Full Time,$80K - $100K,,,1569740470122411,,,d742a7e6a4be1a979c136336c5b86a41,"<div class=""job-description job-description-te...",True
9921,2019-09-29 06:39:14 +0000,https://www.theladders.com/job/data-scientist-...,Data Scientist / Machine Learning Research Pro...,Data Scientist / Machine Learning Research Pro...,Siemens,Princeton,NJ,US,Princeton,New jersey,...,Full Time,$80K - $100K,,,1569744055622923,,,9a51ee6e57f1e95fe09ce613877e8572,"<div class=""job-description job-description-te...",True
9944,2019-09-30 00:57:44 +0000,https://www.careerbuilder.com/job/J3V4ZJ70LJZW...,Data Scientist (PhD) - Intern,,ExxonMobil,Spring,TX,US,Spring,Texas,...,Full Time,,careerbuilder,usa,1569808862645778,,,f381eab96775a94239494fe8450958fb,<strong>Job Description</strong>\n<b>Job Summa...,True
9971,2019-10-01 02:55:23 +0000,https://job-openings.monster.com/associate-dat...,Associate Data Scientist,computer jobs,AIC,Neptune Beach,FL,Usa,Neptune beach,Florida,...,Full Time,,monster,usa,1569902445785343,,,51505c25d32452edab456cef25e4e898,<span id='TrackingJobBody' name='TrackingJobBo...,True
