In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [2]:
df1 = pd.read_csv("../data/job_skills.csv")
len(df1)
print(df1.head())
print(df1.dtypes)

                                            job_link  \
0  https://www.linkedin.com/jobs/view/housekeeper...   
1  https://www.linkedin.com/jobs/view/assistant-g...   
2  https://www.linkedin.com/jobs/view/school-base...   
3  https://www.linkedin.com/jobs/view/electrical-...   
4  https://www.linkedin.com/jobs/view/electrical-...   

                                          job_skills  
0  Building Custodial Services, Cleaning, Janitor...  
1  Customer service, Restaurant management, Food ...  
2  Applied Behavior Analysis (ABA), Data analysis...  
3  Electrical Engineering, Project Controls, Sche...  
4  Electrical Assembly, Point to point wiring, St...  
job_link      object
job_skills    object
dtype: object


In [3]:
df2 = pd.read_csv("../data/linkedin_job_postings.csv")
len(df2)
print(df2.head())
print(df2.dtypes)

                                            job_link  \
0  https://www.linkedin.com/jobs/view/account-exe...   
1  https://www.linkedin.com/jobs/view/registered-...   
2  https://www.linkedin.com/jobs/view/restaurant-...   
3  https://www.linkedin.com/jobs/view/independent...   
4  https://www.linkedin.com/jobs/view/group-unit-...   

             last_processed_time got_summary got_ner is_being_worked  \
0   2024-01-21 07:12:29.00256+00           t       t               f   
1   2024-01-21 07:39:58.88137+00           t       t               f   
2  2024-01-21 07:40:00.251126+00           t       t               f   
3  2024-01-21 07:40:00.308133+00           t       t               f   
4  2024-01-19 09:45:09.215838+00           f       f               f   

                                           job_title  \
0  Account Executive - Dispensing (NorCal/Norther...   
1                 Registered Nurse - RN Care Manager   
2               RESTAURANT SUPERVISOR - THE FORKLIFT   
3     

In [4]:
c=0
df2_links = set(df2["job_link"])
for link in df1["job_link"]:
  if link not in df2_links:
    c +=1

print(c)

0


In [5]:
c=0
df1_links = set(df1["job_link"])
for link in df2["job_link"]:
  if link not in df1_links:
    c +=1

print(c)

52073


**removing non-overlapping entries from df2**

In [6]:
keep_links = set(df1["job_link"])
df2 = df2[df2["job_link"].isin(keep_links)].reset_index(drop=True)


## Data Prep
Working on `job_skills.csv` & `linkedin_job_postings` to create a dataset giving skills for a job title through the common linkage of `job_link` in both the datasets

Removing jobs from df2 that are not entry-level

In [7]:
df2_na = df2[df2["job_level"] == "Associate"].reset_index(drop=True)
import re

# patterns that indicate non-entry roles
non_entry_terms = [
    "manager", "principal", "supervisor", "specialist", "senior", "lead",
    "director", "vp", "head", r"sr\.?",   # matches "Sr" or "Sr."
]

# build a case-insensitive regex with word boundaries
non_entry_pattern = r"(?i)\b(" + "|".join(non_entry_terms) + r")\b"

# keep only rows whose job_title does NOT contain any of these terms
df2_assoc = df2_na[~df2_na["job_title"].str.contains(non_entry_pattern, na=False)].reset_index(drop=True)

df2_assoc.head(), len(df2_assoc)


  df2_assoc = df2_na[~df2_na["job_title"].str.contains(non_entry_pattern, na=False)].reset_index(drop=True)


(                                            job_link  \
 0  https://www.linkedin.com/jobs/view/special-age...   
 1  https://www.linkedin.com/jobs/view/control-sys...   
 2  https://mx.linkedin.com/jobs/view/sewer-at-str...   
 3  https://www.linkedin.com/jobs/view/kitchen-pre...   
 4  https://www.linkedin.com/jobs/view/development...   
 
              last_processed_time got_summary got_ner is_being_worked  \
 0  2024-01-21 08:08:21.308995+00           t       t               f   
 1  2024-01-21 08:08:24.021822+00           t       t               f   
 2  2024-01-21 02:01:09.882561+00           t       t               f   
 3  2024-01-21 06:01:14.503312+00           t       t               f   
 4  2024-01-19 14:39:08.158402+00           t       t               f   
 
                               job_title  \
 0   Special Agent: Law/Legal Background   
 1  Control Systems Integration Engineer   
 2                                 Sewer   
 3                          Kitchen Prep

In [8]:
# keep only the columns we need from df2
df2_small = df2_assoc[["job_link", "job_title"]]

# merge on job_link, then select final columns
df3 = (
    df1.merge(df2_small, on="job_link", how="inner")
       [["job_title", "job_skills"]]
)

#reset index
df3 = df3.reset_index(drop=True)
df3

Unnamed: 0,job_title,job_skills
0,Performance Analyst ( Material) – Analyste per...,"SAP, DRMIS, Data warehousing, Data analysis, D..."
1,"Analyst, Capital Markets","Debt and equity offering memorandums, Financia..."
2,Laboratory Technician,"Laboratory Technician, Pharmaceutical Testing,..."
3,Insurance Analyst,"Excel, Risk Management, Property Management, I..."
4,Part Time Sales Associate- CALVIN KLEIN,"Customer service, Suggestive selling, UPT, Con..."
...,...,...
110751,Newborn Hearing Screen Technician,"Newborn Hearing Screen Technician, Audiometry,..."
110752,Maintenance Operator,"Maintenance, Installation, Repair, Troubleshoo..."
110753,Industrial Services Laborer,"Power Washing, Hydro blasting, Digging, Sweepi..."
110754,Contract Compliance Officer – Waste Management,IOSH Managing Safely or equivalent Health & Sa...


creating a new dataset that has Associate & entry job level for both dataset!

In [9]:
# keep only the columns we need from df2
df2_small = df2_assoc[["job_link", "job_title"]]

# merge on job_link, then select final columns
df3 = (
    df1.merge(df2_small, on="job_link", how="inner")
       [["job_title", "job_skills"]]
)

#reset index
df3 = df3.reset_index(drop=True)
df3.head()

Unnamed: 0,job_title,job_skills
0,Performance Analyst ( Material) – Analyste per...,"SAP, DRMIS, Data warehousing, Data analysis, D..."
1,"Analyst, Capital Markets","Debt and equity offering memorandums, Financia..."
2,Laboratory Technician,"Laboratory Technician, Pharmaceutical Testing,..."
3,Insurance Analyst,"Excel, Risk Management, Property Management, I..."
4,Part Time Sales Associate- CALVIN KLEIN,"Customer service, Suggestive selling, UPT, Con..."


In [10]:
df3.to_csv("../data/cleaned_data.csv", index=False)