In [143]:
#import libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from bs4 import BeautifulSoup

In [144]:
data = []
# NOTE(review): absolute, machine-specific path — this must be changed to run the notebook elsewhere.
folder = r"C:\Users\balin\Desktop\magnimind-assignments\NLP job_posting project\data\job_postings"

# Parse every saved posting (*.html) in `folder` into one record per file.
for filename in os.listdir(folder):
    if filename.endswith('.html'):
        full_path = os.path.join(folder, filename)
        with open(full_path, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'lxml')
            # A file can parse to a document with no <title> or no <body>
            # (e.g. an empty file); fall back to a default instead of
            # raising AttributeError on None.
            title = soup.title.text if soup.title else 'No Title'
            body = soup.body.get_text(separator='\n', strip=True) if soup.body else ''
            data.append({'filename': filename, 'title': title, 'body': body})

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(data)


In [145]:
# (rows, columns) of the parsed frame: one row per .html file read by the loading cell
df.shape

(1458, 3)

In [146]:
# Preview the first 10 parsed postings
df.head(10)

Unnamed: 0,filename,title,body
0,001b92395ed0fb62.html,"Data Scientist - Beavercreek, OH","Data Scientist - Beavercreek, OH\nData Scienti..."
1,00321a48d04fe754.html,"Data Scientist - Seattle, WA 98101","Data Scientist - Seattle, WA 98101\nAre you in..."
2,0079c11b2611349f.html,"Data Scientist - Newark, CA","Data Scientist - Newark, CA\nData Scientist\nN..."
3,007d9d7b5c09d820.html,Patient Care Assistant / PCA - Med/Surg (Fayet...,Patient Care Assistant / PCA - Med/Surg (Fayet...
4,00bf37ae19c7dfd7.html,"Data Manager / Analyst - Oakland, CA","Data Manager / Analyst - Oakland, CA\nHOW YOU ..."
5,0125eabc844281c9.html,"Scientific Programmer - Berkeley, CA","Scientific Programmer - Berkeley, CA\nCaribou ..."
6,014ae4dbded805d2.html,JD Digits - AI Lab Research Intern - Mountain ...,JD Digits - AI Lab Research Intern - Mountain ...
7,014ea972a4aa9812.html,Operations and Technology Summer 2020 Internsh...,Operations and Technology Summer 2020 Internsh...
8,0179ea131f141400.html,PwC Labs - Jr. Data Scientist - Machine Learni...,PwC Labs - Jr. Data Scientist - Machine Learni...
9,018866568cd5a0b0.html,"Data and Reporting Analyst - Olympia, WA 98501","Data and Reporting Analyst - Olympia, WA 98501..."


In [147]:
# First 1000 characters of the body text for one specific posting file
df.loc[df['filename'] == '0179ea131f141400.html', 'body'].iloc[0][:1000]

'PwC Labs - Jr. Data Scientist - Machine Learning (NLP) - Tampa, FL 33607\nPwC Labs is focused on standardizing, automating, delivering tools and processes and exploring emerging technologies that drive efficiency and enable our people to reimagine the possible. Process improvement, transformation, effective use of innovative technology and data & analytics, and leveraging alternative delivery solutions are key areas of focus to drive additional value for our firm. The AI Lab focuses on implementing solutions that impact efficiency and effectiveness of our technology functions. Process improvement, transformation, effective use of technology and data & analytics, and leveraging alternative delivery are key areas to drive value and continue to be recognized as the leading professional services firm. AI Lab is focused on identifying and prioritizing emerging technologies to get the most out of our investments.\nTo really stand out and make us ?t for the future in a constantly changing wo

In [148]:
# Summary of the string columns: count, number of unique values, most frequent value and its frequency
df.describe()


Unnamed: 0,filename,title,body
count,1458,1458,1458
unique,1458,1364,1457
top,_p.html,"Data Scientist - New York, NY","Physics Data Scientist - Foothill Ranch, CA\nT..."
freq,1,13,2


In [149]:
# Column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458 entries, 0 to 1457
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  1458 non-null   object
 1   title     1458 non-null   object
 2   body      1458 non-null   object
dtypes: object(3)
memory usage: 34.3+ KB


# Clean the data

In [150]:
# Split the title roughly into job title and location on the last separator (handle cases like at index 8)
split = df['title'].str.rsplit(' - ', n=1, expand=True)
# With expand=True, column 1 only exists when at least one title contains ' - ';
# reindex guarantees both columns so split[1] cannot raise KeyError.
split = split.reindex(columns=[0, 1])
df['job_title'] = split[0]
df['location']  = split[1].fillna('Unknown') # in case there is no location

In [151]:
# Drop the raw title and filename columns.
# Rebinding df (instead of inplace=True) with errors='ignore' keeps this cell
# safe to re-run: a second run no longer raises KeyError on the missing columns.
df = df.drop(columns=['title','filename'], errors='ignore')

In [152]:
# Spot-check 10 random rows (no random_state is passed, so the rows differ on every run)
df.sample(10)

Unnamed: 0,body,job_title,location
48,Research Associate - Data Science - Washington...,Research Associate - Data Science,"Washington, DC"
1031,"Physics Data Scientist - Foothill Ranch, CA\nT...",Physics Data Scientist,"Foothill Ranch, CA"
1109,"Data Scientist - San Francisco, CA\nOur compan...",Data Scientist,"San Francisco, CA"
1388,"Machine Learning Engineer - San Francisco, CA\...",Machine Learning Engineer,"San Francisco, CA"
585,"Operations Research Scientist - Boston, MA\nIn...",Operations Research Scientist,"Boston, MA"
1259,Sr Manufacturing Engineer - Medical Device - M...,Sr Manufacturing Engineer - Medical Device,"Memphis, TN"
1392,"Data Scientist, Mid - Alexandria, VA\nSecure o...","Data Scientist, Mid","Alexandria, VA"
838,"DEP Epidemiologist - Aberdeen Proving Ground, ...",DEP Epidemiologist,"Aberdeen Proving Ground, MD"
1071,"Principle Quality Engineer - Ames, IA\nAbout U...",Principle Quality Engineer,"Ames, IA"
921,"Data Analyst - Sunnyvale, CA\nAbout Clover:\nJ...",Data Analyst,"Sunnyvale, CA"


In [153]:
# Inspect the job title at row position 3 (capped at 1000 characters)
df['job_title'].iat[3][:1000]

'Patient Care Assistant / PCA - Med/Surg (Fayette, AL)'

In [157]:
def strip_header(row, missing_location='Unknown'):
    """Remove the leading "<job title> - <location>" header from a posting body.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide string entries under 'body', 'job_title' and 'location'.
    missing_location : str, default 'Unknown'
        Placeholder stored in 'location' when the title had no location part.
        For such rows the header to remove is the job title alone.

    Returns
    -------
    str
        The body without the header and without the newline(s) that directly
        followed it, or the body unchanged when no header is found.
    """
    body, job_title, location = row['body'], row['job_title'], row['location']
    header = f"{job_title} - {location}"

    # If body starts with the exact header, remove it cleanly
    if body.startswith(header):
        return body[len(header):].lstrip('\n')

    # Rows with the placeholder location can never match "<title> - Unknown",
    # so fall back to the bare title. Require a line boundary so that a longer
    # first line which merely begins with the title is left untouched.
    if location == missing_location and (
        body == job_title or body.startswith(job_title + '\n')
    ):
        return body[len(job_title):].lstrip('\n')

    # Otherwise keep the original text
    return body

In [156]:
# Random sample of 10 rows (no random_state is passed, so the rows change between runs).
# NOTE(review): this cell's execution count (156) is lower than that of the
# strip_header definition cell above (157), and the bodies shown still begin
# with the "<title> - <location>" header — strip_header does not appear to have
# been applied to df yet; confirm the intended cell order.
df.sample(10)

Unnamed: 0,body,job_title,location
1361,"QSR Store Assistant - Jacksonville, FL 32226\n...",QSR Store Assistant,"Jacksonville, FL 32226"
776,Certified Strength and Conditioning Specialist...,Certified Strength and Conditioning Specialist,United States
531,"Data Scientist - New York, NY 10010\nTeam Writ...",Data Scientist,"New York, NY 10010"
530,Senior Associate- Financial Consulting Valuati...,Senior Associate- Financial Consulting Valuati...,"New York, NY 10036"
1426,"Computational Biologist - Portland, OR\nFuncti...",Computational Biologist,"Portland, OR"
77,"Data Scientist, Analytics - Ads Infrastructure...","Data Scientist, Analytics - Ads Infrastructure","Seattle, WA 98101"
371,Industrial Engineering - Data Science - Carlsb...,Industrial Engineering - Data Science,"Carlsbad, CA 92009"
486,"Model Risk Manager - San Francisco, CA\nCurren...",Model Risk Manager,"San Francisco, CA"
761,Quantitative Consumer and Product Modeler - Ci...,Quantitative Consumer and Product Modeler,"Cincinnati, OH 45201"
555,"Data Scientist - Product BURLINGAME, CALIFORNI...","Data Scientist - Product BURLINGAME, CALIFORNI...","Burlingame, CA 94010"


K-means: fast and easy to apply; works well when clusters are compact, spherical, and similar in size.
Expectation-Maximisation (EM): often used with Gaussian mixture models; handles clusters with different shapes, sizes, and orientations.
- Consider: scaling the data, applying PCA before clustering, and selecting the number of clusters using BIC, AIC, or the silhouette score.