# Import libraries

In [None]:
# Core libraries
import pandas as pd
import numpy as np

# NLP libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# ML / NLP
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK corpora used below: the stop-word lists and WordNet (for the lemmatizer)
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Load the Dataset

In [None]:
# Read the job postings CSV into a DataFrame (path is resolved against the working directory)
DATA_PATH = 'all_job_post.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows to sanity-check the load
df.head(n=5)

# Select & Clean Required Columns

In [None]:
# Keep only the columns used downstream
required_columns = ['job_title', 'job_description', 'job_skill_set']
df = df[required_columns]

In [None]:
# Discard rows with any missing value, then renumber the index from 0
# so positional and label-based lookups agree afterwards.
df = df.dropna().reset_index(drop=True)

# Text Preprocessing Function (NLP Core)

Raw text is messy.
NLP models cannot understand raw text.

This step:

- removes noise
- standardizes text
- improves similarity accuracy

In [None]:
# Shared NLP resources used by clean_text: a WordNet lemmatizer and the
# English stop-word list held as a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

In [None]:
def clean_text(text):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    The text is lowercased, every run of non-letter characters is replaced
    by a single space, the result is split on whitespace, English stop
    words are dropped, and each remaining token is lemmatized.

    Non-letters are replaced with a space rather than deleted so that words
    separated only by punctuation, digits, tabs or newlines (for example
    "python/sql" or "etl\\npipelines") remain separate tokens instead of
    being fused into one meaningless token.

    Relies on the module-level ``stop_words`` set and ``lemmatizer``.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string, tokens joined by single spaces.
    """
    text = text.lower()
    # After lowercasing only a-z can be letters we keep; everything else
    # (including tabs/newlines) becomes a token boundary.
    text = re.sub(r'[^a-z]+', ' ', text)
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

# Apply Text Cleaning

In [None]:
# Cast every skill-set entry to its string representation so it can be
# concatenated with the title text (entries are presumably list-like — confirm against the CSV).
df['job_skill_set'] = df['job_skill_set'].astype(str)

In [None]:
# Build one text field per job: the title followed by the skill set twice
# (the repetition gives skills extra weight), then normalize it.
title_text = df['job_title']
skill_text = df['job_skill_set']
combined_text = title_text + " " + skill_text + " " + skill_text

df['clean_text'] = combined_text.apply(clean_text)

In [None]:
# Inspect the cleaned text of the row labelled 0
df.loc[0, 'clean_text']

# Convert Text → Numbers (TF-IDF)

In [None]:
# Cap the vocabulary size, then learn the TF-IDF model from the cleaned
# job text and encode every job as a row of the resulting matrix.
MAX_TFIDF_FEATURES = 3000
job_corpus = df['clean_text']

tfidf = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
job_vectors = tfidf.fit_transform(job_corpus)

# Save the fitted vectorizer, job vectors, and DataFrame

In [None]:
import joblib

# Persist the fitted vectorizer, the job TF-IDF matrix and the processed
# DataFrame so they can be reloaded later without recomputing.
artifacts = {
    "tfidf.pkl": tfidf,
    "job_vectors.pkl": job_vectors,
    "jobs_df.pkl": df,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)