In [2]:
# Import necessary libraries
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
# Fetch the NLTK 'stopwords' corpus used later by stopwords.words('english').
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /Users/akshu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Read the parquet dataset at `file_path` into the working DataFrame `data`.
# NOTE(review): this is an absolute, user-specific path; the notebook will not
# run elsewhere until it is pointed at a configurable data directory.
file_path = '/Users/akshu/career_counseling_system/Datasets/clean_data.parquet'
data = pd.read_parquet(path=file_path)

In [4]:
# Drop rows where either text field is missing, then report the row count
# immediately so the figure reflects NaN removal only.
data.dropna(subset=['job_title', 'job_desc'], inplace=True)
print(f"Rows after dropping NaNs: {data.shape[0]}")

# Remove rows that repeat the same (job_title, job_desc) pair, keeping the
# first occurrence, and report the count after this second step.
data.drop_duplicates(subset=['job_title', 'job_desc'], inplace=True)
print(f"Rows after dropping duplicates: {data.shape[0]}")


Rows after dropping NaNs: 1415417
Rows after dropping duplicates: 1415417


In [5]:
from nltk.stem import WordNetLemmatizer

# WordNetLemmatizer reads the 'wordnet' corpus; without it the first call to
# lemmatize() raises LookupError on a machine that has never downloaded it.
nltk.download('wordnet')

# Shared lemmatizer instance, used by the text-cleaning step further down.
lemmatizer = WordNetLemmatizer()

# The text columns are intentionally NOT lemmatized at this point.
# Lemmatizing raw text before lowercasing and stop-word removal rewrites
# stop words with the default noun rules (e.g. "was" -> "wa", "has" -> "ha"),
# so they no longer match the stop-word list afterwards. Lemmatization
# happens once, after normalization, in the preprocessing step.


In [6]:
from nltk.corpus import stopwords

# English stop words held in a set so membership checks are constant-time.
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Normalize a text field into space-separated, lemmatized tokens.

    The text is lowercased, every character other than an ASCII letter or
    whitespace is replaced by a space, and the result is split on whitespace.
    Tokens found in the module-level ``stop_words`` set are dropped and the
    rest are passed through the module-level ``lemmatizer``.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens joined by single spaces; an empty string when no
        token survives.
    """
    # Substitute a space rather than deleting the character, so words joined
    # by punctuation stay separate tokens ("python/sql" -> "python sql"
    # instead of "pythonsql").
    words = re.sub(r'[^a-zA-Z\s]', ' ', text.lower()).split()
    # Filter stop words first, then lemmatize, in a single pass.
    return ' '.join(
        lemmatizer.lemmatize(word) for word in words if word not in stop_words
    )

# Clean both text columns in place with the shared preprocessing routine.
for text_column in ('job_title', 'job_desc'):
    data[text_column] = data[text_column].apply(preprocess_text)


In [7]:
# Feature engineering: character length of each text column as currently stored.
for length_column, source_column in (
    ('title_length', 'job_title'),
    ('desc_length', 'job_desc'),
):
    data[length_column] = data[source_column].map(len)


In [10]:
# Join title and description, separated by one space, into a single document
# per row.
data['combined_text'] = data['job_title'].str.cat(data['job_desc'], sep=' ')

# Cap the TF-IDF vocabulary at 5000 terms to limit memory usage.
vectorizer = TfidfVectorizer(max_features=5000)
X_combined = vectorizer.fit_transform(data['combined_text'])


In [None]:
# Write the processed DataFrame to a parquet file; the relative path is
# resolved against the current working directory.
output_path = 'preprocessed_data.parquet'
data.to_parquet(output_path)
