# 🧹 Data Preprocessing & Feature Extraction for Resume Classification

## 1. Import Required Libraries

In [3]:
import os
import re
import string
import warnings

import joblib
import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Silence library warnings so cell output stays readable.
# Note that this also hides deprecation notices.
warnings.filterwarnings('ignore')

# NLTK resources used later in the notebook: the English stop-word list and the
# Punkt sentence models behind `word_tokenize`. Recent NLTK releases (3.9+) load
# Punkt from the 'punkt_tab' package instead of the pickled 'punkt' one, so both
# are requested to keep the cell working on either side of that change.
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\usife\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\usife\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Repeats the stop-word download already issued in the import cell above;
# harmless, since NLTK reports the package as up-to-date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\usife\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### 2. Load and Clean Dataset

In [8]:
# Load the raw resume dataset. The path is relative, so it resolves against the
# directory the notebook is run from. Later cells read the 'Resume' and
# 'Category' columns of this frame.
path = '../data/resume-dataset/UpdatedResumeDataSet.csv'
DataSet = pd.read_csv(path)

In [10]:
# Keep one row per distinct resume text, then renumber the rows from 0.
DataSet = DataSet.drop_duplicates(subset=['Resume']).reset_index(drop=True)

## 3. Create a Copy for Cleaning

In [13]:
# Work on a copy so the columns added in later cells do not end up on DataSet.
cleaned_DataSet = DataSet.copy()

### 3.1 Clean Resume Text

In [16]:
# Load spaCy's 'en_core_web_sm' pipeline.
# NOTE(review): `nlp` is not referenced by any other cell visible in this
# notebook section — confirm it is still needed before keeping this load.
nlp = spacy.load('en_core_web_sm')

def cleanResume(resumeText):
    """Strip URLs, social-media markers, punctuation and non-ASCII characters.

    Args:
        resumeText: Raw resume text. Any value that is not a ``str`` (for
            example ``None`` or a float ``NaN`` from an empty CSV cell) is
            treated as an empty document instead of raising ``TypeError``.

    Returns:
        The cleaned text with every run of whitespace collapsed to a single
        space. A leading or trailing space is not stripped.
    """
    if not isinstance(resumeText, str):
        return ''
    # URLs, together with any whitespace that follows them.
    resumeText = re.sub(r'http\S+\s*', ' ', resumeText)
    # Remove "RT" / "cc" only when they stand alone. Without the word
    # boundaries the pattern also matched inside ordinary words and turned
    # "account" into "a ount" or "SMART" into "SMA ".
    resumeText = re.sub(r'\b(?:RT|cc)\b', ' ', resumeText)
    # '#'-prefixed tokens.
    resumeText = re.sub(r'#\S+', '', resumeText)
    # '@'-prefixed tokens (this also removes the domain part of e-mail addresses).
    resumeText = re.sub(r'@\S+', ' ', resumeText)
    # Every ASCII punctuation character becomes a space.
    resumeText = re.sub('[%s]' % re.escape(string.punctuation), ' ', resumeText)
    # Every non-ASCII character becomes a space.
    resumeText = re.sub(r'[^\x00-\x7f]', ' ', resumeText)
    # Collapse whitespace runs left behind by the substitutions above.
    resumeText = re.sub(r'\s+', ' ', resumeText)
    return resumeText

In [18]:
# Run every raw resume through cleanResume and store the result in a new column.
cleaned_DataSet['Cleaned_Resume'] = cleaned_DataSet['Resume'].map(cleanResume)

### 4. Encode Target Labels

In [21]:
# Learn the category -> integer id mapping, then store the ids in a new column.
le = LabelEncoder()
le.fit(cleaned_DataSet['Category'])
cleaned_DataSet['Category_encoded'] = le.transform(cleaned_DataSet['Category'])

### 5. Feature Extraction (TF-IDF)

In [24]:
# NLTK's English stop words plus two quote-like tokens
# (presumably the forms the tokenizer emits for double quotes — TODO confirm).
stop_words = set(stopwords.words('english') + ['``', "''"])

# Individual punctuation characters, used to recognise punctuation-only tokens.
_PUNCT_CHARS = frozenset(string.punctuation)

def tokenize_and_filter(text):
    """Tokenise ``text`` and drop stop words and punctuation-only tokens.

    Args:
        text: The string to tokenise with ``word_tokenize``.

    Returns:
        A list of the remaining tokens in their original order and casing.
        Stop-word matching is case-insensitive.
    """
    tokens = word_tokenize(text)
    # A token counts as punctuation when every character in it is punctuation.
    # The earlier `word not in string.punctuation` was a substring test against
    # one long string, so tokens such as "--" or "..." were not filtered out.
    return [
        word for word in tokens
        if word.lower() not in stop_words
        and not all(ch in _PUNCT_CHARS for ch in word)
    ]

In [26]:
# Tokenise each cleaned resume, then join the surviving tokens back into one
# space-separated string per row.
cleaned_DataSet['Tokens'] = cleaned_DataSet['Cleaned_Resume'].map(tokenize_and_filter)
cleaned_DataSet['Filtered_Text'] = cleaned_DataSet['Tokens'].map(' '.join)

In [28]:
y = cleaned_DataSet['Category_encoded']

# Fit TF-IDF on the training rows only, so the vocabulary and IDF weights are
# never influenced by held-out resumes (fitting on all rows leaks test data).
# The row selection uses the same test_size, random_state and stratify values
# as the train/test split cell below, so it selects exactly the rows that end
# up in X_train there. Keep the two calls in sync if either is changed.
train_rows, _ = train_test_split(
    np.arange(len(y)), test_size=0.2, random_state=42, stratify=y)

vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english', max_features=1500)
vectorizer.fit(cleaned_DataSet['Filtered_Text'].iloc[train_rows])

# Transform every row with the train-fitted vectorizer; X keeps one row per resume.
X = vectorizer.transform(cleaned_DataSet['Filtered_Text'])

### 6. Train/Test Split

In [31]:
# Hold out 20% of the rows for testing. The split is stratified on y, and the
# fixed random_state makes it repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [33]:
# Sanity check: show the shape of each partition.
print("\n✅ Train shape:", X_train.shape)
print("✅ Test shape:", X_test.shape)


✅ Train shape: (132, 1500)
✅ Test shape: (34, 1500)


### 7. Save Data with Joblib

In [36]:
# Create the output folder if it is missing (e.g. on a fresh checkout);
# joblib.dump does not create directories itself.
os.makedirs('../models', exist_ok=True)

# Persist the fitted transformers next to the matrices. Without the TF-IDF
# vectorizer, new resumes cannot be mapped into the same feature space; without
# the label encoder, predicted ids cannot be turned back into category names.
joblib.dump(vectorizer, '../models/tfidf_vectorizer.pkl')
joblib.dump(le, '../models/label_encoder.pkl')

# Feature matrices and encoded labels for the training notebook.
joblib.dump(X_train, '../models/X_train.pkl')
joblib.dump(X_test, '../models/X_test.pkl')
joblib.dump(y_train, '../models/y_train.pkl')
joblib.dump(y_test, '../models/y_test.pkl')

['../models/y_test.pkl']

# ✅ Data is now preprocessed, vectorized, split, and saved. Ready for model training.