# 🧹 Data Preprocessing & Feature Extraction for Resume Classification

## 1. Import Required Libraries

In [191]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from collections import Counter
import pandas as pd
import numpy as np
import unicodedata
import joblib
import string
import spacy
import nltk
import re

In [193]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\usife\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 2. Load The Dataset

In [196]:
path = '../data/resume-dataset/UpdatedResumeDataSet.csv'  
DataSet = pd.read_csv(path)

In [198]:
# Drop duplicates if not done already
DataSet.drop_duplicates(subset=['Resume'], inplace=True)
DataSet.reset_index(drop=True, inplace=True)

## 3. Create a Copy for Cleaning

In [201]:
cleaned_DataSet = DataSet.copy()

## 4. Define Cleaning Function

In [204]:
# Load spaCy English model
nlp = spacy.load('en_core_web_sm')

In [206]:
# Common non-informative or noisy words to remove
common_garbage = [
    'skills', 'skill', 'skill details', 'education', 'education details',
    'company', 'company details', 'description', 'details', 'project',
    'months', 'exprience', 'year', 'january', 'client',
    'pvt', 'ltd', 'responsibilities', 'technical', 'environment',
    'work', 'working', 'like', 'using', 'various',
    'maharashtra', 'pune', 'india', 'monthscompany',
    'university', 'college', 'experience', 'team', 'role', 'work',
    'maintain', 'support', 'handle', 'activity', 'window', 'time',
    'good', 'new', 'etc', 'requirement', 'provide',
    'detail', 'month', 'base', 'high', 'responsibility', 'issue',
    'plan', 'user', 'office', 'include', 'relate', 'level', 'job',
    'information', '2016', 'ssc', 'mumbai',
    'school', 'bachelor', '2017', 'different', 'involve', 'document',
    'complete', 'quality', 'ensure', 'diploma', 'institute', '2015', 'control',
    'datum', '2014', '2012', 'hsc'
]

In [208]:
# Mapping for standardizing skill names
standardize_skills = {
    'scikit learn': 'scikit-learn',
    'matplot lib': 'matplotlib',
    'sqlserver': 'sql-server',
    'doc vec': 'doc2vec',
    'word vec': 'word2vec',
    'vader': 'VADER',
    'text blob': 'TextBlob',
    'nodejs': 'NodeJS'
}

In [210]:
def clean_resume(text):
    """
    Cleans and standardizes resume text for NLP processing.
    Steps include lowercasing, removing noise, standardizing terms,
    and lemmatization with filtering.
    """

    # Convert text to lowercase
    text = text.lower()

    # clening use Remove URLs, emails, mentions
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8")
    text = re.sub(r'\b(?:' + '|'.join(map(re.escape, common_garbage)) + r')\b', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'@[\w_]+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'\r|\n|\r\n', ' ', text)
    text = re.sub(r'\s{2,}', ' ', text)
    text = re.sub(r'\b\d+\b', '', text)
    text = re.sub(r'(js){2,}', 'js', text)  
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)  


    # Standardize skill variants 
    for k, v in standardize_skills.items():
        text = text.replace(k, v)

    # Lemmatize and remove stopwords, punctuation, and short tokens
    doc = nlp(text)
    tokens = [
        token.lemma_ for token in doc
        if not token.is_stop and not token.is_punct and len(token) > 2
    ]

    # Remove duplicates while preserving token order
    seen = set()
    cleaned_tokens = []
    for token in tokens:
        if token not in seen:
            seen.add(token)
            cleaned_tokens.append(token)

    return ' '.join(cleaned_tokens)

## 5. Apply Cleaning Function

In [213]:
cleaned_DataSet['Resume'] = cleaned_DataSet['Resume'].apply(clean_resume)

In [214]:
print("\n📌 Sample Resume Text After cleaning:")
cleaned_DataSet.iloc[0]['Resume']


📌 Sample Resume Text After cleaning:


'programming language python pandas numpy scipy scikitlearn matplotlib sql java javascriptjquery machine learn regression svm naa baye knn random forest decision tree boost technique cluster analysis word embed sentiment natural process dimensionality reduction topic model lda nmf pca neural net database visualization mysql server cassandra hbase elasticsearch d3js dcjs plotly kibana ggplot tableau regular expression html css angular logstash kafka flask git docker computer vision open understanding deep learning datum science assurance associate ernst young llp javascript jquery fraud investigation dispute service technology assist review tar accelerate run analytic generate report core member help develop automate platform tool scratch discovery domain implement predictive coding result reduce labor cost spend lawyer understand end flow solution research development classification mining present text work analyze output precision monitoring entire code evidence follow standard classi

In [215]:
all_words = ' '.join(cleaned_DataSet['Resume']).split()
word_freq = Counter(all_words)
#Top 50 Most Frequent Words in Resumes
top_words = word_freq.most_common(50)
print("Top 50 Most Frequent Words in Resumes:")
for word, freq in top_words:
    print(f"{word}: {freq}")

Top 50 Most Frequent Words in Resumes:
system: 94
management: 83
technology: 75
database: 67
datum: 66
service: 66
tool: 65
application: 65
software: 64
engineering: 64
computer: 63
developer: 63
development: 61
report: 59
business: 59
design: 59
develop: 58
work: 58
process: 56
customer: 55
create: 55
sql: 53
solution: 50
base: 50
manage: 50
requirement: 47
language: 46
provide: 46
server: 45
implement: 45
window: 45
project: 44
testing: 44
knowledge: 44
issue: 43
test: 43
java: 42
science: 42
analysis: 41
web: 41
activity: 41
engineer: 40
perform: 39
relate: 38
include: 38
communication: 37
involve: 37
product: 36
manager: 36
lead: 36


## 6. Encode Target Labels

In [220]:
le = LabelEncoder()
cleaned_DataSet['Category'] = le.fit_transform(cleaned_DataSet['Category'])

In [222]:
print("\n📌 Category encoding ")
cleaned_DataSet['Category']


📌 Category encoding 


0       6
1       6
2       6
3       6
4       6
       ..
161    23
162    23
163    23
164    23
165    23
Name: Category, Length: 166, dtype: int32

## 7. Optional: Oversample to Balance Categories
# Comment/uncomment this section as needed

In [134]:
max_size = cleaned_DataSet['Category'].value_counts().max()
balanced_df = cleaned_DataSet.groupby('Category').apply(lambda x: x.sample(max_size, replace=True)).reset_index(drop=True)

  balanced_df = cleaned_DataSet.groupby('Category').apply(lambda x: x.sample(max_size, replace=True)).reset_index(drop=True)


In [135]:
# Shuffle the balanced dataset
cleaned_DataSet_balanced = balanced_df.sample(frac=1).reset_index(drop=True)

## 8. Prepare Features and Target

In [224]:
X = cleaned_DataSet['Resume'].values
y = cleaned_DataSet['Category'].values

In [234]:
X_train_balanced, X_test_balanced, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 9. TF-IDF Vectorization

In [237]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    stop_words='english',
    max_features=1500
)

In [239]:
X_train_vec = word_vectorizer.fit_transform(X_train_balanced)
X_test_vec = word_vectorizer.transform(X_test_balanced)

In [253]:
ros = RandomOverSampler(random_state=42)
X_train_bal, y_train_bal = ros.fit_resample(X_train_vec, y_train_balanced)

In [171]:
X_train_bal.shape

(300, 1500)

In [173]:
word_vectorizer.fit(X)
X_vectorized = word_vectorizer.transform(X)

In [175]:
word_vectorizer.fit(X_balanced)
X_balanced_vectorized = word_vectorizer.transform(X_balanced)

In [177]:
print("\n✅ TF-IDF feature matrix shape:", X_vectorized.shape)
print("\n✅ TF-IDF feature balanced matrix shape:", X_balanced_vectorized.shape)


✅ TF-IDF feature matrix shape: (166, 1500)

✅ TF-IDF feature balanced matrix shape: (166, 1500)


## 10. Split into Train and Test Sets

In [156]:
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [158]:
X_train_balanced, X_test_balanced, y_train_balanced, y_test_balanced = train_test_split(
    X_balanced_vectorized,
    y_balanced,
    test_size=0.2,
    random_state=42,
)

In [180]:
print("\n✅ Train shape:", X_train.shape)
print("✅ Test shape:", X_test.shape)
print("\n✅ Train balanc shape:", X_train_balanced.shape)
print("✅ Test balanc shape:", X_test_balanced.shape)

NameError: name 'X_train' is not defined

## 11. Save Split into Train and Test Sets

In [163]:
# Save the split data
joblib.dump(X_train, '../models/X_train.pkl')
joblib.dump(X_test, '../models/X_test.pkl')
joblib.dump(y_train, '../models/y_train.pkl')
joblib.dump(y_test, '../models/y_test.pkl')

['../models/y_test.pkl']

In [261]:
# Save the split data balanced
joblib.dump(X_train_bal, '../models/X_train_bal.pkl')
joblib.dump(X_test_balanced, '../models/X_test_bal.pkl')
joblib.dump(y_train_bal, '../models/y_train_bal.pkl')
joblib.dump(y_test_balanced, '../models/y_test_bal.pkl')

['../models/y_test_bal.pkl']

<300x1500 sparse matrix of type '<class 'numpy.float64'>'
	with 30173 stored elements in Compressed Sparse Row format>