In [2]:
# STEP 1: Import libraries
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# STEP 2: Upload DataSet
df = pd.read_csv('UpdatedResumeDataSet.csv')

# Use only relevant columns
df = df[['Resume', 'Category']].dropna()

# STEP 3: Preprocess resume text using spaCy
def preprocess(text):
    doc = nlp(text)
    tokens = [token.lemma_.lower() for token in doc if token.is_alpha and not token.is_stop]
    return " ".join(tokens)

df['cleaned_resume'] = df['Resume'].apply(preprocess)

# STEP 4: TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(df['cleaned_resume'])
y = df['Category']

# STEP 5: Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# STEP 6: Train Logistic Regression classifier
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# STEP 7: Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# STEP 8 (Optional): Show a few predictions
for i in range(10):
    print(f"Prediction: {y_pred[i]} | Actual: {y_test.iloc[i]}")

Accuracy: 0.9948186528497409
Prediction: ETL Developer | Actual: ETL Developer
Prediction: Health and fitness | Actual: Health and fitness
Prediction: Advocate | Actual: Advocate
Prediction: Automation Testing | Actual: Automation Testing
Prediction: Operations Manager | Actual: Operations Manager
Prediction: PMO | Actual: PMO
Prediction: Java Developer | Actual: Java Developer
Prediction: Civil Engineer | Actual: Civil Engineer
Prediction: Operations Manager | Actual: Operations Manager
Prediction: Operations Manager | Actual: Operations Manager
