In [3]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import spacy
from spacy.matcher import Matcher

In [5]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# Define stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mage\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mage\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Mage\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [6]:
!python -m spacy download en_core_web_sm

# Load the English language model
nlp = spacy.load("en_core_web_sm")

Traceback (most recent call last):
  File "<frozen runpy>", line 189, in _run_module_as_main
  File "<frozen runpy>", line 148, in _get_module_details
  File "<frozen runpy>", line 112, in _get_module_details
  File "c:\Users\Mage\AppData\Local\pypoetry\Cache\virtualenvs\weightedrank-VABqxEaP-py3.12\Lib\site-packages\spacy\__init__.py", line 6, in <module>
  File "c:\Users\Mage\AppData\Local\pypoetry\Cache\virtualenvs\weightedrank-VABqxEaP-py3.12\Lib\site-packages\spacy\errors.py", line 3, in <module>
    from .compat import Literal
  File "c:\Users\Mage\AppData\Local\pypoetry\Cache\virtualenvs\weightedrank-VABqxEaP-py3.12\Lib\site-packages\spacy\compat.py", line 39, in <module>
    from thinc.api import Optimizer  # noqa: F401
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Mage\AppData\Local\pypoetry\Cache\virtualenvs\weightedrank-VABqxEaP-py3.12\Lib\site-packages\thinc\api.py", line 1, in <module>
    from .backends import (
  File "c:\Users\Mage\AppData\Local\pypoetry\Cache\vi

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
# Improved clean_text function with stemming and negation handling
# Function to clean resume text
def clean_text(resumeText):
    resumeText = re.sub('http\S+\s*', ' ', resumeText)  # remove URLs
    resumeText = re.sub('RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub('#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub('@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]',r' ', resumeText)
    resumeText = re.sub('\s+', ' ', resumeText)  # remove extra whitespace
    resumeText = re.sub(r'\\[rn]', ' ', resumeText)  # remove \r and \n
    return resumeText.strip()  # strip leading and trailing whitespace

def extract_experience(text):
    experience_score = 0
    # Initialize spaCy Matcher
    matcher = Matcher(nlp.vocab)
    # Define pattern for matching experience mentions
    pattern = [{"IS_DIGIT": True}, {"LOWER": {"IN": ["to", "or"]}, "OP": "?"}, {"IS_DIGIT": True, "OP": "?"}, {"LOWER": {"IN": ["year", "yr", "yrs", "year's"]}}]
    matcher.add("EXPERIENCE", [pattern])
    # Process the text with spaCy
    doc = nlp(text)
    # Iterate over matches found by the Matcher
    for match_id, start, end in matcher(doc):
        start_token = doc[start]
        end_token = doc[end - 1]
        start_year = int(start_token.text)
        end_year = int(end_token.text) if end_token.text.isdigit() else None
        if end_year:
            years = end_year - start_year
        else:
            years = 1  # If only start year mentioned, consider 1 year of experience
        # Additional points for relevant titles
        title = ""
        for token in doc[start:end]:
            if token.pos_ == "NOUN" or token.pos_ == "PROPN":
                title += token.text + " "
        if title:
            experience_score += years * 2
        # Weighted based on experience duration
        experience_score += years
    return experience_score

def extract_education(text):
    education_score = 0
    doc = nlp(text)
    # Extract entities related to education
    for ent in doc.ents:
        if ent.label_ == "DEGREE":
            degree_text = ent.text.lower()
            if "bachelor" in degree_text or "b.e." in degree_text or "btech" in degree_text or "bachelor's" in degree_text or "bs" in degree_text:
                education_score += 1
            elif "master" in degree_text or "mphil" in degree_text or "master's" in degree_text:
                education_score += 2
            elif "phd" in degree_text or "Ph.d" in degree_text:
                education_score += 3
        elif ent.label_ == "ORG":
            education_score += 1  # Points for attending a university
    return education_score

# Function to calculate keyword score
def calculate_keyword_score(text, keywords, weights):
  score = 0
  for word, weight in zip(keywords, weights):
    score += text.count(word) * weight
  return score

In [None]:
# Load data into a Pandas dataframe
df = pd.read_csv('resumes.csv')
df.drop_duplicates(inplace=True)
df['id'] = range(1, len(df) + 1)
df.set_index('id', inplace=True)
df.head()

In [None]:
# Clean resume text in each row
df['cleaned_text'] = df['Resume'].apply(clean_text)
df.head()

In [None]:
# Extract features (replace with more sophisticated methods)
df['experience_score'] = df['cleaned_text'].apply(extract_experience)
df['education_score'] = df['cleaned_text'].apply(extract_education)
print(df.head())
# Define job-specific keywords and weights
keywords = ["java", "spring", "hibernate", "maven", "J2EE", "SQL", "RESTful", "API", "JavaScript", "HTML", "CSS", "Git"]
weights = [4, 1, 3, 3, 3, 2, 3, 3, 4, 3, 3, 2]# Adjust weights based on importance

In [None]:
# Calculate keyword score for each resume
df['keyword_score'] = df['cleaned_text'].apply(lambda x: calculate_keyword_score(x, keywords, weights))

# Define weights for different factors (adjust based on your needs)
weight_experience = 0.2
weight_education = 0.7
weight_keywords = 0.1
#+ (weight_skills * df.apply(lambda x: len(x['skills']), axis=1))
# Calculate overall ranking score
df['rank_score'] = (weight_experience * df['experience_score']) + (weight_education * df['education_score']) + (weight_keywords * df['keyword_score'])

In [None]:
# Sort dataframe by ranking score (highest to lowest)
df_sorted = df.sort_values(by='rank_score', ascending=False)
#pd.set_option('display.max_colwidth', None)
# Assuming df_sorted is your DataFrame containing the 'Resume' and 'rank_score' columns
print(df_sorted[['cleaned_text','rank_score']].head())