In [29]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import spacy
from spacy.matcher import Matcher

In [30]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# Define stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mage\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mage\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Mage\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [31]:
!python -m spacy download en_core_web_sm

# Load the English language model
nlp = spacy.load("en_core_web_sm")

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 217.9 kB/s eta 0:00:59
     --------------------------------------- 0.0/12.8 MB 245.8 kB/s eta 0:00:52
     --------------------------------------- 0.1/12.8 MB 547.6 kB/s eta 0:00:24
      -------------------------------------- 0.2/12.8 MB 919.0 kB/s eta 0:00:14
     - -------------------------------------- 0.4/12.8 MB 1.3 MB/s eta 0:00:10
     - -------------------------------------- 0.6/12.8 MB 1.8 MB/s eta 0:00:07
     -- ------------------------------------- 0.8/12.8 MB 2.1 MB/s eta 0:00:06
     -- ------------------------------------- 0.8/12.8 MB 2.1 MB/s eta 0:00:06
     -- ------------------------------------

In [49]:
# Improved clean_text function with stemming and negation handling
# Function to clean resume text
def clean_text(resumeText):
    resumeText = re.sub('http\S+\s*', ' ', resumeText)  # remove URLs
    resumeText = re.sub('RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub('#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub('@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]',r' ', resumeText)
    resumeText = re.sub('\s+', ' ', resumeText)  # remove extra whitespace
    resumeText = re.sub(r'\\[rn]', ' ', resumeText)  # remove \r and \n
    return resumeText.strip()  # strip leading and trailing whitespace

def extract_experience(text):
    try:
        experience_score = 0
        # Initialize spaCy Matcher
        matcher = Matcher(nlp.vocab)
        # Define pattern for matching experience mentions
        pattern = [{"IS_DIGIT": True, "OP": "?"}, {"LOWER": {"IN": ["to", "or"]}, "OP": "?"}, {"IS_DIGIT": True, "OP": "?"}, {"LOWER": {"IN": ["year", "yr", "yrs", "year's","years"]}}]
        matcher.add("EXPERIENCE", [pattern])
        # Process the text with spaCy
        doc = nlp(text)
        # Iterate over matches found by the Matcher
        for match_id, start, end in matcher(doc):
            start_token = doc[start]
            # If a specific number of years is mentioned, set the experience score directly
            if start_token.like_num:
                experience_score = int(start_token.text)
                break
            # If a range of years is mentioned, calculate the difference
            if start_token.text.isdigit() and doc[end - 1].text.isdigit():
                start_year = int(start_token.text)
                end_year = int(doc[end - 1].text)
                experience_score = end_year - start_year
                break
        return experience_score
    except Exception as e:
        print("Error occurred:", e)
        return None

def extract_education(text):
    education_score = 0
    doc = nlp(text)
    # Define patterns for degrees
    degree_patterns = {
        "bachelor": re.compile(r"\b(b\.?a\.?|b\.?s\.?c?\.?|b\.?e\.?|b\.?tech|bachelor's?)\b"),
        "master": re.compile(r"\b(m\.?a\.?|m\.?s\.?c?\.?|m\.?e\.?|m\.?phil|masters?)\b"),
        "phd": re.compile(r"\b(ph\.?d\.?|doctorate|d\.?phil)\b")
    }
    
    # Extract entities related to education
    for ent in doc.ents:
        if ent.label_ == "DEGREE" or ent.text.lower() in degree_patterns.keys():
            for degree, pattern in degree_patterns.items():
                if re.search(pattern, ent.text.lower()):
                    if degree == "bachelor":
                        education_score += 1
                    elif degree == "master":
                        education_score += 2
                    elif degree == "phd":
                        education_score += 3
                    break  # Stop searching for degrees once found
        elif ent.label_ == "ORG" and ("university" in ent.text.lower() or "college" in ent.text.lower()):
            education_score += 1  # Points for attending a university or college
    return education_score

# Function to calculate keyword score
def calculate_keyword_score(text, keywords, weights):
  score = 0
  for word, weight in zip(keywords, weights):
    score += text.count(word) * weight
  return score

  resumeText = re.sub('http\S+\s*', ' ', resumeText)  # remove URLs
  resumeText = re.sub('#\S+', '', resumeText)  # remove hashtags
  resumeText = re.sub('@\S+', '  ', resumeText)  # remove mentions
  resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
  resumeText = re.sub('\s+', ' ', resumeText)  # remove extra whitespace


In [50]:
# Load data into a Pandas dataframe
df = pd.read_csv('resumes.csv')
df.drop_duplicates(inplace=True)
df['id'] = range(1, len(df) + 1)
df.set_index('id', inplace=True)
df.head()

Unnamed: 0_level_0,Category,Resume
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Data Science,Skills * Programming Languages: Python (pandas...
2,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
3,Data Science,"Areas of Interest Deep Learning, Control Syste..."
4,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
5,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [51]:
# Clean resume text in each row
df['cleaned_text'] = df['Resume'].apply(clean_text)
df.head()

Unnamed: 0_level_0,Category,Resume,cleaned_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Data Science,Skills * Programming Languages: Python (pandas...,Skills Programming Languages Python pandas num...
2,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,Education Details May 2013 to May 2017 B E UIT...
3,Data Science,"Areas of Interest Deep Learning, Control Syste...",Areas of Interest Deep Learning Control System...
4,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,Skills R Python SAP HANA Tableau SAP HANA SQL ...
5,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",Education Details MCA YMCAUST Faridabad Haryan...


In [52]:
# Extract features (replace with more sophisticated methods)
df['experience_score'] = df['cleaned_text'].apply(extract_experience)
df['education_score'] = df['cleaned_text'].apply(extract_education)
print(df.head())
# Define job-specific keywords and weights
keywords = ["java", "spring", "hibernate", "maven", "J2EE", "SQL", "RESTful", "API", "JavaScript", "HTML", "CSS", "Git"]
weights = [4, 1, 3, 3, 3, 2, 3, 3, 4, 3, 3, 2]# Adjust weights based on importance

        Category                                             Resume  \
id                                                                    
1   Data Science  Skills * Programming Languages: Python (pandas...   
2   Data Science  Education Details \r\nMay 2013 to May 2017 B.E...   
3   Data Science  Areas of Interest Deep Learning, Control Syste...   
4   Data Science  Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...   
5   Data Science  Education Details \r\n MCA   YMCAUST,  Faridab...   

                                         cleaned_text  experience_score  \
id                                                                        
1   Skills Programming Languages Python pandas num...                 0   
2   Education Details May 2013 to May 2017 B E UIT...                 1   
3   Areas of Interest Deep Learning Control System...                 1   
4   Skills R Python SAP HANA Tableau SAP HANA SQL ...                 1   
5   Education Details MCA YMCAUST Faridabad Haryan..

In [64]:
# Calculate keyword score for each resume
df['keyword_score'] = df['cleaned_text'].apply(lambda x: calculate_keyword_score(x, keywords, weights))

# Define weights for different factors (adjust based on your needs)
weight_experience = 0.3
weight_education = 0.3
weight_keywords = 0.4
#+ (weight_skills * df.apply(lambda x: len(x['skills']), axis=1))
# Calculate overall ranking score
df['rank_score'] = (weight_experience * df['experience_score']) + (weight_education * df['education_score']) + (weight_keywords * df['keyword_score'])

In [65]:
# Sort dataframe by ranking score (highest to lowest)
df_sorted = df.sort_values(by='rank_score', ascending=False)
#pd.set_option('display.max_colwidth', None)
# Assuming df_sorted is your DataFrame containing the 'Resume' and 'rank_score' columns
print(df_sorted[['cleaned_text','rank_score']].head()) 

                                          cleaned_text  rank_score
id                                                                
132  Software Skills RDBMS MS SQL SERVER 2000 2005 ...        98.3
113  Technical Skills Key Skills MS Technology Net ...        40.7
153  Technical Skills CATEGORY SKILLS Language C C ...        36.1
38   Education Details B C A Bachelor Computer Appl...        30.7
127  TECHNICAL SKILLS Operating Systems MS Windows ...        26.5


In [63]:
text = "I have 3 yrs experience"
print("Experience Score: ", extract_experience(text))

Experience Score:  3


In [57]:
# Test the function
text = "John has a Bachelor of Science in Computer Science from XYZ University and a PhD in Engineering from ABC University."
print(extract_education(text))


5
