In [47]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import numpy as np

## Functions to extract structured data from raw resume text

In [26]:
def extract_name_from_resume(text):
    """Return the first pair of adjacent capitalised words found in ``text``.

    A candidate name is two words, each made of one uppercase ASCII letter
    followed by one or more lowercase letters, separated by exactly one
    whitespace character.

    Args:
        text: Resume text to scan.

    Returns:
        The matched "First Last" string, or None when no such pair exists.
    """
    name_pattern = r"(\b[A-Z][a-z]+\b)\s(\b[A-Z][a-z]+\b)"
    found = re.search(name_pattern, text)
    # group() with no argument yields the whole match (both words plus the separator).
    return found.group() if found else None

In [27]:
def extract_email_phone(text):
    """Extract e-mail addresses, phone numbers, URLs and LinkedIn URLs from text.

    Args:
        text: Resume text to scan.

    Returns:
        A 4-tuple ``(emails, processed_phones, urls, linkedin_urls)``:
          * emails: every e-mail address found, in order of appearance.
          * processed_phones: every phone number found, reduced to its last
            10 digits (separators, brackets and country code removed).
          * urls: every URL that starts with ``http://``, ``https://`` or
            ``www.``.
          * linkedin_urls: the subset of ``urls`` that point at linkedin.com,
            or the placeholder list ``["None"]`` when there is none.
    """
    email_regex = r'[A-Za-z0-9\._%+\-]+@[A-Za-z0-9\.\-]+\.[A-Za-z]{2,}'
    phone_regex = r'\d{3}-\d{3}-\d{4}|\(\d{3}\)\s\d{3}-\d{4}|\d{3}\.\d{3}\.\d{4}|\d{3}\s\d{3}\s\d{4}|\d{10}|\d{5}\s\d{5}|\+\d{2}\s\d{10}|\+\d{2}\s\d{5}\s\d{5}'
    url_regex = r'\b((?:https?:\/\/|www\.)(?:[-a-zA-Z0-9@:%._\+~#=]{1,256}\.)+[a-zA-Z]{2,6}\b(?:[-a-zA-Z0-9@:%_\+.~#?&//=]*))'

    emails = re.findall(email_regex, text)
    phones = re.findall(phone_regex, text)
    urls = re.findall(url_regex, text)

    # Strip every non-digit BEFORE truncating. Slicing the raw match would keep
    # separators and drop real digits (e.g. "123-456-7890" -> "3-456-7890").
    # Every alternative in phone_regex contains at least 10 digits, so the
    # result is always the 10-digit subscriber number.
    processed_phones = [re.sub(r'\D', '', phone)[-10:] for phone in phones]

    # Entries in `urls` always begin with a scheme or "www.", so both prefixes
    # are optional here; otherwise "www.linkedin.com/..." is never recognised.
    linkedin_regex = r'^(?:https?://)?(?:www\.)?linkedin\.com/\S+'
    linkedin_urls = [
        url for url in urls if re.match(linkedin_regex, url, re.IGNORECASE)
    ]

    # Callers join this list into a CSV cell, so keep the "None" placeholder.
    if not linkedin_urls:
        linkedin_urls = ["None"]

    return emails, processed_phones, urls, linkedin_urls

In [28]:
def extract_education_from_resume(text):
    """Collect degree mentions (Bachelor, Master, Ph.D., B.x, M.x, ...) from text.

    The pattern is case-insensitive and matches a degree keyword followed by
    the run of whitespace-separated words that comes after it.

    Args:
        text: Resume text to scan.

    Returns:
        A list of the matched phrases with surrounding whitespace stripped, in
        order of appearance; empty when nothing matches.
    """
    degree_pattern = r"(?i)(?:(?:Bachelor|B\.S\.|B\.A\.|Master|M\.S\.|M\.A\.|Ph\.D\.)\s(?:[A-Za-z]+\s)*[A-Za-z]+)|(?:Bsc|\bB\.\w+|\bM\.\w+|\bPh\.D\.\w+|\bBachelor(?:'s)?|\bMaster(?:'s)?|\bPh\.D)\s(?:\w+\s)*\w+"
    # The pattern has no capturing groups, so findall yields whole-match strings.
    return [phrase.strip() for phrase in re.findall(degree_pattern, text)]

def extract_college_name(text):
    """Return the first college/university name found in ``text``.

    Recognised shapes (matched case-insensitively):
      * "<Word> College of Engineering"
      * "<Word> Educational Institute"
      * "University of <Word>"
      * "Ecole <Word>"
      * "Indian Institute Of Technology-<City>"
      * "National Institute Of Technology-<City>"

    Args:
        text: Resume text to scan.

    Returns:
        The first matching name with surrounding whitespace stripped, or None
        when no institution is recognised.
    """
    college_pattern = (
        r'(?i)(?:[A-Z][a-z]* College of Engineering'
        r'|[A-Z][a-z]* Educational Institute'
        r'|University of [A-Z][a-z]*'
        r'|Ecole [A-Z][a-z]*'
        # The city after the hyphen needs a quantifier; without it only the
        # first two letters were captured ("...Technology-Bo").
        r'|Indian Institute Of Technology-[A-Z][a-z]*'
        r'|National Institute Of Technology-[A-Z][a-z]*)'
    )

    # Only the first occurrence is needed, so search() is enough.
    match = re.search(college_pattern, text)
    return match.group().strip() if match else None
    

## The code below applies the extraction functions defined above to every resume


In [29]:
# Load the raw resumes; the code below reads the 'Sl no' and 'Resume txt' columns.
df = pd.read_csv('resumes.csv')

# Plain list of resume texts, kept in the same row order as `df`.
resume_data = df['Resume txt'].tolist()

# One dict per resume; the DataFrame is built once after the loop.
processed_data = []
for _, resume_row in df.iterrows():
    resume_text = resume_row['Resume txt']

    found_emails, found_phones, found_urls, linkedin_links = extract_email_phone(resume_text)
    candidate_name = extract_name_from_resume(resume_text)
    degrees = extract_education_from_resume(resume_text)

    # List-valued fields are flattened to comma-separated strings for the CSV.
    record = {
        "sl no": resume_row['Sl no'],
        "Name": candidate_name,
        "emails": ', '.join(found_emails),
        "contact no's": ', '.join(found_phones),
        "LinkedIn": ', '.join(linkedin_links),
        "Education": ', '.join(degrees),
    }
    processed_data.append(record)

# Build the result frame and persist it (without the pandas index column).
processed_df = pd.DataFrame(processed_data)
processed_df.to_csv('dataofresume.csv', index=False)

print("Data has been processed and written to dataofresume.csv")

Data has been processed and written to dataofresume.csv


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

In [30]:

def read_txt_file(file_path, encoding='utf-8'):
    """Read a whole text file and return its contents as a string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The full file contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()

# Load the job description text that the resumes are compared against.
job_desc = read_txt_file('sample.txt')

In [31]:
def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word``, for use with ``lemmatize()``.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag selects the WordNet category: J -> adjective,
    V -> verb, R -> adverb. Every other tag (including N) maps to noun.
    """
    # pos_tag returns [(word, tag)]; only the tag's first character is used.
    tag_initial = nltk.pos_tag([word])[0][1][0].upper()

    if tag_initial == "J":
        return wordnet.ADJ
    if tag_initial == "V":
        return wordnet.VERB
    if tag_initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag fall back to noun.
    return wordnet.NOUN

def preprocess(text):
    """Normalise free text into a space-separated string of lemmas.

    Steps: lowercase, tokenize, keep only alphanumeric tokens (which drops
    punctuation), drop English stopwords plus a few extra common words, and
    lemmatize each remaining token using its POS category.

    Args:
        text: Raw text to normalise.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    tokens = word_tokenize(text.lower())

    stop_words = set(stopwords.words('english'))
    # Add more common words to remove if needed
    common_words = set(['from', 'what', 'and'])

    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in tokens:
        # Punctuation and mixed-symbol tokens are not alphanumeric.
        if not token.isalnum():
            continue
        if token in stop_words or token in common_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))

    return ' '.join(lemmas)


In [32]:
# Lemmatised, stopword-free version of the job description.
job_desc_processed = preprocess(job_desc)

# Unique terms of the processed job description (set => no duplicates, no order).
job_desc_skills = {*job_desc_processed.split()}
print(job_desc_skills)

{'ml', 'code', 'unsupervised', 'excellent', 'algorithm', 'quality', 'develop', 'life', 'phd', 'understand', 'passionate', 'azure', 'environment', 'development', 'job', 'candidate', 'work', 'similar', 'datasets', 'interested', 'practice', 'requirement', 'experience', 'mission', 'hadoop', 'professional', 'version', 'look', 'pipeline', 'project', 'framework', 'implement', 'transform', 'late', 'scalable', 'innovative', 'role', 'program', 'analytical', 'publish', 'support', 'career', 'evaluation', 'problem', 'strong', 'solid', 'vision', 'analyze', 'familiarity', 'ability', 'relevant', 'ca', 'specialize', 'teamwork', 'deploy', 'prefer', 'employee', 'salary', 'improve', 'review', 'employer', 'remote', 'participate', 'learn', 'identify', 'package', 'offer', 'commit', 'machine', 'extract', 'training', 'production', 'supervise', 'make', 'benefit', 'model', 'hour', 'letter', 'processing', 'performance', 'insight', 'cover', 'related', 'create', 'apply', 'collaborative', 'location', 'san', 'compute

In [33]:
# Corpus = every resume followed by the job description. The job description
# is placed LAST so it can be recovered later as the final row of the matrix.
# NOTE(review): the raw texts are vectorised here, not the output of
# preprocess() -- confirm this is intended.
documents = [*resume_data, job_desc]

# Learn the vocabulary/IDF weights on the corpus and vectorise it in one step.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Dense copy of the TF-IDF matrix: one row per document.
tfidf_array = tfidf_matrix.toarray()

In [34]:
# The job description was appended last to the corpus, so it is the final row.
resume_vectors = tfidf_array[:-1]  # All except the last
job_desc_vector = tfidf_array[-1]  # The last vector

# Rank EVERY resume against the job description. n_neighbors follows the
# number of resumes instead of a hard-coded 12, which raised an error with
# fewer resumes and left some resumes unscored with more.
knn_model = NearestNeighbors(n_neighbors=len(resume_vectors), metric='cosine')
knn_model.fit(resume_vectors)

# kneighbors expects a 2-D query, hence the reshape to a single row.
# Both results have shape (1, n_resumes), ordered from nearest to farthest.
distances, indices = knn_model.kneighbors(job_desc_vector.reshape(1, -1))

# Cosine distance = 1 - cosine similarity; express similarity as a percentage.
ats_scores = (1 - distances) * 100

# Print the results
print("Distances to the nearest resumes:", distances)
print("Indices of the nearest resumes:", indices)
print("ATS Scores:", ats_scores)

Distances to the nearest resumes: [[0.49717068 0.51415742 0.51638184 0.52133094 0.54598075 0.63196613
  0.64834057 0.65652105 0.70806493 0.78081552 0.79934059 0.83724388]]
Indices of the nearest resumes: [[ 0  7  6  9 11  1  2 10  3  5  4  8]]
ATS Scores: [[50.28293238 48.58425844 48.3618157  47.86690615 45.40192535 36.80338747
  35.16594272 34.34789453 29.19350714 21.91844846 20.06594144 16.27561163]]


In [50]:

# Flatten the (1, n) KNN outputs to 1-D arrays.
indices = np.array(indices).reshape(-1)
ats_scores = np.array(ats_scores).reshape(-1)

# Re-order both arrays by resume index so that position k holds the score of
# resume k. argsort gives the permutation that sorts `indices`; applying the
# same permutation to `ats_scores` keeps each score paired with its resume.
# This replaces a hand-written O(n^2) bubble sort with the same result.
order = np.argsort(indices, kind='stable')
indices = indices[order]
ats_scores = ats_scores[order]

print(indices.shape)
print(ats_scores.shape)

print(indices)
print(ats_scores)


(12,)
(12,)
[ 0  1  2  3  4  5  6  7  8  9 10 11]
[50.28293238 36.80338747 35.16594272 29.19350714 20.06594144 21.91844846
 48.3618157  48.58425844 16.27561163 47.86690615 34.34789453 45.40192535]


In [54]:
# Re-open the file written by the extraction step. The name must match the
# written file exactly ('dataofresume.csv'): differently-cased spellings only
# resolve to the same file on case-insensitive file systems.
new_df = pd.read_csv('dataofresume.csv')

if 'KNN Score' not in new_df.columns:
    new_df['KNN Score'] = np.nan

# `indices` holds row positions within the resume list, and the extracted CSV
# was written in that same row order, so scores are assigned by position.
# The previous code built its row mask from a different frame (`df['Sl no']`)
# and assumed that 'Sl no' always equals position + 1.
# NOTE(review): assumes dataofresume.csv has not been re-ordered since it was written.
row_positions = np.asarray(indices).reshape(-1)
knn_scores = np.asarray(ats_scores).reshape(-1)
new_df.loc[new_df.index[row_positions], 'KNN Score'] = knn_scores

# Save the updated CSV
new_df.to_csv('dataofresume.csv', index=False)

print("KNN scores have been added successfully.")

KNN scores have been added successfully.


In [36]:
# Job description = last TF-IDF row, reshaped to a single-row 2-D array.
job_desc_vector = tfidf_array[-1].reshape(1, -1)

# One similarity per resume; the result has a single column because there is
# exactly one job-description row.
similarities = cosine_similarity(resume_vectors, job_desc_vector)

# Convert each similarity to a percentage rounded to two decimals.
percentage_matches = [round(score * 100, 2) for score in similarities[:, 0]]

# Report the match for each resume, numbering resumes from 1.
for i, percentage_match in enumerate(percentage_matches):
    print(f"Resume {i + 1}: {percentage_match}% match with job description")

Resume 1: 50.28% match with job description
Resume 2: 36.8% match with job description
Resume 3: 35.17% match with job description
Resume 4: 29.19% match with job description
Resume 5: 20.07% match with job description
Resume 6: 21.92% match with job description
Resume 7: 48.36% match with job description
Resume 8: 48.58% match with job description
Resume 9: 16.28% match with job description
Resume 10: 47.87% match with job description
Resume 11: 34.35% match with job description
Resume 12: 45.4% match with job description


In [56]:
# Re-open the extracted-data file. The name must match the file written by the
# extraction step exactly ('dataofresume.csv'); other casings only work on
# case-insensitive file systems.
new_df = pd.read_csv('dataofresume.csv')

# `percentage_matches` is ordered like the resume list, which is also the row
# order of this CSV, so a direct column assignment lines up row by row. The
# assignment creates or overwrites the column, so no pre-initialisation is needed.
new_df['Cosine Similarity'] = percentage_matches
new_df.to_csv('dataofresume.csv', index=False)

print("Cosine Similarity scores have been added successfully.")


Cosine Similarity scores have been added successfully.
