In [None]:
import os
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from pyaspeller import YandexSpeller
import textstat
import PyPDF2  # For extracting text from PDFs
import yake 

# Initialize tools
# spaCy is used below but is not imported with the other dependencies in this
# file; without this import the `spacy.load` call raises NameError unless an
# earlier session already imported it.
import spacy

# Module-level singletons shared by the helper functions in this file.
grammar_tool = YandexSpeller()  # not referenced by the functions visible in this file
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")  # used by semantic_similarity
nlp = spacy.load("en_core_web_sm")  # used by extract_keywords

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``.

    Page texts are concatenated in page order with no separator between them.
    """
    with open(pdf_path, "rb") as pdf_file:
        # Pull the text out of each page while the file handle is still open,
        # exactly as the page-by-page loop did.
        pages = PyPDF2.PdfReader(pdf_file).pages
        page_texts = [page.extract_text() for page in pages]
    return "".join(page_texts)

def extract_keywords(text, top_n=10):
    """Extract up to ``top_n`` keywords from ``text`` using YAKE and spaCy.

    Candidates are the union of YAKE's single-word keywords and the lemmas of
    spaCy tokens tagged NOUN, PROPN, VERB or ADJ. They are ranked by how often
    each one occurs in ``text`` as a case-insensitive substring; ties are
    broken alphabetically so the result is reproducible between runs.

    Args:
        text: The text to mine for keywords.
        top_n: Maximum number of keywords to return (also passed to YAKE).

    Returns:
        A list of at most ``top_n`` keyword strings, most frequent first.
    """
    # Use YAKE (Keyword Extraction)
    kw_extractor = yake.KeywordExtractor(lan="en", n=1, dedupLim=0.9, top=top_n)
    yake_keywords = {kw for kw, _ in kw_extractor.extract_keywords(text)}

    # Use spaCy for NLP processing
    doc = nlp(text)
    content_pos = {"NOUN", "PROPN", "VERB", "ADJ"}
    spacy_keywords = {token.lemma_ for token in doc if token.pos_ in content_pos}

    # Drop blank candidates: counting "" in a string matches at every
    # position and would push an empty keyword to the top of the ranking.
    candidates = {kw for kw in yake_keywords | spacy_keywords if kw.strip()}

    # Count case-insensitively: lemmas (and possibly YAKE keywords) may differ
    # in case from the surface text, e.g. lemma "develop" vs. "Develop".
    lowered_text = text.lower()
    frequency = {kw: lowered_text.count(kw.lower()) for kw in candidates}

    # Sort on (-count, keyword) so ties no longer depend on set iteration
    # order, which varies between interpreter runs.
    ranked = sorted(candidates, key=lambda kw: (-frequency[kw], kw))
    return ranked[:top_n]

def keyword_inclusion_score(job_description, resume_text):
    """Return the fraction of job-description keywords found in the resume.

    Keywords come from ``extract_keywords(job_description)``. A keyword counts
    as included when it occurs in ``resume_text`` as a case-insensitive
    substring.

    Args:
        job_description: Text of the job posting.
        resume_text: Text of the resume to score.

    Returns:
        A float in [0, 1]; 0.0 when no keywords could be extracted from the
        job description (previously this case raised ZeroDivisionError).
    """
    keywords = extract_keywords(job_description)
    if not keywords:
        return 0.0

    # Lowercase once so "Python" in the resume matches the keyword "python".
    resume_lower = resume_text.lower()
    matched = sum(1 for keyword in keywords if keyword.lower() in resume_lower)
    return matched / len(keywords)

def semantic_similarity(original_text, edited_text):
    """Return the cosine similarity between the embeddings of two texts."""
    # Encode both texts in one batch, then split the two vectors apart.
    original_vec, edited_vec = sentence_model.encode([original_text, edited_text])
    similarity_matrix = cosine_similarity([original_vec], [edited_vec])
    # One vector on each side, so the single score sits at row 0, column 0.
    return similarity_matrix[0][0]

def count_impact_phrases(text, impact_words=None):
    """Count occurrences of impact-driven words in ``text``.

    Matching is case-insensitive and substring-based, so every occurrence of
    each word is counted wherever it appears in the text.

    Args:
        text: The text to scan.
        impact_words: Optional iterable of words to count. Defaults to the
            built-in list of action verbs below.

    Returns:
        The total number of occurrences across all impact words.
    """
    if impact_words is None:
        impact_words = (
            "increased",
            "improved",
            "reduced",
            "achieved",
            "optimized",
            "enhanced",
            "utilized",
        )

    # Lowercase the text once rather than once per word.
    lowered_text = text.lower()
    # Skip empty entries: counting "" matches at every position in the text.
    return sum(lowered_text.count(word.lower()) for word in impact_words if word)

def _read_resume_text(resume_path):
    """Return the text of a resume file, picking the extractor by extension."""
    # Compare the extension case-insensitively so "resume.PDF" is still read
    # as a PDF instead of being handed to the docx extractor.
    if os.fspath(resume_path).lower().endswith(".pdf"):
        return extract_text_from_pdf(resume_path)
    # NOTE(review): extract_text_from_docx is not defined in the visible part
    # of this file; it must be provided elsewhere before this branch can run.
    return extract_text_from_docx(resume_path)


def evaluate_resume(original_resume_path, optimized_resume_path, job_description):
    """Evaluate the optimized resume against the original and the job description.

    Computes three metrics, prints them, and returns them:

    * keyword inclusion of the job description's keywords in the optimized resume,
    * semantic similarity between the original and optimized resume texts,
    * number of impact-driven phrases in the optimized resume.

    Args:
        original_resume_path: Path to the original resume; files ending in
            ".pdf" (any case) are read as PDF, everything else as docx.
        optimized_resume_path: Path to the optimized resume, same rules.
        job_description: Text of the job posting.

    Returns:
        A dict with keys "keyword_score", "semantic_similarity" and
        "impact_phrases".
    """
    # Extract text
    original_text = _read_resume_text(original_resume_path)
    optimized_text = _read_resume_text(optimized_resume_path)

    # Keyword inclusion
    keyword_score = keyword_inclusion_score(job_description, optimized_text)

    # Semantic similarity
    similarity = semantic_similarity(original_text, optimized_text)

    # Impact-driven phrases
    impact_phrases = count_impact_phrases(optimized_text)

    # Print results
    print(f"Keyword Inclusion Score: {keyword_score:.2f}")
    print(f"Semantic Similarity: {similarity:.2f}")
    print(f"Impact-Driven Phrases: {impact_phrases}")

    # Return metrics
    return {
        "keyword_score": keyword_score,
        "semantic_similarity": similarity,
        "impact_phrases": impact_phrases,
    }

# Example usage
# Hard-coded absolute paths to the original and optimized resumes. Both end in
# ".docx", so evaluate_resume takes its non-PDF (docx) extraction branch.
original_resume_path = "/Users/aadi/Downloads/Deep Learning Applications - AIPI 540/Natural Language Processing/lllm-resume-optimizer/data/raw/Sample_Aaditey_Pillai_Resume.docx"
optimized_resume_path = "/Users/aadi/Downloads/Deep Learning Applications - AIPI 540/Natural Language Processing/lllm-resume-optimizer/data/outputs/optimized_resume.docx"

# Job posting text; evaluate_resume passes it to keyword_inclusion_score as
# the source of keywords to look for in the optimized resume.
job_description = """
About the job
Lab Summary: AI Research Center (AIC) located in Mountain View, California focuses on research and development which directly impacts future Samsung products reaching hundreds of millions of users worldwide. We are focused on pushing the state-of-the-art and practice in natural language and knowledge intelligence.

Position Summary: Samsung Research AI center, located in Mountain View, CA, is currently recruiting world-class students who can thrive in a fast-pace, cross team, results-driven environment, with focus on highly visible, challenging, and cross discipline projects. You will be part of an exciting project to build an adaptive, personalized, contextual and secure AI model and system to enable fast, accurate and safe interactions tailored to users’ needs on Samsung devices.

Position Responsibilities

Develop and implement novel deep learning/reinforcement learning algorithms for natural language processing (text, speech) in various applications
Contribute to the research activities of our team
Generate creative solutions (patents) and publish in top conferences (papers)

Required Skills 

Teamwork and communication skills
Current Ph.D. student in CS, EE, or related field
Experience in one or more of the following areas:
Strong background in machine learning (supervised learning, transfer learning, one-shot or few shot learning, unsupervised learning, semi-supervised learning, weakly supervised learning, meta-learning, outlier detection, etc.).
Expertise in LLM including model architecture, training/finetuning techniques, retrieval augmented generation (RAG), reasoning and action planning, etc.
Experience in conversational AI technologies: natural language processing (e.g., language models, semantic parsing, natural language generation etc.), dialogue (e.g., state tracking, policy learning), and representation learning (embedding, conceptualization, etc.).
Experience in knowledge augmented AI technologies (e.g., language prompt, knowledge graph, neuro-symbolic learning).
Experience in agentic AI is a plus.
Experience in multimodal AI technologies for various multimodal applications.
Experience in on-device AI technologies such as lightweight model architecture design.
Proficiency in a neural network library (e.g., PyTorch, TensorFlow).
Proven track record of research/publications on machine learning and artificial intelligence field (NeurIPS, ICML, ICLR, AAAI, IJCAI, CVPR, ACL, EMNLP, NAACL, TACL, etc.)
"""

# Run the evaluation: prints the three metrics and returns them as a dict
# (the return value is not assigned here).
evaluate_resume(original_resume_path, optimized_resume_path, job_description)

Keyword Inclusion Score: 0.80
Semantic Similarity: 0.84
Impact-Driven Phrases: 5


{'keyword_score': 0.8,
 'semantic_similarity': np.float32(0.8382788),
 'impact_phrases': 5}