In [1]:
#pip install PyPDF2 nltk scikit-learn pandas

In [2]:
import PyPDF2 

In [5]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text()
        return text

In [20]:
import re

def extract_area_of_interest_section(text):
    # Search for "Area of Interest" and capture subsequent lines
    pattern = r"Area of Interest\s*(.*?)\s*(?=\n[A-Z]|$)"  # Match until next capitalized section or end of text
    
    # Use re.DOTALL to ensure multiline match
    match = re.search(pattern, text, re.DOTALL)
    
    if match:
        return match.group(1).strip()  # Return the extracted content without leading/trailing spaces
    else:
        return ""  # Return an empty string if no match is found

In [7]:
import nltk
#nltk.download('punkt')
from nltk.tokenize import word_tokenize

def tokenize_text(section_text):
    tokens = word_tokenize(section_text)
    return [word for word in tokens if word.isalpha()]

[nltk_data] Downloading package punkt to C:\Users\ANIRUTH
[nltk_data]     SINGHA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
def process_pdf_for_interests(pdf_path):
    text = extract_text_from_pdf(pdf_path)
    area_of_interest_text = extract_area_of_interest_section(text)
    
    if area_of_interest_text:
        tokens = tokenize_text(area_of_interest_text)
        print("Extracted Tokens:", tokens)
    else:
        print("No 'Area of Interest' section found.")


In [10]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
import nltk

#nltk.download('punkt')

# Function to tokenize and clean words
def tokenize_text(text):
    tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize
    return set(word for word in tokens if word.isalpha())  # Keep only alphabetic tokens

# Function to extract 'Area of Interest' section from text
def extract_area_of_interest_section(text):
    pattern = r"Area of Interest.?([\s\S]?)(?:\n\n|\Z)"  # Match until a blank line or end of file
    match = re.search(pattern, text, re.IGNORECASE)
    if match:
        return match.group(1).strip()
    return ""

# Function to match keywords with a dataset
def match_keywords_with_dataset(area_of_interest_text, dataset_path):
    # Extract and tokenize keywords from 'Area of Interest'
    extracted_keywords = tokenize_text(area_of_interest_text)
    print("Extracted Keywords:", extracted_keywords)

    # Load dataset
    df = pd.read_excel(dataset_path)  # Assuming Excel file with 'required skill set' column
    if 'required skill set' not in df.columns:
        raise ValueError("The dataset must have a 'required skill set' column.")

    # Match keywords with 'required skill set' in each row
    matched_rows = []
    for index, row in df.iterrows():
        skill_set_keywords = tokenize_text(row['required skill set'])
        common_keywords = extracted_keywords.intersection(skill_set_keywords)
        
        if common_keywords:  # If any keywords match
            matched_rows.append((index, row['required skill set'], list(common_keywords)))
    
    # Display matched results
    if matched_rows:
        print("\nMatched Keywords:")
        for idx, skills, common in matched_rows:
            print(f"Row {idx}: Required Skill Set -> '{skills}' | Matched Keywords: {common}")
    else:
        print("No matching skills found.")

# # Example usage
# pdf_text = """
# John's Resume

# Area of Interest:
# Machine Learning, Data Science, Artificial Intelligence, Python Programming

# Education:
# Bachelor of Technology in Computer Science
# """

# # Path to dataset file
# dataset_path = "skill_training_dataset.xlsx"  # Replace with your dataset file path

# # Extract 'Area of Interest' section
# area_of_interest_text = extract_area_of_interest_section(pdf_text)

# # Match keywords with the dataset
# match_keywords_with_dataset(area_of_interest_text, dataset_path)

In [13]:
corpus = extract_text_from_pdf("./computer_engineering_cv2.pdf")

In [21]:
extracted_corpus = extract_area_of_interest_section(corpus)

In [22]:
print(extracted_corpus)

My key areas of interest lie in software development, particularly full-stack development, where I can
combine front-end and back-end technologies to create robust applications. I am passionate about
exploring networking, cloud computing, and automation tools to enhance system efficiency. I enjoy
problem-solving and seek opportunities to work on innovative projects that involve coding, testing,
and deployment while utilizing emerging technologies.


In [29]:
import pandas as pd

def match_skills_with_percentage(text, dataset):
    """
    Matches skills from a given text to the Req_Skills column in a dataset 
    and returns the Training_Name(s) with their percentage of matching skills.

    Args:
        text (str): The input text containing skills to match.
        dataset (pd.DataFrame): The dataset containing Training_Name and Req_Skills columns.

    Returns:
        pd.DataFrame: A DataFrame with Training_Name and matching percentage, sorted by percentage.
    """
    # Tokenize the input text into a set of skills
    input_skills = set(word.strip().lower() for word in text.replace(',', ' ').split())

    # List to store results
    results = []

    # Iterate through the dataset and compute the match percentage
    for _, row in dataset.iterrows():
        training_name = row['Training_Name']
        req_skills = set(word.strip().lower() for word in row['Req_Skills'].replace(',', ' ').split())

        # Calculate the intersection and percentage of match
        matched_skills = input_skills & req_skills
        total_skills = len(req_skills)
        matched_percentage = (len(matched_skills) / total_skills) * 100 if total_skills > 0 else 0

        # Append the result
        results.append({
            'Training_Name': training_name,
            'Matched_Skills': ', '.join(matched_skills),
            'Matching_Percentage': round(matched_percentage, 2)
        })

    # Convert results to a DataFrame
    results_df = pd.DataFrame(results)

    # Sort the results by Matching_Percentage in descending order
    sorted_results = results_df.sort_values(by='Matching_Percentage', ascending=False).reset_index(drop=True)

    return sorted_results


# # Example Usage
# text = """
# My key areas of interest lie in software development, particularly full-stack development, where I can 
# combine front-end and back-end technologies to create robust applications. I am passionate about 
# exploring networking, cloud computing, and automation tools to enhance system efficiency. I enjoy 
# problem-solving and seek opportunities to work on innovative projects that involve coding, testing, 
# and deployment while utilizing emerging technologies.
# """

# # Simulated dataset based on the provided data
# data = {
#     'Training_Name': [
#         'Python Programming Basics', 'Advanced Python', 'Data Analysis with Python',
#         'Machine Learning Essentials', 'Deep Learning Specialization', 'DevOps Fundamentals',
#         'AWS Cloud Practitioner', 'Azure Fundamentals'
#     ],
#     'Req_Skills': [
#         'Python, VS Code, PyCharm, pip, virtualenv, Jupyter',
#         'Python, Decorators, Generators, Asyncio, Django, Flask',
#         'Pandas, NumPy, Matplotlib, Seaborn, Jupyter, SciPy',
#         'Scikit-learn, NumPy, Pandas, TensorFlow, PyTorch, Jupyter',
#         'TensorFlow, Keras, PyTorch, Neural Networks, GPUs, CUDA',
#         'Docker, Kubernetes, Jenkins, CI/CD, Git, Ansible',
#         'AWS, S3, EC2, CloudFormation, IAM, Lambda',
#         'Azure, ARM Templates, Azure DevOps, Virtual Machines, App Services'
#     ]
# }

# # Convert to DataFrame
# df = pd.DataFrame(data)

# # Call the function
# matched_trainings = match_skills_with_percentage(text, df)

# # Output the results
# print("Training Matches with Percentage:")
# print(matched_trainings)


In [30]:
df = pd.read_csv("minor.csv", encoding='latin1')

In [33]:
matched_trainings = match_skills_with_percentage(extracted_corpus, df)

In [36]:
print("Training Matches with Percentage:")
print(matched_trainings)

Training Matches with Percentage:
                      Training_Name        Matched_Skills  Matching_Percentage
0                  Software Testing   testing, automation                40.00
1                            DevOps     tools, automation                33.33
2              Software Engineering  development, testing                33.33
3     Cloud Automation with Ansible            automation                33.33
4                   Ethical Hacking  development, testing                33.33
...                             ...                   ...                  ...
1066                     Smart Grid                                       0.00
1067            Thermal Engineering                                       0.00
1068             Autonomous Systems                                       0.00
1069  Advanced Chemical Engineering                                       0.00
1070         Social Media Analytics                                       0.00

[1071 rows x 3 co