In [13]:
#!/usr/bin/env python
# coding: utf-8

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
nltk.download('stopwords')
import ast

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anirudhyadav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:

# Load the job-description data, then show a five-row preview and the column summary.
jd_csv_path = 'jd_data_for_clustering.csv'
clustered_dataset = pd.read_csv(jd_csv_path)
preview_rows = clustered_dataset.head(5)
print(preview_rows)
clustered_dataset.info()

  profile                                         tech_stack  \
0      DS  [Team management, Prototype, Database design, ...   
1      DS  [Computer science, Training, Web technologies,...   
2      DS  [deep learning, Statistical analysis, data sci...   
3      DS  [RCA, Software design, Version control, Coding...   
4      DS  [Data Science, Python, Machine Learning, Model...   

                                 tobeclusteredcolumn  
0  DS [Team management, Prototype, Database desig...  
1  DS [Computer science, Training, Web technologi...  
2  DS [deep learning, Statistical analysis, data ...  
3  DS [RCA, Software design, Version control, Cod...  
4  DS [Data Science, Python, Machine Learning, Mo...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4099 entries, 0 to 4098
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   profile              4099 non-null   object
 1   tech_stack           4099 n

In [4]:
# `clustered_dataset` (loaded above from 'jd_data_for_clustering.csv') is expected to have 'profile' and 'tech_stack' columns
# dataset = pd.read_csv('jd_data_for_clustering.csv')

def cleanJD(tech_stack):
    """Strip social-media noise and punctuation from a tech-stack string.

    Steps, in order:
      1. replace the standalone tokens ``RT`` and ``cc`` with a space,
      2. delete hashtags (``#`` plus the following non-space run),
      3. replace mentions (``@`` plus the following non-space run) with spaces,
      4. replace ASCII punctuation with a space,
      5. replace non-ASCII characters with a space,
      6. collapse every whitespace run into a single space.

    Args:
        tech_stack: The raw text to clean (a ``str``).

    Returns:
        The cleaned string. Leading/trailing whitespace is collapsed to at
        most one space but is not stripped.
    """
    # Word boundaries keep ordinary words intact: without them "account"
    # would lose its "cc" and become "a ount".
    tech_stack = re.sub(r'\b(?:RT|cc)\b', ' ', tech_stack)  # remove RT and cc
    tech_stack = re.sub(r'#\S+', '', tech_stack)  # remove hashtags
    tech_stack = re.sub(r'@\S+', '  ', tech_stack)  # remove mentions
    # Raw string so that the backslash in the punctuation set is a literal
    # backslash rather than an invalid escape sequence.
    tech_stack = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', tech_stack)  # remove punctuations
    tech_stack = re.sub(r'[^\x00-\x7f]', ' ', tech_stack)  # remove non-ASCII characters
    tech_stack = re.sub(r'\s+', ' ', tech_stack)  # remove extra whitespace
    return tech_stack

# Text cleaning function
def _clean_text_resources():
    """Return the shared ``(stop_words, stemmer)`` pair, building it on first use."""
    cached = getattr(_clean_text_resources, "_cached", None)
    if cached is None:
        cached = (frozenset(stopwords.words('english')), PorterStemmer())
        _clean_text_resources._cached = cached
    return cached


def clean_text(text):
    """Normalise a piece of text for bag-of-words matching.

    The text is lower-cased, every character that is not ``a``-``z`` or
    whitespace is deleted (not replaced by a space, so ``"CI/CD"`` becomes
    ``"cicd"``), English stop words are dropped, and the remaining words are
    Porter-stemmed.

    Args:
        text: The text to clean. Non-string values (for example ``None`` or a
            NaN from an empty CSV cell) are treated as empty text.

    Returns:
        The cleaned words joined by single spaces; ``""`` when nothing is left.
    """
    if not isinstance(text, str):
        # A missing cell has no words; returning "" avoids an AttributeError
        # on ``.lower()``.
        return ""

    # The stop-word list and the stemmer do not depend on the input, so they
    # are built once and reused instead of being recreated on every call.
    stop_words, stemmer = _clean_text_resources()

    # Lower-case first so the character filter only has to know about a-z.
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    # Drop stop words and stem the survivors in a single pass.
    return ' '.join(
        stemmer.stem(word)
        for word in letters_only.split()
        if word not in stop_words
    )

# Derive the cleaned tech-stack column by running clean_text over every raw tech_stack value
raw_tech_stack = clustered_dataset['tech_stack']
clustered_dataset['tech_stack_cleaned'] = raw_tech_stack.map(clean_text)

# Function to find the role that best matches the cleaned input list of skills
def find_matching_role(input_skills, clustered_dataset, similarity_threshold=0.3):
    """Return the profile whose combined tech stack best matches the given skills.

    Every profile's cleaned tech stacks are concatenated into one document,
    the documents and the cleaned input are turned into word-count vectors,
    and the profile with the highest cosine similarity to the input wins.

    Args:
        input_skills: Iterable of skill strings; each one is passed through
            ``clean_text`` before matching.
        clustered_dataset: DataFrame with a ``'profile'`` column and a
            ``'tech_stack_cleaned'`` column. It is not modified.
        similarity_threshold: Minimum cosine similarity the best profile must
            reach to be reported. Defaults to ``0.3``.

    Returns:
        The best matching profile value, or the string
        ``"No matching role found"`` when the best similarity is below the
        threshold or there is nothing to compare.
    """
    no_match = "No matching role found"

    # Combine the input skills into a single string after cleaning
    input_skills_text = " ".join(clean_text(skill) for skill in input_skills)
    if not input_skills_text.strip():
        # No usable words in the input: similarity to every role would be 0.
        return no_match

    # One combined document per profile. sort=False keeps profiles in order of
    # first appearance, so ties in similarity resolve to the profile that
    # appears first in the dataset.
    role_documents = (
        clustered_dataset
        .groupby('profile', sort=False)['tech_stack_cleaned']
        .agg(" ".join)
    )
    if role_documents.empty:
        return no_match

    # Row 0 of the count matrix is the input; the remaining rows are the roles.
    count_matrix = CountVectorizer().fit_transform(
        [input_skills_text] + role_documents.tolist()
    )

    # Calculate the cosine similarity between the input and every role
    cosine_similarities = cosine_similarity(count_matrix[0], count_matrix[1:])[0]

    # Find the role with the highest similarity
    best_matching_role_index = cosine_similarities.argmax()

    # Report the role only if it meets the similarity threshold
    if cosine_similarities[best_matching_role_index] >= similarity_threshold:
        return role_documents.index[best_matching_role_index]
    return no_match

In [5]:
# Spot-check five cleaned rows; the fixed seed makes the preview reproducible across re-runs.
print(clustered_dataset['tech_stack_cleaned'].sample(5, random_state=42))

2345             sql dba profession databas administr sql
1262                        devop veracod devop tool tool
3651    spring boot hibern core java spring mvc produc...
3106    java jenkin angularj css ms sql docker reactj ...
580     autom sa infrastructur manag applic develop da...
Name: tech_stack_cleaned, dtype: object


### Manual Testing

In [12]:

# Example usage 1
# Alternative input to try:
# input_skills = ['Machine learning', 'Python', 'NLP', 'Analytical', 'TensorFlow', 'Natural language processing']
input_skills = ['Automation', 'Azure Cloud', 'Cicd Pipeline', 'SQL Queries', 'Jenkins', 'GIT', 'Docker', 'Ansible']
print(input_skills)
matching_role = find_matching_role(input_skills, clustered_dataset)

# Display the matching role or a message if no matching role is found
print(f"The input skills match the role: {matching_role}")





<class 'list'>
The input skills match the role: DO
<class 'list'>
The input skills match the role:  DO


In [44]:
# Example usages 2-7: run several hand-written skill lists through the matcher.
# Keys are the example numbers; each value is the skill list for that example.
example_skill_sets = {
    # Placeholder tokens -- presumably meant to exercise the "no match" path.
    2: ['xxx', 'yyy', 'aaa', 'bbb', 'ccc','ddd'],
    3: ['DevOps', 'VMware', 'Nginx', 'JBoss', 'Configuration management', 'Linux', 'Docker', 'Terraform'],
    4: ['Hibernate', 'Front end', 'Agile', 'J2Ee', 'HTML','microservices'],
    # NOTE(review): identical to example 3 -- kept so the printed output is unchanged.
    5: ['DevOps', 'VMware', 'Nginx', 'JBoss', 'Configuration management', 'Linux', 'Docker', 'Terraform'],
    6: ['Ops', 'Mongodb Dba', 'Mongo Ops Manager', 'DBMS', 'MongoDB', 'Database administration', 'MongoDB Database', 'Management'],
    7: ['Smart scan', 'Query Optimization', 'Root Cause Analysis', 'Exadata', 'ZDLRA', 'ZFS', 'Sql Performance Tuning', 'Performance Tuning'],
}

# Matching role per example number, kept for later inspection.
example_matches = {}
for example_number, example_skills in example_skill_sets.items():
    example_matches[example_number] = find_matching_role(example_skills, clustered_dataset)

    # Display the matching role
    print(f"The input skills match the role: {example_matches[example_number]}")

### Automated Input Testing

In [45]:
# Load the resume dataset (one row per resume, with its listed skills) from the local CSV
resume_csv_path = 'ResumeValidator-ResumeData.csv'
resumedata = pd.read_csv(resume_csv_path)
first_resume_row = resumedata.head(1)
print(first_resume_row)

       Profile                                         tech_stack
0  Ramkumar DO  ['Automation', 'Azure Cloud', 'Cicd Pipeline',...


In [18]:
# Match every resume in the dataset against the known roles.
# Only the two columns that are needed are iterated, which avoids building a
# full row object (and an unused index) per resume as iterrows() does.
for profile, tech_stack in zip(resumedata['Profile'], resumedata['tech_stack']):
    # The CSV stores each skill list as text (e.g. "['Automation', ...]");
    # parse it back from a string into a list.
    input_skills_list = ast.literal_eval(tech_stack)
    matching_role = find_matching_role(input_skills_list, clustered_dataset)

    # Display the matching role or a message if no matching role is found
    print(f"The input skills match the role: {profile} is {matching_role}")


The input skills match the role: Ramkumar DO is DO
The input skills match the role: Kamal DO is DO
The input skills match the role: Anirudh Yadav DS is DS
The input skills match the role: Sumit Mujumdar DS is DS
The input skills match the role: Prem Java is JAVA
The input skills match the role: Gaurav Java is No matching role found
The input skills match the role: Anurag Java is JAVA
The input skills match the role: Ravindra DB is DB
The input skills match the role: Shamsh DB is No matching role found
The input skills match the role: Anil DB is DB
The input skills match the role: Sharat is No matching role found
