In [2]:
# Importing required libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


In [3]:
# Load the clustering input CSV into memory and preview its first five rows
clustered_dataset = pd.read_csv(filepath_or_buffer='jd_data_for_clustering.csv')
clustered_dataset.head()

Unnamed: 0,profile,tech_stack,tobeclusteredcolumn
0,DS,"[Team management, Prototype, Database design, ...","DS [Team management, Prototype, Database desig..."
1,DS,"[Computer science, Training, Web technologies,...","DS [Computer science, Training, Web technologi..."
2,DS,"[deep learning, Statistical analysis, data sci...","DS [deep learning, Statistical analysis, data ..."
3,DS,"[RCA, Software design, Version control, Coding...","DS [RCA, Software design, Version control, Cod..."
4,DS,"[Data Science, Python, Machine Learning, Model...","DS [Data Science, Python, Machine Learning, Mo..."


In [8]:
# Take an independent copy of the base dataframe for further use.
# A plain assignment would only alias `clustered_dataset`, so the 'cluster'
# column added to `transformed_df` later would also appear on the base frame.
transformed_df = clustered_dataset.copy()

In [10]:
# Cluster the rows of `transformed_df` on the TF-IDF representation of their
# 'tobeclusteredcolumn' text. `vectorizer` and `kmeans` are kept as notebook
# globals because later cells reuse the fitted objects on new data.

# Create a TF-IDF vectorizer with default settings
vectorizer = TfidfVectorizer()

# Fit the vectorizer on 'tobeclusteredcolumn' and build the TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(transformed_df['tobeclusteredcolumn'])

# Use one cluster per distinct value in the 'profile' column
n_clusters = len(transformed_df['profile'].unique())

# Perform K-Means clustering with the specified number of clusters.
# n_init is pinned to 10 (the default this run was using) so the result does
# not change when the library default changes, and so the warning about the
# implicit default is no longer emitted.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
transformed_df['cluster'] = kmeans.fit_predict(tfidf_matrix)

# Display the DataFrame with the assigned clusters
print(transformed_df)


  super()._check_params_vs_input(X, default_n_init=10)


     profile                                         tech_stack  \
0         DS  [Team management, Prototype, Database design, ...   
1         DS  [Computer science, Training, Web technologies,...   
2         DS  [deep learning, Statistical analysis, data sci...   
3         DS  [RCA, Software design, Version control, Coding...   
4         DS  [Data Science, Python, Machine Learning, Model...   
...      ...                                                ...   
4094    JAVA  [Java, MySQL, Spring Boot, Microservices, Deve...   
4095    JAVA  [Java Spring Boot, Jwt, Redis, Spring Boot, SQ...   
4096    JAVA  [Java, MySQL, Spring Boot, Microservices, Desi...   
4097    JAVA  [Core Java, Design Patterns, Spring Boot, Micr...   
4098    JAVA  [Java, MVC Framework, Spring Boot, Fullstack D...   

                                    tobeclusteredcolumn  cluster  
0     DS [Team management, Prototype, Database desig...        1  
1     DS [Computer science, Training, Web technologi...      

In [12]:
# Cross-tabulate profile against assigned cluster: non-null row counts per pair
cluster_counts = transformed_df.groupby(by=['profile', 'cluster']).count()
print(cluster_counts)
# Export of the clustered dataset is disabled in this cell
# transformed_df.to_csv('clustereddataset.csv',index=False)

                 tech_stack  tobeclusteredcolumn
profile cluster                                 
DB      0              1063                 1063
        1                20                   20
        2                 1                    1
        3               156                  156
DO      0                 2                    2
        1                 4                    4
        2                 1                    1
        3               913                  913
DS      0                 1                    1
        1               659                  659
        3                40                   40
JAVA    0                 3                    3
        1                 4                    4
        2              1024                 1024
        3               208                  208


In [14]:
# Cross-tabulate profile against assigned cluster: non-null row counts per pair
cluster_counts = transformed_df.groupby(by=['profile', 'cluster']).count()
print(cluster_counts)
# Persist the clustered dataset locally, without the positional index
transformed_df.to_csv(path_or_buf='clustereddataset_from_kmeans.csv', index=False)

                 tech_stack  tobeclusteredcolumn
profile cluster                                 
DB      0              1063                 1063
        1                20                   20
        2                 1                    1
        3               156                  156
DO      0                 2                    2
        1                 4                    4
        2                 1                    1
        3               913                  913
DS      0                 1                    1
        1               659                  659
        3                40                   40
JAVA    0                 3                    3
        1                 4                    4
        2              1024                 1024
        3               208                  208


In [16]:
# Validation part: load the resume data that will be run through the clusters
resumedata = pd.read_csv(filepath_or_buffer='ResumeValidator-ResumeData.csv')
resumedata

Unnamed: 0,Profile,tech_stack
0,Ramkumar DO,"['Automation', 'Azure Cloud', 'Cicd Pipeline',..."
1,Kamal DO,"['DevOps', 'VMware', 'Nginx', 'JBoss', 'Config..."
2,Anirudh Yadav DS,"['Machine learning', 'Python', 'NLP', 'Analyti..."
3,Sumit Mujumdar DS,"['Machine learning', 'Python', 'NLP', 'Analyti..."
4,Prem Java,"['Java', 'Hibernate', 'Spring Boot']"
5,Gaurav Java,"['Hibernate', 'Front end', 'Agile', 'J2Ee', 'H..."
6,Anurag Java,"['Java', 'Fullstack Development', 'Servlets / ..."
7,Ravindra DB,"['Ops', 'Mongodb Dba', 'Mongo Ops Manager', 'D..."
8,Shamsh DB,"['Smart scan', 'Query Optimization', 'Root Cau..."
9,Anil DB,"['Database Administration', 'SQL Server', 'Uni..."


In [17]:
# Vectorize the resume 'tech_stack' text with the already-fitted vectorizer.
# Only transform() is used here: refitting on the resume data would change the
# features that the fitted K-Means model expects.
# NOTE(review): the vectorizer was fitted on 'tobeclusteredcolumn', which also
# carries the profile label, whereas only 'tech_stack' is vectorized here;
# confirm this difference between fit and predict inputs is intended.
new_tfidf_matrix = vectorizer.transform(resumedata['tech_stack'])

# Predict the cluster for the new input data using the trained K-Means model
resumedata['predicted_cluster'] = kmeans.predict(new_tfidf_matrix)

# Display the new input data with predicted clusters
print(resumedata)

# index=False matches the 'clustereddataset_from_kmeans.csv' export and keeps
# a spurious 'Unnamed: 0' column out of the file when it is read back.
resumedata.to_csv('newlogicoutput.csv', index=False)

             Profile                                         tech_stack  \
0        Ramkumar DO  ['Automation', 'Azure Cloud', 'Cicd Pipeline',...   
1           Kamal DO  ['DevOps', 'VMware', 'Nginx', 'JBoss', 'Config...   
2   Anirudh Yadav DS  ['Machine learning', 'Python', 'NLP', 'Analyti...   
3  Sumit Mujumdar DS  ['Machine learning', 'Python', 'NLP', 'Analyti...   
4          Prem Java               ['Java', 'Hibernate', 'Spring Boot']   
5        Gaurav Java  ['Hibernate', 'Front end', 'Agile', 'J2Ee', 'H...   
6        Anurag Java  ['Java', 'Fullstack Development', 'Servlets / ...   
7        Ravindra DB  ['Ops', 'Mongodb Dba', 'Mongo Ops Manager', 'D...   
8          Shamsh DB  ['Smart scan', 'Query Optimization', 'Root Cau...   
9            Anil DB  ['Database Administration', 'SQL Server', 'Uni...   

   predicted_cluster  
0                  3  
1                  3  
2                  1  
3                  1  
4                  2  
5                  3  
6            