In [15]:
# Importing required libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


In [9]:
# Load the cleaned job-description data from the CSV in the working directory
source_csv = 'JD-Data_FP1_clean.csv'
clustered_dataset = pd.read_csv(source_csv)

# Preview the first five rows (head() defaults to 5)
clustered_dataset.head()

Unnamed: 0,Profile,job_title,experience,salary,location,job_description,tech_stack
0,DO,DevOps Engineer,7-11 Yrs,12-20 Lacs PA,,DevOps EngineerDevelopment of the full life cy...,"[Automation, Azure Cloud, Cicd Pipeline, SQL Q..."
1,DO,Senior DevOps Engineer,5-7 Yrs,Not disclosed,,Skill Sets . 5+ years of experience working on...,"[DevOps, VMware, Nginx, JBoss, Configuration m..."
2,DO,Senior Devops Engineer,3-8 Yrs,Not disclosed,,AWS: Working experience and a good understandi...,"[Terraform, Kubernates, AWS, Java, Aws Devops,..."
3,DO,Azure Devops Engineer,4-9 Yrs,10-20 Lacs PA,,Role & responsibilities Good exp in Azure devo...,"[Azure Devops, Azure Kubernetes Service, Cicd ..."
4,DO,DevOps Engineer - AWS,5-10 Yrs,"50,000-60,000 PA",,Hands on Experience on writing pipeline as cod...,"[continuous integration, kubernetes, nexus, te..."


In [11]:
# Splitting the tech stack to rows for clustering
#
# Each posting keeps its technologies in one bracketed, comma-separated string
# (e.g. "[Automation, Azure Cloud, ...]"). This cell produces one
# (Profile, tech_stack) row per technology and removes exact duplicates.
transformed_df = (
    clustered_dataset[['Profile', 'tech_stack']]
    .assign(
        # A missing tech_stack is treated as an empty string so that .split()
        # cannot fail on NaN; the resulting blank token is filtered out below.
        tech_stack=lambda df: df['tech_stack'].fillna('').astype(str).str.split(',')
    )
    # One row per list element; Profile is repeated for each technology
    .explode('tech_stack')
    # explode() repeats the source row label, so renumber to get unique labels
    # before anything downstream assigns columns by index
    .reset_index(drop=True)
    # Drop the surrounding list brackets first, then any padding whitespace
    .assign(tech_stack=lambda df: df['tech_stack'].str.strip('[]').str.strip())
    # Blank tokens (from "[]", missing values or stray commas) are not technologies
    .loc[lambda df: df['tech_stack'] != '']
    # Remove duplicate rows
    .drop_duplicates()
)

# Display the transformed DataFrame without duplicates
print(transformed_df)

      Profile                   tech_stack
0          DO                   Automation
1          DO                  Azure Cloud
2          DO                Cicd Pipeline
3          DO                  SQL Queries
4          DO                      Jenkins
...       ...                          ...
31955      DB           Oracle clusterware
31956      DB               RAC Management
31957      DB      DB lifecycle management
31958      DB  Oracle Dataguard Management
31969      DB                      Postgre

[5251 rows x 2 columns]


In [13]:
# Add a text column of the form "<Profile> <tech_stack>" for every row
profile_and_tech = transformed_df[['Profile', 'tech_stack']]
transformed_df['tobeclusteredcolumn'] = profile_and_tech.apply(' '.join, axis=1)

transformed_df

Unnamed: 0,Profile,tech_stack,tobeclusteredcolumn
0,DO,Automation,DO Automation
1,DO,Azure Cloud,DO Azure Cloud
2,DO,Cicd Pipeline,DO Cicd Pipeline
3,DO,SQL Queries,DO SQL Queries
4,DO,Jenkins,DO Jenkins
...,...,...,...
31955,DB,Oracle clusterware,DB Oracle clusterware
31956,DB,RAC Management,DB RAC Management
31957,DB,DB lifecycle management,DB DB lifecycle management
31958,DB,Oracle Dataguard Management,DB Oracle Dataguard Management


In [17]:

# Cluster the individual technologies by their text.
# Operates on `transformed_df` (one row per Profile / technology pair) and
# adds an integer 'cluster' column to it.
# NOTE(review): the frame also carries 'tobeclusteredcolumn' (Profile + tech),
# but only 'tech_stack' is vectorised here - confirm which column was intended.

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Transform the 'tech_stack' column into a TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(transformed_df['tech_stack'])

# Use one cluster per distinct profile
n_clusters = len(transformed_df['Profile'].unique())

# Perform K-Means clustering with the specified number of clusters.
# n_init is passed explicitly: leaving it unset produces the n_init warning
# recorded in this cell's output, and the library default is version-dependent,
# which would make the assigned clusters change between scikit-learn releases.
# 10 is the value the recorded run used (default_n_init=10).
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
transformed_df['cluster'] = kmeans.fit_predict(tfidf_matrix)

# Display the DataFrame with the assigned clusters
print(transformed_df)


      Profile                   tech_stack             tobeclusteredcolumn  \
0          DO                   Automation                   DO Automation   
1          DO                  Azure Cloud                  DO Azure Cloud   
2          DO                Cicd Pipeline                DO Cicd Pipeline   
3          DO                  SQL Queries                  DO SQL Queries   
4          DO                      Jenkins                      DO Jenkins   
...       ...                          ...                             ...   
31955      DB           Oracle clusterware           DB Oracle clusterware   
31956      DB               RAC Management               DB RAC Management   
31957      DB      DB lifecycle management      DB DB lifecycle management   
31958      DB  Oracle Dataguard Management  DB Oracle Dataguard Management   
31969      DB                      Postgre                      DB Postgre   

       cluster  
0            0  
1            0  
2           

  super()._check_params_vs_input(X, default_n_init=10)


In [18]:
# Sanity check: list the distinct cluster labels that were assigned
assigned_cluster_ids = transformed_df['cluster'].unique()
print(assigned_cluster_ids)

[0 1 3 2]


In [19]:
# Persist the clustered rows to a CSV in the working directory.
# index=False leaves the DataFrame's row labels out of the file.
output_csv = 'clustereddataset.csv'
transformed_df.to_csv(output_csv, index=False)