## Find similar Jobs


#### This notebook showcases how T5 (text-to-text transfer transformer) finds similarity between 2 texts.

### Import necessary libraries

In [1]:
import json
import os
import re

import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config

### Utility functions

In [16]:
# Pre-process the text.
# List markers that are stripped from (already lowercased) text. Every
# alphanumeric marker is anchored with \b so that only stand-alone tokens are
# removed, never the tail of an ordinary word or parenthetical.
_BULLET_PATTERN = re.compile(
    r"""
      \b\d+[.)]\s+        # numbered bullets such as "1. " or "10) "
    | \b[a-z]\)\s+        # lettered bullets such as "a) "
    | •\s+                # bullet glyph
    | \b[ivx]+[.)]\s+     # roman-numeral bullets such as "iv. " or "ii) "
    """,
    re.VERBOSE,
)


def normalize_text(text):
    """Lowercase ``text``, flatten it to one line and remove list markers.

    Parameters
    ----------
    text : str or None or float NaN
        Raw text. ``None`` and NaN (what pandas yields for an empty
        spreadsheet cell) are treated as empty text.

    Returns
    -------
    str
        The lowercased text with every run of whitespace (including line
        breaks) collapsed to a single space and bullet markers removed.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Collapse whitespace to single spaces instead of deleting newlines, so the
    # last word of one line is not glued to the first word of the next.
    text = re.sub(r"\s+", " ", text).strip().lower()
    text = _BULLET_PATTERN.sub("", text)
    return text.strip()

# Apply the transformers T5 model to score the similarity between 2 texts (score between 0 and 5)
def Find_similarity(row, reference_text=None, t5_model=None, t5_tokenizer=None):
    """Score how similar ``row["description"]`` is to a reference text with T5.

    Both texts are passed through ``normalize_text`` and combined into an
    ``"stsb sentence1: ... sentence2: ..."`` prompt; the text the model
    generates for that prompt is parsed as a number.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the job text under the ``"description"`` key.
    reference_text : str, optional
        Text to compare against. When omitted, the notebook-level variable
        ``text`` is used (the original behaviour).
    t5_model : optional
        Object exposing ``generate``. When omitted, the notebook-level
        variable ``model`` is used.
    t5_tokenizer : optional
        Object exposing ``encode`` / ``decode``. When omitted, the
        notebook-level variable ``tokenizer`` is used.

    Returns
    -------
    float
        The number decoded from the generated output.

    Raises
    ------
    NameError
        If an argument is omitted and the matching notebook-level variable
        does not exist.
    ValueError
        If the generated text cannot be parsed as a number.
    """
    # Prefer explicit arguments; fall back to the legacy notebook globals so
    # that existing one-argument calls keep working.
    if reference_text is None:
        reference_text = text
    if t5_model is None:
        t5_model = model
    if t5_tokenizer is None:
        t5_tokenizer = tokenizer

    prompt = (
        "stsb sentence1: " + normalize_text(reference_text)
        + " sentence2: " + normalize_text(row["description"])
    )
    input_ids = t5_tokenizer.encode(prompt, return_tensors="pt")
    output_ids = t5_model.generate(input_ids)      # semantic similarity

    # Strip special tokens so only the numeric text reaches float().
    decoded = t5_tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
    try:
        return float(decoded)
    except ValueError:
        raise ValueError(
            "T5 did not return a numeric similarity score: %r" % decoded
        ) from None

### Find top 10 similar jobs 

In [17]:
def find_top_10_similar_jobs(job_description, jobs_dataset):
    """Return the 10 jobs whose description is most similar to ``job_description``.

    Loads the ``t5-small`` model and tokenizer, reads the jobs spreadsheet,
    scores every row's ``description`` against ``job_description`` with an
    ``"stsb sentence1: ... sentence2: ..."`` prompt, and keeps the ten
    highest-scoring rows.

    Parameters
    ----------
    job_description : str
        Reference text that every job is compared against.
    jobs_dataset : str or path-like
        Excel file readable by ``pandas.read_excel``; it must contain a
        ``description`` column.

    Returns
    -------
    pandas.DataFrame
        The spreadsheet rows plus a ``Similarity_Quotient`` column, sorted by
        that column in descending order and limited to 10 rows.

    Raises
    ------
    ValueError
        If the model generates text that cannot be parsed as a number.
    """
    model = T5ForConditionalGeneration.from_pretrained('t5-small')
    tokenizer = T5Tokenizer.from_pretrained('t5-small')

    # The reference text is identical for every row, so normalise it once.
    reference = normalize_text(job_description)

    def _similarity_to_reference(description):
        """Score one job description against the reference text."""
        prompt = "stsb sentence1: " + reference + " sentence2: " + normalize_text(description)
        input_ids = tokenizer.encode(prompt, return_tensors="pt")
        output_ids = model.generate(input_ids)      # semantic similarity
        # Strip special tokens so only the numeric text reaches float().
        return float(tokenizer.decode(output_ids[0], skip_special_tokens=True))

    df_jobs = pd.read_excel(jobs_dataset)
    df_jobs['Similarity_Quotient'] = df_jobs['description'].apply(_similarity_to_reference)
    df_top_10_similar_jobs = df_jobs.sort_values(by='Similarity_Quotient', ascending=False).head(10)
    return df_top_10_similar_jobs

In [18]:
# Path to the jobs spreadsheet. Set the JOBS_DATASET_PATH environment variable
# to run the notebook on another machine; the original local path stays the
# default so existing setups keep working.
jobs_dataset = os.environ.get(
    "JOBS_DATASET_PATH",
    r"C:\Data_Science_Lab\Sai\Job recommendation engine\Job_Scrapping_Internshala_.xlsx",
)

# Reference job description that every job in the dataset is compared against.
job_description = """Selected intern's day-to-day responsibilities include: 
 
 1. Conducting research by implementing machine learning/deep learning models
 2. Handling management of dataset, training, testing, and analysis of models
 3. Reading and writing research papers
 4. Performing various experiments and analysis
 5. Writing technical reports
 6. Prototyping solutions
 7. Building systems around the deep learning models
 8. Building infrastructures for the deep learning-based systems
 9. Working on real-world problems and working with IITB Ph.D. students
"""

# Last expression of the cell, so the resulting table is displayed.
find_top_10_similar_jobs(job_description, jobs_dataset)

Some weights of the model checkpoint at t5-small were not used when initializing T5ForConditionalGeneration: ['encoder.block.0.layer.0.layer_norm.bias', 'encoder.block.0.layer.1.layer_norm.bias', 'encoder.block.1.layer.0.layer_norm.bias', 'encoder.block.1.layer.1.layer_norm.bias', 'encoder.block.2.layer.0.layer_norm.bias', 'encoder.block.2.layer.1.layer_norm.bias', 'encoder.block.3.layer.0.layer_norm.bias', 'encoder.block.3.layer.1.layer_norm.bias', 'encoder.block.4.layer.0.layer_norm.bias', 'encoder.block.4.layer.1.layer_norm.bias', 'encoder.block.5.layer.0.layer_norm.bias', 'encoder.block.5.layer.1.layer_norm.bias', 'encoder.final_layer_norm.bias', 'decoder.block.0.layer.0.layer_norm.bias', 'decoder.block.0.layer.1.layer_norm.bias', 'decoder.block.0.layer.2.layer_norm.bias', 'decoder.block.1.layer.0.layer_norm.bias', 'decoder.block.1.layer.1.layer_norm.bias', 'decoder.block.1.layer.2.layer_norm.bias', 'decoder.block.2.layer.0.layer_norm.bias', 'decoder.block.2.layer.1.layer_norm.bias

Unnamed: 0.1,Unnamed: 0,job_title,company_name,location,salary,description,Similarity_Quotient
1,2,Deep Learning,IIT Bombay,Work From Home,2000-4000 /month,Selected intern's day-to-day responsibilities ...,5.0
70,71,Artificial Intelligence (AI),Advanto Software,Work From Home,2000-2001 /month,Selected intern's day-to-day responsibilities ...,4.0
101,102,Data Science,KUKbit Software Lab,Pune,15000 /month,Selected intern's day-to-day responsibilities ...,4.0
51,52,Data Science/ML,Centre For Computational Technologies Private ...,Work From Home,8000 /month,Selected intern's day-to-day responsibilities ...,4.0
114,115,Machine Learning,Mavenai Technologies LTD,Work From Home,5000-8000 /month,Selected intern's day-to-day responsibilities ...,4.0
59,60,Data Science,SkillBit,Work From Home,5000 /month,Selected intern's day-to-day responsibilities ...,4.0
136,137,Data Science,SkillBit,Pune,5000 /month,Selected intern's day-to-day responsibilities ...,4.0
24,25,Machine Learning,TikiLIVE,Work From Home,12000 /month,Selected intern's day-to-day responsibilities ...,3.8
131,132,Machine Learning,Revelin7 Technology Private Limited,Chandigarh,10000 /month,Selected intern's day-to-day responsibilities ...,3.8
130,131,Machine Learning,SJTech Solutions,Work From Home,2500-3500 /month,Selected intern's day-to-day responsibilities ...,3.8
