In [528]:
#semantic-search-in-python

In [530]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
import ast
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from tqdm import tqdm
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ashis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ashis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [532]:
# Load the Excel file

df = pd.read_excel(r"data\\linkedin_job_posts_insights.xlsx")
# Initial data inspection

In [534]:
print("Shape:", df.shape)
print("Size:", df.size)
print("Columns:", df.columns)
print("Info:")
print(df.info())
print("Count of null values:\n", df.isnull().sum())
# Drop rows with null values
df_cleaned = df.dropna()
# Count after dropping nulls
print("Count after dropping null values:\n", df_cleaned.isnull().sum())

Shape: (31597, 9)
Size: 284373
Columns: Index(['job_title', 'company_name', 'location', 'hiring_status', 'date',
       'seniority_level', 'job_function', 'employment_type', 'industry'],
      dtype='object')
Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31597 entries, 0 to 31596
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   job_title        31571 non-null  object        
 1   company_name     30657 non-null  object        
 2   location         31588 non-null  object        
 3   hiring_status    31597 non-null  object        
 4   date             31597 non-null  datetime64[ns]
 5   seniority_level  30289 non-null  object        
 6   job_function     30007 non-null  object        
 7   employment_type  30006 non-null  object        
 8   industry         29586 non-null  object        
dtypes: datetime64[ns](1), object(8)
memory usage: 2.2+ MB
None
Count of null values:
 job_tit

In [540]:
def preprocess_text(text):
    text = str(text).lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)
def generate_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()
def extract_relevant_sentences(summary, query_embedding, tokenizer, model, k=2):
    sentences = summary.split(".")
    sentence_embeddings = [generate_embedding(sentence, tokenizer, model) for sentence in sentences]
    similarities = [cosine_similarity(query_embedding.reshape(1, -1),sentence_embedding.reshape(1, -1))[0, 0] for sentence_embedding in sentence_embeddings]
    top_k_indices = np.argsort(similarities)[-k:]
    return [sentences[index] for index in top_k_indices]
def search_authors_and_relevant_parts(df, query, tokenizer, model, n=3, k=2):
    query_embedding = generate_embedding(query, tokenizer, model)
    df["similarity"] = df["embedding"].apply(lambda x: cosine_similarity(x.reshape(1, -1), query_embedding.reshape(1, -1))[0, 0])
    top_n_results = df.sort_values("similarity", ascending=False).head(n)
    for _, row in top_n_results.iterrows():
        relevant_parts = extract_relevant_sentences(row["summary"],query_embedding, tokenizer, model, k)
        print(f"Job Title: {row['job_title']} (similarity: {row['similarity']:4f})")
        print(f"Relevant parts: {'.'.join(relevant_parts)}...")

In [542]:
model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [548]:
# Create a summary column with all relevant metadata concatenated
df_cleaned.loc[:, 'summary'] = df_cleaned.apply(
lambda x: f"{x['job_title']} {x['company_name']} {x['location']}{x['hiring_status']} {x['date']} {x['seniority_level']} {x['job_function']}{x['employment_type']} {x['industry']}", axis=1)
# Preprocess the summary text
df_cleaned.loc[:, "preprocessed_summary"] = df_cleaned["summary"].apply(preprocess_text)
# Generate embeddings and save them, avoiding comma errors
embeddings = []
for summary in tqdm(df_cleaned["preprocessed_summary"], desc="Generatingembeddings"):
    embedding = generate_embedding(summary, tokenizer, model)
    embeddings.append(embedding)
# Convert embeddings to a format that can be saved without comma errors
df_cleaned.loc[:, "embedding"] = embeddings
df_cleaned.loc[:, "embedding"] = df_cleaned["embedding"].apply(lambda x: ', '.join(map(str, x[0])))
# Save the DataFrame with embeddings to a CSV file
df_cleaned.to_csv("summaries_with_embeddings.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.loc[:, 'summary'] = df_cleaned.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.loc[:, "preprocessed_summary"] = df_cleaned["summary"].apply(preprocess_text)
Generatingembeddings: 100%|██████████████████████████████████████████████████████████████████████████████████| 27972/27972 [04:49<00:00, 96.79it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.

In [551]:
import pandas as pd
import numpy as np
# Load the CSV file
df_with_embeddings = pd.read_csv("summaries_with_embeddings.csv")
# Convert the embeddings from strings back to numpy arrays
df_with_embeddings["embedding"] = df_with_embeddings["embedding"].apply(
lambda x: np.array([float(num) for num in x.split(', ')])
)

In [555]:
from sklearn.metrics.pairwise import cosine_similarity
def search(query, df, tokenizer, model, top_n=5):
    # Preprocess the query
    preprocessed_query = preprocess_text(query)
    # Generate the embedding for the query
    query_embedding = generate_embedding(preprocessed_query, tokenizer,model)[0]
    # Compute cosine similarity between the query embedding and the job embeddings
    similarities = cosine_similarity([query_embedding], list(df["embedding"]))
    # Add similarity scores to the DataFrame
    df["similarity_score"] = similarities[0]
    # Get the top N most similar results
    top_n_indices = np.argsort(similarities[0])[-top_n:][::-1]
    top_n_results = df.iloc[top_n_indices]
    return top_n_results

In [561]:
while True:
    # Ask the user for input query
    query = input("Enter your query (or 'exit' to quit): ")
    # Check if the user wants to exit
    if query.lower() == 'exit':
        print("Exiting...")
        break
    # Perform semantic search using the search function
    results = search(query, df_with_embeddings, tokenizer, model)
# Display the top results
print(f"\nTop results for '{query}':")
for index, row in results.iterrows():
    print(f"Job Title: {row['job_title']}")
    print(f"Company: {row['company_name']}")
    print(f"Location: {row['location']}")
    print(f"Similarity Score: {row['similarity_score']:.4f}")
    print("-------------------------")


Enter your query (or 'exit' to quit):  Hey buddy help me find jobs related to Big Data Analytics in India country
Enter your query (or 'exit' to quit):  exit


Exiting...

Top results for 'exit':
Job Title: Interesting Job Opportunity: Senior Analyst - Data Science
Company: AOS
Location: Kolkata, West Bengal, India
Similarity Score: 0.6424
-------------------------
Job Title: Interesting Job Opportunity: Senior Analyst - R/SQL/Python
Company: AOS
Location: Warangal, Telangana, India
Similarity Score: 0.6403
-------------------------
Job Title: Big Data Engineer - E4504
Company: Nisum
Location: Hyderabad, Telangana, India
Similarity Score: 0.6322
-------------------------
Job Title: Interesting Job Opportunity: Full Stack Developer - React.js/Node.js
Company: Justdial
Location: Bengaluru, Karnataka, India
Similarity Score: 0.6150
-------------------------
Job Title: Interesting Job Opportunity: Full Stack Developer - React.js/Node.js
Company: Justdial
Location: Bengaluru, Karnataka, India
Similarity Score: 0.6150
-------------------------


In [563]:
def search(query, df, tokenizer, model, top_n=5):
    # Preprocess the query
    preprocessed_query = preprocess_text(query)
    # Generate the embedding for the query
    query_embedding = generate_embedding(preprocessed_query, tokenizer,model)[0]
    # Compute cosine similarity between the query embedding and the job embeddings
    similarities = cosine_similarity([query_embedding], list(df["embedding"]))
    # Add similarity scores to the DataFrame
    df["similarity_score"] = similarities[0]
# Get the top N most similar results
    top_n_indices = np.argsort(similarities[0])[-top_n:][::-1]
    top_n_results = df.iloc[top_n_indices]
    return top_n_results

In [565]:
while True:
    query = input("Enter your query (or 'exit' to quit): ").strip().lower()
    if query == 'exit':
        print("Exiting...")
        break
    if query:
        results = search(query, df_with_embeddings, tokenizer, model)
        if not results.empty:
            print(f"Top results for '{query}':")
            for index, result in results.iterrows():
                print(f"Job Title: {result['job_title']}")
                print(f"Company: {result['company_name']}")
                print(f"Location: {result['location']}")
                print(f"Similarity Score: {result['similarity_score']:.4f}")
                print("-------------------------")
        else:
            print("No results found.")
    else:
        print("Please enter a valid query.")

Enter your query (or 'exit' to quit):   Hey buddy , How are you? Can you help me find Data Scientist role . Also Data Analyst will also be fine.


Top results for 'hey buddy , how are you? can you help me find data scientist role . also data analyst will also be fine.':
Job Title: Lead Data Scientist
Company: Correlate Resources
Location: Melbourne, Victoria, Australia
Similarity Score: 0.6154
-------------------------
Job Title: Lead Data Scientist
Company: Correlate Resources
Location: Melbourne, Victoria, Australia
Similarity Score: 0.6154
-------------------------
Job Title: Lead Data Scientist
Company: Correlate Resources
Location: Melbourne, Victoria, Australia
Similarity Score: 0.6154
-------------------------
Job Title: Lead Data Scientist
Company: Correlate Resources
Location: Melbourne, Victoria, Australia
Similarity Score: 0.6039
-------------------------
Job Title: Lead Data Scientist
Company: Correlate Resources
Location: Melbourne, Victoria, Australia
Similarity Score: 0.6039
-------------------------


Enter your query (or 'exit' to quit):  exit


Exiting...


In [567]:
from sklearn.metrics.pairwise import cosine_similarity
def search(query, df, tokenizer, model, threshold=0.5):
    # Preprocess the query
    preprocessed_query = preprocess_text(query)
    # Generate the embedding for the query
    query_embedding = generate_embedding(preprocessed_query, tokenizer,model)[0]
    # Compute cosine similarity between the query embedding and the job␣↪embeddings
    similarities = cosine_similarity([query_embedding], list(df["embedding"]))
    # Add similarity scores to the DataFrame
    df["similarity_score"] = similarities[0]
    # Filter results based on the similarity score threshold
    matching_results = df[df["similarity_score"] >= threshold]
    # Sort results by similarity score in descending order
    matching_results = matching_results.sort_values(by="similarity_score",ascending=False)
    return matching_results

In [569]:
while True:
    query = input("Enter your query (or 'exit' to quit): ").strip().lower()
    if query == 'exit':
        print("Exiting...")
        break
    if query:
        results = search(query, df_with_embeddings, tokenizer, model,threshold=0.5)
        if not results.empty:
            print(f"Matching results for '{query}':")
            for index, result in results.iterrows():
                print(f"Job Title: {result['job_title']}")
                print(f"Company: {result['company_name']}")
                print(f"Location: {result['location']}")
                print(f"Similarity Score: {result['similarity_score']:.4f}")
                print("-------------------------")
            else:
                print("No results found.")
    else:
        print("Please enter a valid query.")

Enter your query (or 'exit' to quit):  Hey buddy i am looking for data analyst and data scientist role could you help me find roles in India


Matching results for 'hey buddy i am looking for data analyst and data scientist role could you help me find roles in india':
Job Title: 
            
        AI/ML Data Scientist
      
          
Company: 
            Experfy
          
Location: 
            Bengaluru, Karnataka, India
          
Similarity Score: 0.6197
-------------------------
Job Title: Interesting Job Opportunity: Senior Analyst - Data Science
Company: AOS
Location: Kolkata, West Bengal, India
Similarity Score: 0.6116
-------------------------
Job Title: Lead Data Scientist
Company: Correlate Resources
Location: Melbourne, Victoria, Australia
Similarity Score: 0.5996
-------------------------
Job Title: Lead Data Scientist
Company: Correlate Resources
Location: Melbourne, Victoria, Australia
Similarity Score: 0.5996
-------------------------
Job Title: Lead Data Scientist
Company: Correlate Resources
Location: Melbourne, Victoria, Australia
Similarity Score: 0.5996
-------------------------
Job Title: Researche

Enter your query (or 'exit' to quit):  exit


Exiting...
