In [1]:
import getpass
import os

# Turn on LangSmith tracing for the LangChain calls made in this notebook.
os.environ["LANGSMITH_TRACING"] = "true"

# Only ask for the key when it is not already present in the environment, so a
# re-run of this cell (or a pre-exported key) does not prompt again. The prompt
# is labelled because a bare getpass() just shows a generic "Password: ".
if not os.environ.get("LANGSMITH_API_KEY"):
    os.environ["LANGSMITH_API_KEY"] = getpass.getpass("Enter your LangSmith API key: ")

 ········


In [2]:
from langchain_community.document_loaders import PyPDFLoader

In [3]:
# Absolute path to the source PDF (NCO 2015 stripped descriptions) on the
# author's machine. NOTE(review): this path is machine-specific; adjust it
# before running elsewhere.
file_path = r"C:\Users\bhushan\Desktop\dev1\semantic_st25\data\NCO_2015_stripped_descriptions.pdf"
loader = PyPDFLoader(file_path)

# Read the PDF into a list of Document objects (page number and source path
# end up in each document's metadata).
docs = loader.load()

In [24]:
# Show how many Document objects the PDF loader returned.
print(len(docs))

1152


In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking settings: 1000-character chunks that overlap by 200 characters.
# add_start_index=True stores each chunk's offset as "start_index" in its metadata.
splitter_options = dict(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break the loaded pages into chunks for embedding.
all_splits = text_splitter.split_documents(docs)

# Bare last expression: the notebook displays the number of chunks produced.
len(all_splits)

4313

In [5]:
from langchain_huggingface import HuggingFaceEmbeddings

# Hugging Face model used to embed both the PDF chunks and the search queries.
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# Sanity check: embed the first two chunks and confirm that both vectors have
# the same length.
first_chunk = all_splits[0]
vector_1 = embeddings.embed_query(first_chunk.page_content)
second_chunk = all_splits[1]
vector_2 = embeddings.embed_query(second_chunk.page_content)

vector_length = len(vector_1)
assert vector_length == len(vector_2)

# Report the dimensionality and peek at the first ten components.
print(f"Generated vectors of length {vector_length}\n")
print(vector_1[:10])

Generated vectors of length 768

[0.027572426944971085, -0.027625780552625656, 7.453797297785059e-05, -0.01742544025182724, -0.025777947157621384, 0.03375286981463432, 0.04299711808562279, 0.015540871769189835, 0.003127140225842595, -0.006702519953250885]


In [9]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

# Embed a throwaway string purely to learn the embedding dimensionality.
probe_vector = embeddings.embed_query("hello world")
embedding_dim = len(probe_vector)

# Flat L2-distance FAISS index sized to that dimensionality.
index = faiss.IndexFlatL2(embedding_dim)

# Start with an empty store: no documents and no index-to-id mapping yet.
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    index_to_docstore_id={},
    docstore=InMemoryDocstore(),
)

In [10]:
# Add every chunk to the vector store; the call's return value is kept in `ids`.
ids = vector_store.add_documents(documents=all_splits)

In [24]:
# Try one free-text query against the store and inspect the first result.
query = "what does a Bank Manager do"
results = vector_store.similarity_search_with_score(query)

# Each result is a (document, score) pair; look at the first one only.
top_hit = results[0]
doc, score = top_hit

print(f"Score: {score}\n")
print(doc)

Score: 0.8673990368843079

page_content='enterprise or organization, in consultation 
with Senior Managers and with Managers 
of other departments or sections. 
 
1211.0100 
General Manager, Bank 
 
General Manager, Bank organi zes, 
controls and supervi ses, within authority 
delegated, activities of a private or public 
bank or one or more of its departments or 
branches. Is designated according to 
authority delegated or work performed 
such as: AGENT (BANK) SUB -AGENT 
(BANK)' metadata={'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20250812142019', 'source': 'C:\\Users\\bhushan\\Desktop\\dev1\\semantic_st25\\data\\NCO_2015_stripped_descriptions.pdf', 'total_pages': 1152, 'page': 20, 'page_label': '21', 'start_index': 2391}


In [25]:
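# ---------------------------------------------------------------------------
# Next stage: generate a short job description for each occupation row.
# For every row, retrieve the closest PDF chunk from the vector store and
# have a local Ollama model summarise it.
# ---------------------------------------------------------------------------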

In [40]:
import pandas as pd
import ollama
import time
import sys

# Occupation catalogue; the loop below reads the columns "sno",
# "occupation_title", "division", "subdivision", "group" and "family".
df = pd.read_csv(r"C:\Users\bhushan\Desktop\dev1\semantic_st25\nco_data.csv")

# One generated description per processed row, in row order.
descriptions = []

# Work on an independent copy of the first 500 rows. head() hands back a slice
# of df, and adding a column to that slice is what raised pandas'
# SettingWithCopyWarning; .copy() makes the later assignment unambiguous.
test_df = df.head(500).copy()

start_time = time.time()

for _, row in test_df.iterrows():
    occupation_title = row["occupation_title"]
    division = row["division"]
    subdivision = row["subdivision"]
    group = row["group"]
    family = row["family"]

    # Retrieve the PDF chunks closest to the occupation title.
    results = vector_store.similarity_search_with_score(occupation_title)
    if not results:
        # Keep `descriptions` aligned with test_df even when nothing is found.
        descriptions.append("")
        continue

    # Only the first hit is used as context for the model.
    doc, score = results[0]
    chunk = doc.page_content.strip()

    prompt = f"""
    You are given a text chunk that may include the description of a specific job mixed with unrelated content.
    Your task is to provide a broad description of the job. Follow these pointers:
    - Identify and extract only information that directly describes the given job.
    - Summarize the job's purpose, nature of work, main duties, and essential skills.
    - Ignore unrelated or administrative details.
    - Rephrase in clear, professional language.
    - Keep it factual and concise (maximum 100 words).
    - No headings, formattings, or introductions.

    Occupation Title: {occupation_title}
    Division: {division}
    Subdivision: {subdivision}
    Group: {group}
    Family: {family}
    Text Chunk: {chunk}

    Clean, concise job description:
    """.strip()

    # Ask the local Ollama model to write the description from the prompt.
    resp = ollama.generate(
        model="gemma3:1b",
        prompt=prompt
    )

    descriptions.append(resp["response"].strip())

    # Single-line progress indicator: "\r" rewrites the same output line.
    elapsed = time.time() - start_time
    sys.stdout.write(f"\rRow {row['sno']} done | Elapsed: {elapsed:.2f} sec")
    sys.stdout.flush()

# Finish the progress line with a newline.
print()

# Attach the generated text and write the enriched rows to a new CSV.
test_df["description"] = descriptions
test_df.to_csv(r"C:\Users\bhushan\Desktop\dev1\semantic_st25\nco_data_with_desc_500_new.csv", index=False)


Row 500 done | Elapsed: 1719.85 sec


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["description"] = descriptions
