In [1]:
# Import necessary libraries
import requests
import pandas as pd
import time
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import os
from IPython.display import display

from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Initialize the T5 model and tokenizer
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Initialize the SentenceTransformer for embeddings
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Efficient and suitable for this task

# Section 2: Data Collection from APIs

# Adzuna API credentials
ADZUNA_APP_ID = '6298fc50'
ADZUNA_APP_KEY = 'bc2646c5a0a78e7a9f1cdde1f307650c'

def fetch_job_data(query, location, page=1, results_per_page=50):
    """
    Fetch job data from Adzuna API.
    """
    url = f'https://api.adzuna.com/v1/api/jobs/gb/search/{page}'
    params = {
        'app_id': ADZUNA_APP_ID,
        'app_key': ADZUNA_APP_KEY,
        'results_per_page': results_per_page,
        'what': query,
        'where': location,
        'content-type': 'application/json'
    }
    response = requests.get(url, params=params)
    if response.status_code == 200:
        return response.json()
    elif response.status_code == 429:
        print("Rate limit exceeded. Waiting for 60 seconds.")
        time.sleep(60)
        return fetch_job_data(query, location, page, results_per_page)
    else:
        print(f"Failed to fetch data: {response.status_code}")
        return {}

# Fetch job data
JOB_QUERY = 'data scientist'
LOCATION = 'London'
PAGES_TO_FETCH = 2  # Adjust as needed

jobs_data = []

for page in range(1, PAGES_TO_FETCH + 1):
    print(f"Fetching page {page}...")
    data = fetch_job_data(JOB_QUERY, LOCATION, page)
    jobs = data.get('results', [])
    jobs_data.extend(jobs)
    time.sleep(1)  # To respect API rate limits

print(f"Total jobs fetched: {len(jobs_data)}")

# Convert to DataFrame
df_jobs = pd.DataFrame(jobs_data)
display(df_jobs.head())

# Load O*NET Data
ONET_FILE_PATH = 'Occupation Data.txt'  # Replace with your actual file path

if not os.path.exists(ONET_FILE_PATH):
    print(f"File {ONET_FILE_PATH} not found. Please download it from https://www.onetcenter.org/database.html and place it in the notebook directory.")
else:
    df_occupations = pd.read_csv(ONET_FILE_PATH, sep='\t', encoding='utf-8')
    display(df_occupations.head())

    # Section 3: Building the Knowledge Base

    documents = []

    # Process Adzuna Job Data
    for index, row in df_jobs.iterrows():
        title = row.get('title', 'N/A')
        location = row.get('location', {}).get('display_name', 'N/A') if isinstance(row.get('location', {}), dict) else 'N/A'
        description = row.get('description', 'No description provided.')
        content = f"Job Title: {title}\nLocation: {location}\nDescription: {description}"
        documents.append({'content': content, 'meta': {'source': 'Adzuna'}})

    # Process O*NET Occupation Data
    if 'Title' in df_occupations.columns and 'Description' in df_occupations.columns:
        for index, row in df_occupations.iterrows():
            occupation_title = row.get('Title', 'N/A')
            occupation_description = row.get('Description', 'No description provided.')
            content = f"Occupation Title: {occupation_title}\nDescription: {occupation_description}"
            documents.append({'content': content, 'meta': {'source': 'O*NET'}})
    else:
        print("Expected columns 'Title' and 'Description' not found in O*NET data.")
    
    print(f"Total documents prepared: {len(documents)}")

    # Extract document contents
    doc_texts = [doc['content'] for doc in documents]

    # Generate embeddings
    print("Generating document embeddings...")
    embeddings = embedding_model.encode(doc_texts, convert_to_numpy=True, show_progress_bar=True)

    # Build FAISS index
    embedding_dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(embedding_dim)  # Using L2 distance
    index.add(embeddings)

    print(f"FAISS index has {index.ntotal} vectors.")

    # Section 4: Generating Responses

    def generate_response(user_input, top_k=5):
        """
        Generate a response based on user input using the T5 model with retrieved context.
        """
        # Embed the user input
        query_embedding = embedding_model.encode([user_input], convert_to_numpy=True)
        
        # Search for top_k similar documents
        distances, indices = index.search(query_embedding, top_k)
        
        # Retrieve the top_k documents
        relevant_docs = [doc_texts[idx] for idx in indices[0]]
        
        # Combine the relevant documents into the context
        context = "\n".join(relevant_docs)
        
        # Create the prompt
        prompt = f"Based on the following documents, answer the question: {user_input}\nDocuments:\n{context}"
        
        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
        
        # Generate response
        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=150)
        
        return tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Section 5: Creating a Chatbot Interface

    def chat():
        """
        Simple command-line chatbot interface.
        """
        print("Welcome to the Career Guidance Chatbot!")
        print("Type 'exit' to quit.\n")
        while True:
            user_input = input("You: ")
            if user_input.lower() in ['exit', 'quit']:
                print("Chatbot: Goodbye!")
                break
            response = generate_response(user_input)
            print(f"Chatbot: {response}\n")

    # Section 6: Testing the Chatbot

    if __name__ == "__main__":
        chat()





You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Fetching page 1...
Fetching page 2...
Total jobs fetched: 100


Unnamed: 0,id,title,category,salary_min,company,latitude,location,__CLASS__,salary_is_predicted,created,salary_max,contract_type,adref,redirect_url,description,longitude,contract_time
0,4849700272,Senior Data Scientist,"{'tag': 'it-jobs', '__CLASS__': 'Adzuna::API::...",75000.0,"{'display_name': 'ADLIB Recruitment', '__CLASS...",51.451818,{'__CLASS__': 'Adzuna::API::Response::Location...,Adzuna::API::Response::Job,0,2024-09-03T00:00:03Z,75000.0,permanent,eyJhbGciOiJIUzI1NiJ9.eyJpIjoiNDg0OTcwMDI3MiIsI...,https://www.adzuna.co.uk/jobs/land/ad/48497002...,Senior Data Scientist Leading Financial Servic...,-0.02806,
1,4840002558,Data Scientist,"{'label': 'IT Jobs', '__CLASS__': 'Adzuna::API...",52070.58,{'__CLASS__': 'Adzuna::API::Response::Company'...,51.503378,{'__CLASS__': 'Adzuna::API::Response::Location...,Adzuna::API::Response::Job,1,2024-08-26T15:01:04Z,52070.58,,eyJhbGciOiJIUzI1NiJ9.eyJzIjoibmtHNzh5RjM3eEc5V...,https://www.adzuna.co.uk/jobs/land/ad/48400025...,"Data Scientist (STEM Background, Python) - Hyb...",-0.139134,
2,4854081270,Data Scientist,"{'label': 'IT Jobs', '__CLASS__': 'Adzuna::API...",71161.93,"{'display_name': 'Formula Recruitment', '__CLA...",51.503378,{'__CLASS__': 'Adzuna::API::Response::Location...,Adzuna::API::Response::Job,1,2024-09-06T08:42:24Z,71161.93,contract,eyJhbGciOiJIUzI1NiJ9.eyJpIjoiNDg1NDA4MTI3MCIsI...,https://www.adzuna.co.uk/jobs/land/ad/48540812...,Data Scientist | Up to £400 per day Outside IR...,-0.139134,
3,4866121274,Data Scientist,{'__CLASS__': 'Adzuna::API::Response::Category...,74070.98,"{'display_name': 'Gloo', '__CLASS__': 'Adzuna:...",51.503378,"{'area': ['UK', 'London'], '__CLASS__': 'Adzun...",Adzuna::API::Response::Job,1,2024-09-17T05:44:26Z,74070.98,,eyJhbGciOiJIUzI1NiJ9.eyJpIjoiNDg2NjEyMTI3NCIsI...,https://www.adzuna.co.uk/jobs/land/ad/48661212...,A rapidly expanding AI-powered customer analyt...,-0.139134,
4,4847376731,Data Scientist,{'__CLASS__': 'Adzuna::API::Response::Category...,70384.73,{'__CLASS__': 'Adzuna::API::Response::Company'...,51.503378,{'__CLASS__': 'Adzuna::API::Response::Location...,Adzuna::API::Response::Job,1,2024-09-01T12:02:01Z,70384.73,,eyJhbGciOiJIUzI1NiJ9.eyJzIjoibmtHNzh5RjM3eEc5V...,https://www.adzuna.co.uk/jobs/land/ad/48473767...,"My client, a leading insurer, is seeking a Dat...",-0.139134,


Unnamed: 0,O*NET-SOC Code,Title,Description
0,11-1011.00,Chief Executives,Determine and formulate policies and provide o...
1,11-1011.03,Chief Sustainability Officers,"Communicate and coordinate with management, sh..."
2,11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ..."
3,11-1031.00,Legislators,"Develop, introduce, or enact laws and statutes..."
4,11-2011.00,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici..."


Total documents prepared: 1116
Generating document embeddings...


Batches:   0%|          | 0/35 [00:00<?, ?it/s]

FAISS index has 1116 vectors.
Welcome to the Career Guidance Chatbot!
Type 'exit' to quit.

Chatbot: False

Chatbot: True

Chatbot: False

Chatbot: False

Chatbot: False

Chatbot: True



KeyboardInterrupt: Interrupted by user