# This notebook generates embeddings for the job offers dataset

In [1]:
import os
import pickle

import openai
import pandas as pd
from openai.embeddings_utils import (
    get_embedding,
    distances_from_embeddings,
    tsne_components_from_embeddings,
    chart_from_components,
    indices_of_nearest_neighbors_from_distances,
)
# Read the API key from the OPENAI_API_KEY environment variable so the secret
# never has to be typed into (and committed with) the notebook. The original
# placeholder is kept as the fallback when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", 'Put your api key here')
# Embedding model name used as the default throughout the notebook.
EMBEDDING_MODEL = "text-embedding-ada-002"

In [2]:
# Load the cleaned job-offers dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../data/clean/techmap-jobs-cleaned.csv')

In [8]:
# Keep only the first 2000 rows of the dataset.
df_sampled = df.iloc[:2000]

In [9]:
# Location of the pickled embedding cache.
embedding_cache_path = '../data/embeddings/recommendation_embeddings.pkl'

# Load the cache written by a previous run, or start with an empty one.
# NOTE(review): unpickling can execute arbitrary code; only load a cache file
# that this notebook produced.
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    embedding_cache = {}

# Create the target directory if needed: without it the write below raises
# FileNotFoundError on a fresh checkout where ../data/embeddings is missing.
os.makedirs(os.path.dirname(embedding_cache_path), exist_ok=True)
with open(embedding_cache_path, "wb") as embedding_cache_file:
    pickle.dump(embedding_cache, embedding_cache_file)

def get_embedding(text, model="text-embedding-ada-002"):
    """Request the embedding of ``text`` through ``openai.Embedding.create``.

    Newlines in ``text`` are replaced with spaces before the request is made.
    Returns the ``embedding`` field of the first entry in the response's
    ``data`` list.

    NOTE(review): this definition shadows the ``get_embedding`` imported from
    ``openai.embeddings_utils`` at the top of the notebook.
    """
    single_line_text = text.replace("\n", " ")
    response = openai.Embedding.create(input=[single_line_text], model=model)
    first_item = response['data'][0]
    return first_item['embedding']

def embedding_from_string(
    string: str,
    model: str = EMBEDDING_MODEL,
    embedding_cache=embedding_cache
) -> list:
    """Return embedding of given string, using a cache to avoid recomputing.

    Args:
        string: Text to embed.
        model: Embedding model name; part of the cache key.
        embedding_cache: Mapping from ``(string, model)`` to embedding. The
            default is the notebook-level cache, which is mutated in place.

    Returns:
        The embedding stored in the cache for ``(string, model)``.

    On a cache miss the embedding is fetched with ``get_embedding`` and the
    whole cache is re-pickled to ``embedding_cache_path``.
    """
    cache_key = (string, model)
    if cache_key not in embedding_cache:
        embedding_cache[cache_key] = get_embedding(string, model)
        # Write to a temporary file and then swap it in, so that interrupting
        # the dump (e.g. stopping the kernel during a long run) cannot leave a
        # truncated cache file behind and lose the embeddings already stored.
        temporary_path = embedding_cache_path + ".tmp"
        with open(temporary_path, "wb") as embedding_cache_file:
            pickle.dump(embedding_cache, embedding_cache_file)
        os.replace(temporary_path, embedding_cache_path)
    return embedding_cache[cache_key]
    


In [15]:
# Embed the `text` column of the first 5000 rows. Each new string is stored in
# the embedding cache; strings already cached are returned without a new request.
embeddings = [
    embedding_from_string(job_text, model=EMBEDDING_MODEL)
    for job_text in df[:5000]['text'].tolist()
]

In [5]:
def print_recommendations_from_strings(
    df: pd.DataFrame,
    query: str,
    k_nearest_neighbors: int = 5,
    model=EMBEDDING_MODEL,
) -> list[dict]:
    """Print and return the job offers whose text is closest to a query string.

    Args:
        df: Job offers. The columns ``text``, ``position``, ``orgCompany``,
            ``orgAddress``, ``salary`` and ``url`` are read.
        query: String the offers are compared against.
        k_nearest_neighbors: Maximum number of recommendations to print and return.
        model: Embedding model name passed to ``embedding_from_string``.

    Returns:
        One dict per printed recommendation, in the order given by
        ``indices_of_nearest_neighbors_from_distances``, with the keys
        ``Job Title``, ``Company``, ``Located in``, ``Job Description``,
        ``Salary`` and ``Url``.
    """
    # Label used in the output -> column of `df` the value is read from.
    columns_by_label = {
        "Job Title": "position",
        "Company": "orgCompany",
        "Located in": "orgAddress",
        "Job Description": "text",
        "Salary": "salary",
        "Url": "url",
    }

    # get embeddings for all strings
    embeddings = [embedding_from_string(string, model=model) for string in df['text'].tolist()]
    # get the embedding of the source string
    query_embedding = embedding_from_string(query, model=model)
    # get distances between the source embedding and other embeddings (function from embeddings_utils.py)
    distances = distances_from_embeddings(query_embedding, embeddings, distance_metric="cosine")
    # get indices of nearest neighbors (function from embeddings_utils.py)
    indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)

    # print out source string
    print(f"Source string: {query}")

    jobs = []
    # print out its k nearest neighbors
    for rank, i in enumerate(indices_of_nearest_neighbors, start=1):
        # stop after printing out k recommendations
        if rank > k_nearest_neighbors:
            break

        # `i` is a position in `embeddings`, i.e. a row position in `df`, so the
        # lookup must be positional (.iloc). Label-based `df[col][i]` returns the
        # wrong row or raises KeyError when `df` has a non-default index.
        job = {label: df[column].iloc[i] for label, column in columns_by_label.items()}

        # print out the recommended offer
        print(
            f"""
        --- Recommendation #{rank} (nearest neighbor {rank} of {k_nearest_neighbors}) ---
        Job Title: {job['Job Title']}
        Company: {job['Company']}
        Located in: {job['Located in']}
        Job Description: {job['Job Description']}
        Salary: {job['Salary']}
        Url: {job['Url']}
        """
        )
        jobs.append(job)

    return jobs


In [38]:
# Print and collect up to 100 recommendations for the query "Project Manager"
# among the sampled job offers.
jobs = print_recommendations_from_strings(
    df_sampled,
    "Project Manager",
    k_nearest_neighbors=100,
    model=EMBEDDING_MODEL,
)

Source string: Education National School of Mines of Nancy, Big Data Specialization Courses : Data Analysis, Machine Learning, Deep Learning, Statistical Inference, Optimization, Operations Research Faculty of Sciences at the University of Lorraine,  M2 in parallel with Mines Nancy : Mathematical Engineering for Data Science  Courses: Databases and Information Systems, Statistics for High-Dimensional Data Preparatory Classes for Grandes Écoles- Lycée Lissane Eddine Ibn Al-Khatib /  Groupe Scolaire Al Qalam, Physics and Mathematics Projets Personnalised Chatbot Creating a personalized chatbot named "AchrafBot" based on my academic and professional background involves: •    Creating Elaborate Prompts for GPT-3.5: Developing detailed prompts for GPT-3.5 to form a context-aware chatbot. These prompts  should encompass my academic and professional history, all organized within a single text. •    Developing a Conversational Interface using HTML/JavaScript: Building a conversational interfac