# Fine-tuned LLM
The first approach is to use a fine-tuned LLM. This notebook demonstrates the process of fine-tuning a GPT model so that it performs only one task: generating LinkedIn messages.
### Import packages needed

In [4]:
import json
from collections import defaultdict
import os
import openai
import numpy as np
# !pip -q install datasets tiktoken openai
import tiktoken

### Define the paths

In [5]:
# Locate the training file: it sits in the `data` folder that is a sibling of
# the directory the notebook is run from.
current_dir = os.getcwd()
parent_dir = os.path.split(current_dir)[0]

data_folder = os.path.join(parent_dir, 'data')
file_path = os.path.join(parent_dir, 'data', 'content_generator_train.jsonl')

### Load the training data

In [6]:
# Parse the JSONL training file into a list of dicts, one per line.
dataset = []

# Read explicitly as UTF-8: the examples contain emoji, which a platform
# default encoding such as cp1252 cannot decode.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (e.g. a trailing newline at the end of the file);
        # json.loads raises on an empty string.
        if not line.strip():
            continue
        dataset.append(json.loads(line))

# Show the size and only the first example instead of dumping every record.
print(f"Loaded {len(dataset)} training examples")
dataset[:1]

[{'messages': [{'role': 'system',
    'content': 'You are a content generator that generates LinkedIn post for job seekers.'},
   {'role': 'user',
    'content': 'Generate a LinkedIn post introducing myself as a job seeker looking for opportunities as an executive assistant or paralegal. Emphasize my qualifications and preferences, including a Master of Science degree in Criminal Justice, proficiency in Microsoft 365 and Google Suite apps, organizational skills, and a desire for remote, full-time work with a minimum salary of $100k and comprehensive benefits. Mention the exclusion of Primerica, MLM, sales, or commission-only roles. Ask for community assistance in tagging hiring managers or TA professionals and express eagerness to connect and explore opportunities. Include relevant hashtags like #jobseeker, #executiveassistant, and #paralegal.'},
   {'role': 'assistant',
    'content': "Hello, LinkedIn! I hope everyone is enjoying their weekend☀️ /nI am a #jobseeker eager to become an 

In [3]:
# Format error checks
# Tally structural problems per category over the whole dataset so that every
# issue is reported in one pass rather than stopping at the first bad example.
format_errors = defaultdict(int)

for ex in dataset:
    # Every example must be a JSON object.
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    # A missing, empty, or non-list "messages" value cannot be inspected further.
    messages = ex.get("messages", None)
    if not messages or not isinstance(messages, list):
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        # A non-dict entry has no .get(); count it and move on instead of
        # aborting the whole validation run with AttributeError.
        if not isinstance(message, dict):
            format_errors["message_data_type"] += 1
            continue

        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1

        if any(k not in ("role", "content", "name") for k in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role", None) not in ("system", "user", "assistant"):
            format_errors["unrecognized_role"] += 1

        content = message.get("content", None)
        if not content or not isinstance(content, str):
            format_errors["missing_content"] += 1

    # Each example needs at least one assistant message.
    if not any(
        isinstance(message, dict) and message.get("role", None) == "assistant"
        for message in messages
    ):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [6]:
def num_tokens_from_messages(messages, model="gpt-3.5-turbo"):
    """Count the tokens used by a list of chat messages.

    Args:
        messages: Sequence of message dicts. Every value of every dict is
            passed to the tokenizer's ``encode``.
        model: Model name used to look up the tokenizer. Only
            "gpt-3.5-turbo" is supported.

    Returns:
        The sum of a fixed 4-token overhead per message, the encoded length
        of every value, minus 1 for each message carrying a "name" key,
        plus 2 tokens for the primed assistant reply.

    Raises:
        NotImplementedError: If ``model`` is not exactly "gpt-3.5-turbo".
    """
    # The tokenizer is resolved before the model check; a KeyError from the
    # lookup falls back to the cl100k_base encoding.
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")

    if model != "gpt-3.5-turbo":  # note: future models may deviate from this
        raise NotImplementedError(
            f"num_tokens_from_messages() is not presently implemented for model {model}.\n"
            "  See https://github.com/openai/openai-python/blob/main/chatml.md"
            " for information on how messages are converted to tokens."
        )

    total = 0
    for msg in messages:
        # every message follows <im_start>{role/name}\n{content}<im_end>\n
        total += 4
        total += sum(len(encoding.encode(text)) for text in msg.values())
        if "name" in msg:
            # if there's a name, the role is omitted (role is always 1 token)
            total -= 1
    # every reply is primed with <im_start>assistant
    return total + 2

model = "gpt-3.5-turbo"

# Count tokens for the last training example, read explicitly from `dataset`
# rather than from a variable left over from another cell's loop.
example_messages = dataset[-1]["messages"]
print(f"{num_tokens_from_messages(example_messages, model)} prompt tokens counted.")

302 prompt tokens counted.


In [7]:
# System prompt for test requests; the text is the same as the system message
# in the training examples.
system_message = "You are a content generator that generates LinkedIn post for job seekers."

In [None]:
# Read the API key from the environment and fail with a clear message when it
# is missing (assigning None into os.environ would raise an opaque TypeError).
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise EnvironmentError("OPENAI_API_KEY is not set; export it before running this cell.")
openai.api_key = api_key

# Upload the training file for fine-tuning. The context manager closes the
# file handle once the upload call has returned.
# NOTE(review): this uploads "train.jsonl" from the working directory, not the
# `file_path` file that was loaded and validated earlier — confirm they are the same data.
with open("train.jsonl", "rb") as training_file:
    res = openai.File.create(
        file=training_file,
        purpose='fine-tune'
    )
res

<File file id=file-1Ia34249kLTGei3Wmr3XGo83 at 0x10ae578f0> JSON: {
  "object": "file",
  "id": "file-1Ia34249kLTGei3Wmr3XGo83",
  "purpose": "fine-tune",
  "filename": "file",
  "bytes": 2343,
  "created_at": 1702426483,
  "status": "processed",
  "status_details": null
}

In [5]:
# Keep the id of the uploaded file from the upload response and display it.
file_id = res["id"]
file_id

'file-nhnLxoMisn0uQEzLufbW2ozJ'

In [6]:
# Start a fine-tuning job on the uploaded training file and show the job record.
res = openai.FineTuningJob.create(
    model="gpt-3.5-turbo",
    training_file=file_id,
)
res

<FineTuningJob fine_tuning.job id=ftjob-i1TPtyQnADlJpBrJ5r9YjLiN at 0x10498a390> JSON: {
  "object": "fine_tuning.job",
  "id": "ftjob-i1TPtyQnADlJpBrJ5r9YjLiN",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1700194191,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-ViQITQ1IoZfuzh7CgKfIWIe7",
  "result_files": [],
  "status": "validating_files",
  "validation_file": null,
  "training_file": "file-nhnLxoMisn0uQEzLufbW2ozJ",
  "hyperparameters": {
    "n_epochs": "auto",
    "batch_size": "auto",
    "learning_rate_multiplier": "auto"
  },
  "trained_tokens": null,
  "error": null
}

In [12]:
# Fetch the current state of the fine-tuning job created above.
job_id = res["id"]
response = openai.FineTuningJob.retrieve(job_id)

# `fine_tuned_model` is null until the job has finished successfully, so stop
# here with a clear message rather than handing None to the cells below.
ft_model = response["fine_tuned_model"]
if ft_model is None:
    raise RuntimeError(
        f"Fine-tuning job {job_id} has no model yet (status: {response['status']}). "
        "Re-run this cell once the job has succeeded."
    )

# Display the job record (a bare expression only renders as the last statement).
response


In [13]:
from langchain.chat_models import ChatOpenAI

# LangChain chat client configured with the fine-tuned model name.
llm = ChatOpenAI(model_name=ft_model, temperature=0.3)

In [12]:
# Build the test conversation: the system prompt followed by one user request.
user_message = "Create a LinkedIn post express openess to work. The person is looking for data scientist summer internship. The post should highlight python, SQL, R skills. Also, it should mention ability to manipulate big data as well as machine learning experiences. Include relevant hashtags."
test_messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_message},
]

print(test_messages)

[{'role': 'system', 'content': 'You are a content generator that generates LinkedIn post for job seekers.'}, {'role': 'user', 'content': 'Create a LinkedIn post express openess to work. The person is looking for data scientist summer internship. The post should highlight python, SQL, R skills. Also, it should mention ability to manipulate big data as well as machine learning experiences. Include relevant hashtags.'}]


In [22]:

# Ask the fine-tuned model to answer the test conversation.
response = openai.ChatCompletion.create(
    model=ft_model, messages=test_messages, temperature=0, max_tokens=500
)
choice = response["choices"][0]
print(choice["message"]["content"])

# A finish_reason of "length" means the reply hit max_tokens and was cut off;
# say so instead of presenting a truncated post as complete.
if choice.get("finish_reason") == "length":
    print("\n[warning] Output was truncated at max_tokens; raise the limit to get the full post.")

🌟 #JobSeeker #OpenToOpportunities #CareerSearch #LinkedInNetworking 🌟

🔎 Seeking a Data Scientist Summer Internship 🔍

📚 Are you looking for a talented and motivated data scientist intern? Look no further! I am currently open to exciting opportunities in the field of data science for this summer. 🌞

🔬 With a strong background in Python, SQL, and R, I am equipped with the necessary technical skills to tackle complex data challenges. 💻 My ability to manipulate big data sets and extract valuable insights sets me apart from the crowd. 📊

💡 Additionally, I have hands-on experience in machine learning, enabling me to develop predictive models and drive data-driven decision-making. 🤖

🌐 If you know of any organizations or individuals seeking a passionate and dedicated data scientist intern, please feel free to reach out. Let's connect and explore potential opportunities together! 🤝

#DataScience #InternshipOpportunity #Python #SQL #R #BigData #MachineLearning #DataDrivenDecisionMaking #Linked

In [13]:
# Token count of the test prompt (system + user message).
prompt_token_count = num_tokens_from_messages(test_messages, model)
print(f"{prompt_token_count} prompt tokens counted.")

77 prompt tokens counted.
