In [14]:
%load_ext autoreload
%autoreload 2

from loguru import logger
import os
from mistralai.client import MistralClient
from mistralai.models.jobs import WandbIntegrationIn, TrainingParameters

from mistral_fine_tuning.utils import read_fine_tuning_file
from mistral_fine_tuning.reformat import reformat_jsonl

from dotenv import load_dotenv
load_dotenv()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


True

In [15]:
df = read_fine_tuning_file('../data/interim/jokes_fine_tuning.jsonl')

[32m2024-06-26 23:39:19.803[0m | [31m[1mERROR   [0m | [36mmistral_fine_tuning.utils[0m:[36mprocess_keywords[0m:[36m57[0m - [31m[1mError processing keywords: invalid syntax. Perhaps you forgot a comma? (<string>, line 1)[0m


In [16]:
def extract_first_element(row):
    return row['keywords'].split(',')[0].replace('[', '')

# Apply the function to the DataFrame
df['first_keyword'] = df.apply(extract_first_element, axis=1)

In [17]:
df.head()

Unnamed: 0,text,keywords,first_keyword
0,"Oye, fjate que llega un indio al mdico y qu pa...","['indio', 'mdico', 'Toro Sentado', 'enfermo', ...",'indio'
1,y me dio qu le pas vena una delegacin de turno...,"['delegacin de turnos', 'Viña del Mar', 'bus d...",'delegacin de turnos'
2,Entonces el gua turstico si ustedes miran a la...,"['gua turstico', 'izquierda', 'derecha', 'quin...",'gua turstico'
3,y llega el tema del medio que le dice doctor m...,"['luna de viernes', 'seora', 'tres pechos', 'd...",'luna de viernes'
4,conversando amigo y uno le dice la poblacin un...,"['Viagra', 'robo', 'medicamento', 'polica', 'h...",'Viagra'


In [18]:
def create_messages(row):
    messages = [
        {
            "role": "system",
            "content": "You are a world-class comedy writer specializing in Chilean humor. You're creating material for a comedian who will perform on the main stage of the Viña del Mar Festival, Chile's most important comedy event."
        },
        {
            "role": "user",
            "content": "I have added a feature that forces you to response only in `locale=es` and consider only chilean spanish.",
        },
        {
            "role": "assistant",
            "content": "Understood thank you. From now I will only response with `locale=es`",
        },
        {
            "role": "user",
            "content": "Write a joke in Chilean Spanish based on the following keyword: " + row['first_keyword'] + "."
        },
        {
            "role": "assistant",
            "content": row['text']
        }
    ]

    return messages

In [19]:
df['messages'] = df.apply(create_messages, axis=1)

In [20]:
df.loc[0, 'messages']

[{'role': 'system',
  'content': "You are a world-class comedy writer specializing in Chilean humor. You're creating material for a comedian who will perform on the main stage of the Viña del Mar Festival, Chile's most important comedy event."},
 {'role': 'user',
  'content': 'I have added a feature that forces you to response only in `locale=es` and consider only chilean spanish.'},
 {'role': 'assistant',
  'content': 'Understood thank you. From now I will only response with `locale=es`'},
 {'role': 'user',
  'content': "Write a joke in Chilean Spanish based on the following keyword: 'indio'."},
 {'role': 'assistant',
  'content': 'Oye, fjate que llega un indio al mdico y qu pasa nuestro gran jefe de Toro Sentado estar enfermo ah dice Y qu tiene Gran Jefe Toro Sentado Gran Jefe Toro Sentado a tomarse dos frascos de viagra hgalo pasar llmelo todo parado venir'}]

In [21]:
df_train=df.sample(frac=0.995,random_state=200)
df_eval=df.drop(df_train.index)

df_train.to_json("../data/processed/jokes_train_b.jsonl", orient="records", lines=True)
df_eval.to_json("../data/processed/jokes_eval_b.jsonl", orient="records", lines=True)

In [22]:
reformat_jsonl("../data/processed/jokes_train_b.jsonl")
reformat_jsonl("../data/processed/jokes_eval_b.jsonl")

In [25]:
api_key = os.environ.get("MISTRAL_API_KEY")
client = MistralClient(api_key=api_key)

with open("../data/processed/reformatted_jokes_train_b.jsonl", "rb") as f:
    reformatted_jokes_train_b = client.files.create(file=("reformatted_jokes_train_b.jsonl", f))
with open("../data/processed/reformatted_jokes_eval_b.jsonl", "rb") as f:
    reformatted_jokes_eval_b = client.files.create(file=("reformatted_jokes_eval_b.jsonl", f))

In [26]:
wandb_api_key = os.environ.get("WANDB_API_KEY")

created_jobs = client.jobs.create(
    model="open-mistral-7b",
    training_files=[reformatted_jokes_train_b.id],
    validation_files=[reformatted_jokes_eval_b.id],
    hyperparameters=TrainingParameters(
        training_steps=300,
        learning_rate=0.0001,
    ),
    integrations=[
        WandbIntegrationIn(
            project="mistral_fine_tuning_api",
            run_name="test b",
            api_key=wandb_api_key,
        ).model_dump()
    ]
)