## Joke Maker
### Use Mistral to extract keywords from joke

#### Load jokes into csv file

In [12]:
import pandas as pd
df = pd.read_csv('dad_jokes.csv',index_col=0)
df[:3]

Unnamed: 0,joke
0,A steak pun is a rare medium well done.
1,They say that breakfast is the most important ...
2,What do you get if you cross an angry sheep wi...


#### Load Mistral LLM

In [13]:

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
check='mistralai/Mistral-7B-Instruct-v0.2'
model = AutoModelForCausalLM.from_pretrained(check,torch_dtype=torch.float16).to('cuda')
tokenizer = AutoTokenizer.from_pretrained(check)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

1

In [25]:
jokes_list = list(df['joke'][1:])
def encode_sentences_batch(sentences, batch_size=16):
    # Define prompt prefix and ending
    prompt_prefix = '\n<s>[INST] Instruction: Give only a list of topics that the given joke talks about in the same line? Do not give any explanation or suggestions after the answer. Joke: A steak pun is a rare medium well done.[/INST]\nTopics: [steak, pun]<s>'
    inst = '\n[INST]Instruction: Give only a list of topics that the given joke talks about in the same line? Do not give any explanation or suggestions after the answer. Joke: '
    ending = '[/INST]\nTopics: '

    # Tokenize and encode the prompts in batches
    prompts = [prompt_prefix+ inst + sentence + ending for sentence in sentences]
    return prompts

prompts = encode_sentences_batch(jokes_list)


#### For each prompt, generate keywords using Mistral

In [311]:
# make a batch of prompts and then run the following code for each of these batches and save data along with processing of each batch.
from tqdm import tqdm
import re
batch_size=8
counter=1
joke_key_dict_list=[]

for index, prompt in enumerate(prompts):
    if counter%500==0:
        #save result somewhere
        df_save = pd.DataFrame(joke_key_dict_list)
        df_save.to_csv('joke_keywords.csv', mode='a', index=False, header=False)
        joke_key_dict_list = []
        file2 = open('status.txt','a')
        file2.write(f'{counter} out of {len(prompts)}\n')
        file2.close()

    model_inputs = tokenizer(prompt, padding=True, truncation=True, return_tensors="pt").to("cuda")
    generated_ids = model.generate(**model_inputs, max_new_tokens=20,)
    output = tokenizer.batch_decode(generated_ids)[0]   
    result = output.split('\n')[-1].strip()

    joke_key_dict={'joke':jokes_list[index],'keywords':result}
    joke_key_dict_list.append(joke_key_dict)
    counter+=1

#### Load the newly generated file full of keywords

In [184]:
import pandas as pd
df_joke_words = pd.read_csv('joke_keywords.csv', names=['Joke','Topics'])
df_joke_words[:3]

Unnamed: 0,Joke,Topics
0,They say that breakfast is the most important ...,"[breakfast, meal, day, poison, antidote]</s>"
1,What do you get if you cross an angry sheep wi...,"Topics: [angry sheep, moody cow, animal, baaa..."
2,An apple a day keeps the doctor away. At least...,"Topics: [apple, doctor, away, throw]</s>"


In [185]:
df_joke_words['Topics'] = df_joke_words['Topics'].fillna('')

In [186]:
type(df_joke_words)

pandas.core.frame.DataFrame

In [187]:
df_list = []
for element in df_joke_words.itertuples():
    joke = element[1]
    topics=element[2]
        
    if topics is None or not all(char in topics for char in (', ', '[')):
        # df_list.append([element[1], element[2], ''])  # if comma and opening square bracket doesn't exist, then add nothing
        continue
    topics = element[2].split(', ')
    topics_list=[]
    topics_list.append(topics[0][topics[0].index('[')+1:])  # remove opening square bracket from the first element

    if topics[-1].__contains__(']'):
        topics_list.append(topics[-1][:topics[-1].index(']')])  # remove closing square bracket from the last element
    else:
        topics_list.append(topics[-1])
    for topic in topics[1:-1]:  # iterate on the remaining elements
        topics_list.append(topic.strip())
    df_list.append([element[1], element[2], topics_list])  # add everything for the new dataframe


In [196]:
# df_list_new = pd.DataFrame(df_list)
df_list_new[:3]

Unnamed: 0,0,1,2
0,They say that breakfast is the most important ...,"[breakfast, meal, day, poison, antidote]</s>","[breakfast, antidote, meal, day, poison]"
1,What do you get if you cross an angry sheep wi...,"Topics: [angry sheep, moody cow, animal, baaa...","[angry sheep, baaad mooood, moody cow, animal]"
2,An apple a day keeps the doctor away. At least...,"Topics: [apple, doctor, away, throw]</s>","[apple, throw, doctor, away]"


## Finetuning Mistral on this data

### Split df into test and train

In [190]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df_list_new, test_size=0.2)

# rename columns
train.columns=['target','unwanted','text']
test.columns=['target','unwanted','text']

In [267]:
def add_instruct(examples):
    
    inst = '<s><INST>Write a joke from the given list of topics</INST>\nTopics: [angry sheep, baaad mooood, moody cow, animal]\nJoke: What do you get if you cross an angry sheep with a moody cow? An animal that\’s in a baaaaad mooood.<s>'
    next_inst = '<INST>Write a joke from the given list of topics</INST>\nTopics: '
    ending = 'Joke: '
    new_prompts = [inst+'\n'+next_inst+'['+', '.join(text)+']'+'\n'+ending for text in examples]
    return new_prompts
    
text_list_train = list(train['text'])
text_list_test = list(test['text'])

train_new = add_instruct(text_list_train)
test_new = add_instruct(text_list_test)
 

In [281]:
df_train = pd.DataFrame(columns=['text','target'])
df_train['text']=train_new
df_train['target']=train['target']
# df_test = pd.DataFrame(test_new,test['target'])
df_train[:3]

Unnamed: 0,text,target
0,<s><INST>Write a joke from the given list of t...,They say that breakfast is the most important ...
1,<s><INST>Write a joke from the given list of t...,What do you get if you cross an angry sheep wi...
2,<s><INST>Write a joke from the given list of t...,An apple a day keeps the doctor away. At least...


In [283]:
df_train.to_csv('./train.csv',index=0)
df_test.to_csv('./test.csv',index=0)


In [285]:
project_name = 'jokemaker'
model_name = 'mistralai/Mistral-7B-Instruct-v0.2'
push_to_hub = False
# hf_token = "YOUR HF TOKEN"
repo_id = "abhatia/mistral_trained"
learning_rate = 2e-4
num_epochs = 4
batch_size = 1
block_size = 1024
trainer = "sft"
warmup_ratio = 0.1
weight_decay = 0.01
gradient_accumulation = 4
use_fp16 = True
use_peft = True
use_int4 = True
lora_r = 16
lora_alpha = 32
lora_dropout = 0.045

In [286]:
import os
os.environ["PROJECT_NAME"] = project_name
os.environ["MODEL_NAME"] = model_name
os.environ["PUSH_TO_HUB"] = str(push_to_hub)
# os.environ["HF_TOKEN"] = hf_token
os.environ["REPO_ID"] = repo_id
os.environ["LEARNING_RATE"] = str(learning_rate)
os.environ["NUM_EPOCHS"] = str(num_epochs)
os.environ["BATCH_SIZE"] = str(batch_size)
os.environ["BLOCK_SIZE"] = str(block_size)
os.environ["WARMUP_RATIO"] = str(warmup_ratio)
os.environ["WEIGHT_DECAY"] = str(weight_decay)
os.environ["GRADIENT_ACCUMULATION"] = str(gradient_accumulation)
os.environ["USE_FP16"] = str(use_fp16)
os.environ["USE_PEFT"] = str(use_peft)
os.environ["USE_INT4"] = str(use_int4)
os.environ["LORA_R"] = str(lora_r)
os.environ["LORA_ALPHA"] = str(lora_alpha)
os.environ["LORA_DROPOUT"] = str(lora_dropout)

In [None]:
!autotrain llm \
--train \
--model ${MODEL_NAME} \
--project-name ${PROJECT_NAME} \
--data-path data/ \
--text-column text \
--lr ${LEARNING_RATE} \
--batch-size ${BATCH_SIZE} \
--merge_adapter \
--epochs ${NUM_EPOCHS} \
--block-size ${BLOCK_SIZE} \
--warmup-ratio ${WARMUP_RATIO} \
--lora-r ${LORA_R} \
--lora-alpha ${LORA_ALPHA} \
--lora-dropout ${LORA_DROPOUT} \
--weight-decay ${WEIGHT_DECAY} \
--gradient-accumulation ${GRADIENT_ACCUMULATION} \
$( [[ "$USE_FP16" == "True" ]] && echo "--mixed-precision fp16" ) \
$( [[ "$USE_INT4" == "True" ]] && echo "--quantization int4" ) \
$( [[ "$USE_PEFT" == "True" ]] && echo "--use-peft" ) \
$( [[ "$PUSH_TO_HUB" == "True" ]] && echo "--push-to-hub --token ${HF_TOKEN} --repo-id ${REPO_ID}" )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


> [1mINFO    Running LLM[0m
> [1mINFO    Params: Namespace(version=False, text_column='text', rejected_text_column='rejected', prompt_text_column='prompt', model_ref=None, warmup_ratio=0.1, optimizer='adamw_torch', scheduler='linear', weight_decay=0.01, max_grad_norm=1.0, add_eos_token=False, block_size=1024, peft=True, lora_r=16, lora_alpha=32, lora_dropout=0.045, logging_steps=-1, evaluation_strategy='epoch', save_total_limit=1, save_strategy='epoch', auto_find_batch_size=False, mixed_precision='fp16', quantization='int4', model_max_length=1024, trainer='default', target_modules=None, merge_adapter=True, use_flash_attention_2=False, dpo_beta=0.1, chat_template=None, padding=None, train=True, deploy=False, inference=False, username=None, backend='local-cli', token=None, repo_id=None, push_to_hub=False, model='mistralai/Mistral-7B-Instruct-v0.2', project_name='jokemaker', seed=42, epochs=4, gradient_accumulation=4, disable_gradient_checkpointing=False, lr=0.0002, log='none', data_pa

## Inference Time

### Load Model

In [239]:
from peft import PeftModel, PeftConfig
adapter_model ='firstmodel'
base_model_name = 'mistralai/Mistral-7B-Instruct-v0.2'
tokenizer = AutoTokenizer.from_pretrained(adapter_model)
model = AutoModelForCausalLM.from_pretrained(base_model_name)
model = PeftModel.from_pretrained(model, adapter_model)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [250]:
prompt='[hot girls, pollution, sky]'
model_inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
generated_ids = model.generate(**model_inputs, max_new_tokens=20,)
output = tokenizer.batch_decode(generated_ids)[0]   

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [247]:

inputs = tokenizer("[hot girls, pollution, sky]", return_tensors="pt").to('cuda')


with torch.no_grad():
    outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=10)
    


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [289]:
check='jokemaker'
model_custom = AutoModelForCausalLM.from_pretrained(check,torch_dtype=torch.float16).to('cuda')
tokenizer_custom = AutoTokenizer.from_pretrained(check)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

### Joke for topics: [hot girls, pollution, sky]

In [291]:
## Finetuned Mistral
next_inst = '<INST>Write a joke from the given list of topics</INST>\nTopics: '
prompt=next_inst+'[hot girls, pollution, sky]\nJoke: '
model_inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
generated_ids = model.generate(**model_inputs, max_new_tokens=20,)
output = tokenizer.batch_decode(generated_ids)[0]
output

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'<s> <INST>Write a joke from the given list of topics</INST>\nTopics: [hot girls, pollution, sky]\nJoke: \nWhy did the hot girl go to the junkyard?\nTo find a sky-diver'

In [6]:
## General Mistral
next_inst = '<INST>Write a joke from the given list of topics</INST>\nTopics: '
prompt=next_inst+'[hot girls, pollution, sky]\nJoke: '
model_inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
generated_ids = model.generate(**model_inputs, max_new_tokens=100,)
output = tokenizer.batch_decode(generated_ids)[0]
output

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'<s> <INST>Write a joke from the given list of topics</INST>\nTopics: [hot girls, pollution, sky]\nJoke: \nWhy did the hot girl carry an umbrella in the clear sky?\nBecause she was following the instructions on the bottle of pollution! \n(Note: This joke is meant to be a commentary on the sad reality of pollution and how it affects everyone, even the most beautiful among us.)</s>'

### Joke for topics: [students, classroom, food, teacher, strict]

In [301]:
## Finetuned Mistral
next_inst = '<INST>Write a joke from the given list of topics</INST>\nTopics: '
prompt=next_inst+'[students, classroom, food, teacher, strict]\nJoke: '
model_inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
generated_ids = model.generate(**model_inputs, max_new_tokens=100,temperature=0.6, do_sample=True)
output = tokenizer.batch_decode(generated_ids)[0]
output

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


"<s> <INST>Write a joke from the given list of topics</INST>\nTopics: [students, classroom, food, teacher, strict]\nJoke: \nWhy did the teacher tell her student to eat his homework?\nBecause it was a piece of cake, and she knew he couldn't 'bake' it. (Replace 'it' with 'that')</s>"

In [3]:
## General Mistral Performance
next_inst = '<INST>Write a joke from the given list of topics</INST>\nTopics: '
prompt=next_inst+'[students, classroom, food, teacher, strict]\nJoke: '
model_inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
generated_ids = model.generate(**model_inputs, max_new_tokens=100,temperature=0.6, do_sample=True)
output = tokenizer.batch_decode(generated_ids)[0]
output

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


"<s> <INST>Write a joke from the given list of topics</INST>\nTopics: [students, classroom, food, teacher, strict]\nJoke: \nWhy don't teachers ever go hungry at lunchtime in a strict classroom?\nBecause their students are always bringing them sandwiches! (get it, bringing, like they're allowed to bring things to class but in a classroom that's strict, it's usually not allowed) \n\nI hope you found that joke amusing! Let me know if you need help with anything else. 😊</s>"

### Joke for topics: [man, dogs, cats, hot girls]

In [302]:
## Finetuned Mistral
next_inst = '<INST>Write a joke from the given list of topics</INST>\nTopics: '
prompt=next_inst+'[man, dogs, cats, hot girls]\nJoke: '
model_inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
generated_ids = model.generate(**model_inputs, max_new_tokens=100,temperature=0.6, do_sample=True)
output = tokenizer.batch_decode(generated_ids)[0]
output

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'<s> <INST>Write a joke from the given list of topics</INST>\nTopics: [man, dogs, cats, hot girls]\nJoke: \nWhy did the man name his dog "Startled" ?\nBecause it was a \'Shock\' to see him with a \'Hot\' girl and a \'Barking\' dog!</s>'

In [4]:
## General Mistal

next_inst = '<INST>Write a joke from the given list of topics</INST>\nTopics: '
prompt=next_inst+'[man, dogs, cats, hot girls]\nJoke: '
model_inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
generated_ids = model.generate(**model_inputs, max_new_tokens=100,temperature=0.6, do_sample=True)
output = tokenizer.batch_decode(generated_ids)[0]
output

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'<s> <INST>Write a joke from the given list of topics</INST>\nTopics: [man, dogs, cats, hot girls]\nJoke: \nWhy did the man name his dog "Hot Stuff"?\nBecause he wanted a little "bark" in his life, but when he got home, he found his cat had named his new kitten "Hotter Stuff"!\n\nSo now the man had a dilemma, two pets with the same name! He decided to call the dog "Hot Rod" instead, and the cat\'s kitten became "Hot Tuna". From that day on, the'

### Jokes by General Mistral

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
check='mistralai/Mistral-7B-Instruct-v0.2'
model = AutoModelForCausalLM.from_pretrained(check,torch_dtype=torch.float16).to('cuda')
tokenizer = AutoTokenizer.from_pretrained(check)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]