# 1 Load the model

In [1]:
import torch
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel


tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1", use_auth_token="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")
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    load_in_4bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
    use_auth_token="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"
)

  from .autonotebook import tqdm as notebook_tqdm
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 19/19 [02:05<00:00,  6.59s/it]


# 2 Prepare the dataset

In [2]:
# from datasets import load_dataset

# dataset = load_dataset("TeeZee/dolly-15k-pirate-speech")

# train_data = dataset["train"].select(range(4000))

In [3]:
# filtered_train_data = [item for item in train_data if item.category in ["summarization", "information_extraction", "closed_qa"]]

# filtered_train_data[1]

# filtered_dataset = dataset.filter(
#     lambda item: item["category"]
#     in ["summarization", "information_extraction", "closed_qa"]
# )

# filtered_train_data = filtered_dataset["train"].select(range(400))

# Prepare the training prompts

In [4]:
# def generate_prompt(joke):
#     sys_mes = "Give me a punchline for this joke: "

#     question = str(joke["question"]) if joke["question"] is not None else ""
#     response = str(joke["response"]) if joke["response"] is not None else ""
#     return "<s> [INST]" + sys_mes + "\n" + question + "[/INST]" + response + "</s>"


def generate_pirate_prompt(item):
    story = item["context"]
    pirate_story = item["response"]
    sys_mes = "Convert this story to pirate language: "
    return "<s> [INST]" + sys_mes + story + " [/INST] " + pirate_story + " </s>"

def generate_glaswegian_prompt(english, glaswegian):
    sys_mes = "Convert this to the Glaswegian language: "
    return "<s> [INST]" + sys_mes + english + " [/INST] " + glaswegian + " </s>"


def tokenize(prompt):
    return tokenizer(
        prompt + tokenizer.eos_token,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding="max_length",
    )


CUTOFF_LEN = 256  # Our dataset has shot text
LORA_R = 8
LORA_ALPHA = 2 * LORA_R
LORA_DROPOUT = 0.1

In [5]:
# tokenizer.pad_token = tokenizer.eos_token

# train_data_prompts = filtered_train_data.map(
#     lambda x: tokenize(generate_pirate_prompt(x)),
#     remove_columns=["instruction", "context", "response", "category"],
# )

# print(tokenizer.decode(train_data_prompts[0]["input_ids"], skip_special_tokens=True))

# 4 Train the model

In [24]:
def inference(input):
    sys_msg = "<s> [INST ] Convert this story to glaswegian language: "
    prompt = f"{sys_msg} {input} [/INST]"
    
    print(prompt)

    with torch.no_grad():
        input_ids = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
        outputs = model.generate(input_ids, max_length=150)

    notes = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return notes

def glasgow_inference(input):
    sys_msg = "<s> [INST] Convert this story to glaswegian language: "
    prompt = f"{sys_msg} {input} [/INST]"
    
    print(prompt)

    with torch.no_grad():
        input_ids = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
        outputs = model.generate(input_ids, max_length=200)

    notes = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return notes

In [7]:
model.eval()

print(glasgow_inference("I had 4 bottles of rum and now I don't, did somebody else drink it?"))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST ] Convert this story to glaswegian language:  I had 4 bottles of rum and now I don't, did somebody else drink it? [/INST]




 [INST ] Convert this story to glaswegian language:  I had 4 bottles of rum and now I don't, did somebody else drink it? [/INST] Here's the story in Glaswegian language:

Ah huvet hud four bottles o' rum an' noo Ah dinnae, did some yin else dae it?

Translation:

I had four bottles of rum and now I don't, did someone else drink it?

Note: Glaswegian language, also known as Scots language, has many variations and regional dialects. The above example is a simplified version of Glaswegian language and may not be fully accurate or representative of all Glaswegian dialects.


In [8]:
# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["w1", "w2", "w3"],  # just targetting the MoE layers.
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)

In [9]:
# model = get_peft_model(model, config)

# trainer = Trainer(
#     model=model,
#     train_dataset=train_data_prompts,
#     args=TrainingArguments(
#         per_device_train_batch_size=20,
#         gradient_accumulation_steps=4,
#         num_train_epochs=6,
#         learning_rate=1e-4,
#         logging_steps=2,
#         optim="adamw_torch",
#         save_strategy="epoch",
#         output_dir="mixtral-moe-lora-instruct-shapeskeare",
#     ),
#     data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
# )
# model.config.use_cache = False

# trainer.train()

In [10]:
# model.eval()

# print(inference("I had 4 bottles of rum and now I don't, did somebody else drink it?"))

In [11]:
def read_file(filename):
    with open(filename, 'r') as file:
        return file.readlines()

def get_sentences():
    file1_name = '../english.txt'
    file2_name = '../glaswegian.txt'

    try:
        # Read file 1
        file1_content = read_file(file1_name)
        # Convert content of file 1 into an array split by newline
        english_sentences = [line.strip() for line in file1_content]

        # Read file 2
        file2_content = read_file(file2_name)
        # Convert content of file 2 into an array split by newline
        glaswegian_sentences = [line.strip() for line in file2_content]
        
    except FileNotFoundError:
        print("One or both files not found.")
        
    return english_sentences, glaswegian_sentences

english_sentences, glaswegian_sentences = get_sentences()

prompts = []
for i in range(len(english_sentences)):
    prompt = generate_glaswegian_prompt(english_sentences[i], glaswegian_sentences[i])    
    prompts.append(prompt)

In [12]:
tokenizer.pad_token = tokenizer.eos_token
prompts_tokenized = [tokenize(x) for x in prompts]


In [13]:
model = get_peft_model(model, config)

trainer = Trainer(
    model=model,
    train_dataset=prompts_tokenized,
    args=TrainingArguments(
        per_device_train_batch_size=20,
        gradient_accumulation_steps=4,
        num_train_epochs=6,
        learning_rate=1e-4,
        logging_steps=2,
        optim="adamw_torch",
        save_strategy="epoch",
        output_dir="mixtral-moe-lora-instruct-shapeskeare",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False

trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
2,5.2574
4,3.5958
6,2.5247
8,1.9021
10,1.6396
12,1.4942
14,1.3964
16,1.3601
18,1.2964
20,1.2614



Cannot access gated repo for url https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1/resolve/main/config.json.
Repo model mistralai/Mixtral-8x7B-Instruct-v0.1 is gated. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in mistralai/Mixtral-8x7B-Instruct-v0.1.

Cannot access gated repo for url https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1/resolve/main/config.json.
Repo model mistralai/Mixtral-8x7B-Instruct-v0.1 is gated. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in mistralai/Mixtral-8x7B-Instruct-v0.1.

Cannot access gated repo for url https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1/resolve/main/config.json.
Repo model mistralai/Mixtral-8x7B-Instruct-v0.1 is gated. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in mistralai/Mixtral-8x7B-Instruct-v0.1.

Cannot access gated repo for url https://huggingface.c

TrainOutput(global_step=30, training_loss=1.8559503078460693, metrics={'train_runtime': 2903.599, 'train_samples_per_second': 0.874, 'train_steps_per_second': 0.01, 'total_flos': 1.6600427274436608e+17, 'train_loss': 1.8559503078460693, 'epoch': 5.454545454545454})

In [20]:
model.eval()

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST] Convert this story to glaswegian language:  What time is our social booked for? [/INST]
 [INST] Convert this story to glaswegian language:  What time is our social booked for? [/INST] Whit time's oor social booked fur? 

This is a rough translation of the sentence "What time is our social booked for?" into Glaswegian language. Glaswegian, also known as the Glasgow patter or the Glasgow dialect, is a form of Scottish English spoken in Glasgow and its surrounding areas. It is


In [25]:
print(glasgow_inference("The quick brown fox jumped over the lazy dog"))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST] Convert this story to glaswegian language:  The quick brown fox jumped over the lazy dog [/INST]
 [INST] Convert this story to glaswegian language:  The quick brown fox jumped over the lazy dog [/INST] The quick brown fox jumped ower the lazy dog. 

Just a quick translation, but here are some common Glaswegian phrases and words that could be used in this sentence:

* Quick - could be replaced with "gallus" or "skelped"
* Jumped - could be replaced with "skipped" or "bounced"
* Over - could be replaced with "doon" or "oot"
* Lazy - could be replaced with "feart" or "dozy"

So a more Glaswegian version of the sentence could be:

The gallus brown fox skipped doon the feart dog.

But remember, Glaswegian language is not just about replacing words, it's also about the accent and inton
