## Download a model from Hugging Face to store locally
Gated models require accepting their license agreement through the Hugging Face web interface before they can be downloaded.

In [None]:
import logging
import os
import warnings

from transformers import AutoModelForCausalLM, AutoTokenizer

# Silence Python warnings and all log records at WARNING level or below for
# the rest of the kernel session (kept from the original cell).
warnings.simplefilter('ignore')
logging.disable(logging.WARNING)

model_nm = 'openai-community/gpt2-large'
# The saved copy is used for text generation in the cells below, so download
# it with the causal-LM class. The sequence-classification class would attach
# a classification head that is not part of the checkpoint and save it into
# the local copy.
model = AutoModelForCausalLM.from_pretrained(model_nm, return_dict=True)
tokenizer = AutoTokenizer.from_pretrained(model_nm)

save_path = 'models/gpt2-large'
# Create the target directory, including the parent `models/`, and tolerate
# re-runs: a plain `mkdir` fails when the parent is missing or the directory
# already exists.
os.makedirs(save_path, exist_ok=True)
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

# Show what was written so the reader can confirm the save succeeded.
sorted(os.listdir(save_path))


## Basic interaction loading a local model 

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the model and tokenizer from the local directory instead of the Hub.
save_path = 'models/gpt2-large'
model = AutoModelForCausalLM.from_pretrained(save_path, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(save_path)

# Send the inputs to whichever device `device_map="auto"` selected for the
# model. A hard-coded "mps" only works on Apple Silicon and breaks on
# CUDA- or CPU-only machines.
model_inputs = tokenizer(["The secret to baking a good cake is "], return_tensors="pt").to(model.device)

# max_length bounds the total sequence length passed to generate().
generated_ids = model.generate(**model_inputs, max_length=30)

# Decode the single generated sequence back to text; shown as the cell output.
tokenizer.batch_decode(generated_ids)[0]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'The secret to baking a good cake is \xa0to use a good quality cake mix. I use the same cake mix that I use for my cakes'

## Basic interaction using pipeline with local model

In [5]:
from transformers import pipeline, set_seed

# Build a text-generation pipeline directly from the locally saved model directory.
generator = pipeline(task='text-generation', model='models/gpt2-large')

# Fix the random seed before generating (presumably so the sampled
# completions are repeatable between runs).
set_seed(42)

prompt = "Hello, I'm a language model,"
# Ask for five completions of the same prompt; the resulting list is the cell output.
generator(
    prompt,
    max_length=30,
    num_return_sequences=5,
)

Device set to use mps:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, I'm a language model, and my project will get better with time, but I'm unsure how to express a language model. Should I"},
 {'generated_text': "Hello, I'm a language model, not a language. The question is do you give a good enough answer in the documentation to suggest it work with"},
 {'generated_text': "Hello, I'm a language model, and I'm just a language model. I am not like John Resig. No you won't believe what"},
 {'generated_text': "Hello, I'm a language model, I have a dictionary and I use it to do work and I don't care about memory. It gives me"},
 {'generated_text': "Hello, I'm a language model, not an implementation.\n\nYou can imagine how it's hard to see what's really happening, since we"}]

## Basic interaction method can vary from model to model 

In [1]:
# Chat-style generation with the Llama 3.1 8B Instruct model, which was
# downloaded manually from Hugging Face into the local models/ directory.
import transformers
import torch

model_id = "models/llama-3.1-8b-Instruct"

# Conversation passed to the pipeline as a list of role/content messages.
messages = [
    {"role": "system", "content": "You are a French maid, respond and converse in French."},
    {"role": "user", "content": "Comment allez vous?"},
]

# Load the weights in bfloat16 and let device_map="auto" choose the device.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

outputs = pipeline(messages, max_new_tokens=256)

# The last entry of "generated_text" is taken as the model's reply; print
# only its text content.
myOutput = outputs[0]["generated_text"][-1]
print(myOutput["content"])

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:26<00:00,  6.67s/it]
Device set to use mps
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Je vais bien, merci. Et vous, comment allez-vous? J'ai juste fini de faire le ménage et je vais commencer à préparer le déjeuner. Qu'est-ce que vous aimeriez manger aujourd'hui?
