In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [8]:
# model = "GeneZC/MiniChat-3B"  # I could not get this to work with Apple
model = "vihangd/shearedplats-2.7b-v2"

In [15]:
pipe = pipeline("text-generation", model=model, device="cpu")

Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

In [18]:
pipe(
    "What is the answer to the universe, life and everything else?", 
    do_sample=True,
    temperature=0.5,
    max_length=64,
)

[{'generated_text': 'What is the answer to the universe, life and everything else?\n\n### [Question 2: What is the answer to the question "What is the answer to the universe, life and everything else?"? Answer\n\n### [Question 3: What is the answer to the question "What is'}]

## Sidenote

At least for macOS, the MPS device is not necessarily better, even for 1000-rank matrices. 
See https://github.com/pytorch/pytorch/issues/77799:

In [14]:
import timeit

a_cpu = torch.rand(1000, device='cpu')
b_cpu = torch.rand((1000, 1000), device='cpu')
a_mps = torch.rand(1000, device='mps')
b_mps = torch.rand((1000, 1000), device='mps')
 
print('cpu', timeit.timeit(lambda: a_cpu @ b_cpu, number=100_000))
print('mps', timeit.timeit(lambda: a_mps @ b_mps, number=100_000))

print("More parallel")
print('cpu', timeit.timeit(lambda: b_cpu @ b_cpu, number=10_000))
print('mps', timeit.timeit(lambda: b_mps @ b_mps, number=10_000))

print("Even bigger!")
b_cpu = torch.rand((10000, 10000), device='cpu')
b_mps = torch.rand((10000, 10000), device='mps')

print('cpu', timeit.timeit(lambda: b_cpu @ b_cpu, number=100))
print('mps', timeit.timeit(lambda: b_mps @ b_mps, number=100))

cpu 2.0286582079716027
mps 4.238669167039916
More parallel
cpu 9.15772208396811
mps 4.447314416989684
Even bigger!
cpu 85.16490350000095
mps 16.852848957991228
