In [1]:
import torch
import transformers

model = transformers.AutoModelForCausalLM.from_pretrained(
  'mosaicml/mpt-7b',
  trust_remote_code=True,
  torch_dtype=torch.bfloat16,
)
model.eval()
model.to("cuda:0")

  from .autonotebook import tqdm as notebook_tqdm
Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:08<00:00,  4.36s/it]


MPTForCausalLM(
  (transformer): MPTModel(
    (wte): Embedding(50432, 4096)
    (emb_drop): Dropout(p=0, inplace=False)
    (blocks): ModuleList(
      (0): MPTBlock(
        (norm_1): LPLayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (Wqkv): Linear(in_features=4096, out_features=12288, bias=False)
          (out_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (norm_2): LPLayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (ffn): MPTMLP(
          (up_proj): Linear(in_features=4096, out_features=16384, bias=False)
          (act): GELU(approximate='none')
          (down_proj): Linear(in_features=16384, out_features=4096, bias=False)
        )
        (resid_attn_dropout): Dropout(p=0, inplace=False)
        (resid_ffn_dropout): Dropout(p=0, inplace=False)
      )
      (1): MPTBlock(
        (norm_1): LPLayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttent

In [2]:
!nvidia-smi

Sat May  6 20:58:09 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.80.02    Driver Version: 450.80.02    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:00:1B.0 Off |                    0 |
| N/A   47C    P0    66W / 300W |  13496MiB / 16160MiB |     26%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:00:1C.0 Off |                    0 |
| N/A   43C    P0    37W / 300W |      3MiB / 16160MiB |      0%      Default |
|       

In [3]:
model_size = sum(t.numel() for t in model.parameters())
print(f"Modelsize: {model_size/1000**2:.1f}M parameters")

Modelsize: 6649.3M parameters


In [3]:
tokenizer = transformers.AutoTokenizer.from_pretrained('mosaicml/mpt-7b')

In [4]:
txt = """\
image_path = "/mnt/image.png"

# load image
"""
tokenized_example = tokenizer(txt, return_tensors='pt')

In [72]:
tokenized_example['input_ids']

tensor([[ 5695,    64,  3967,   426, 13357,    78,  2649,    16,  5695,    15,
          8567,     3,   187,   187,     4,  3301,  2460,   187]])

In [5]:
outputs = model.generate(tokenized_example['input_ids'].to('cuda:0'), max_new_tokens=150, do_sample=False, top_k=5, top_p=0.95)


In [6]:
answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(answer[0].rstrip())

image_path = "/mnt/image.png"

# load image
image = cv2.imread(image_path)

# convert to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# blur
blur = cv2.GaussianBlur(gray, (5, 5), 0)

# find edges
edged = cv2.Canny(blur, 50, 150)

# find contours
contours, hierarchy = cv2.findContours(edged, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

# draw contours
for cnt in contours:
    cv2.drawContours


In [7]:
answer_text = """\
from PIL import Image

image = Image.read(image_path)
"""
tokenized_answer = tokenizer.encode(answer_text ,return_tensors='pt')

In [8]:
outputs = model(**tokenized_example.to("cuda:0"))

In [11]:
outputs.logits.shape

torch.Size([1, 18, 50432])

In [12]:
tokenized_example['input_ids'].shape

torch.Size([1, 18])

In [9]:
last_token_output = outputs.logits[0,-1].view(1,-1)
last_token_output.shape

torch.Size([1, 50432])

In [17]:
torch.argmax(last_token_output)

tensor(5695, device='cuda:0')

In [14]:
tokenized_answer.shape
labels = tokenized_answer[0][0].view(1)
labels

tensor([4064])

In [10]:
for name, param in model.named_parameters():
    print(f"{name}   Modelsize: {param.numel()/1000**2:.1f}M parameters")
    if "31" not in name:
        param.requires_grad = False
    print(name, param.requires_grad)

transformer.wte.weight   Modelsize: 206.6M parameters
transformer.wte.weight False
transformer.blocks.0.norm_1.weight   Modelsize: 0.0M parameters
transformer.blocks.0.norm_1.weight False
transformer.blocks.0.attn.Wqkv.weight   Modelsize: 50.3M parameters
transformer.blocks.0.attn.Wqkv.weight False
transformer.blocks.0.attn.out_proj.weight   Modelsize: 16.8M parameters
transformer.blocks.0.attn.out_proj.weight False
transformer.blocks.0.norm_2.weight   Modelsize: 0.0M parameters
transformer.blocks.0.norm_2.weight False
transformer.blocks.0.ffn.up_proj.weight   Modelsize: 67.1M parameters
transformer.blocks.0.ffn.up_proj.weight False
transformer.blocks.0.ffn.down_proj.weight   Modelsize: 67.1M parameters
transformer.blocks.0.ffn.down_proj.weight False
transformer.blocks.1.norm_1.weight   Modelsize: 0.0M parameters
transformer.blocks.1.norm_1.weight False
transformer.blocks.1.attn.Wqkv.weight   Modelsize: 50.3M parameters
transformer.blocks.1.attn.Wqkv.weight False
transformer.blocks.1.a

In [60]:
params = sum(t.numel() for t in model.transformer.blocks[-1].parameters())
print(f"Modelsize: {params/1000**2:.1f}M parameters")

Modelsize: 201.3M parameters


In [11]:
lossfct = torch.nn.CrossEntropyLoss()
optimizer = transformers.AdamW(model.parameters(), lr=5e-5)



In [12]:
labels = tokenized_answer[0][0].view(1)
loss = lossfct(last_token_output,labels.to("cuda:0"))

In [13]:
loss.item()

3.671875

In [14]:
model.train()
loss.backward()
optimizer.step()
optimizer.zero_grad(set_to_none=True)


In [15]:
outputs = model.generate(tokenized_example['input_ids'].to('cuda:0'), max_new_tokens=50, do_sample=False, top_k=5, top_p=0.95)


In [16]:
answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(answer[0].rstrip())

image_path = "/mnt/image.png"

# load image
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
from PIL import ImageFilter
from PIL import ImageEnhance
from PIL import ImageOps
from PIL import ImageChops
from PIL import ImageColor
from PIL import ImageTk
from PIL import ImageGrab
from PIL import ImageSequence
from PIL import ImageSequenceClip
from PIL import ImageSequenceClipDraw
from PIL import ImageSequenceClipDraw
from PIL import ImageSequenceClipDraw
from PIL import ImageSequenceClipDraw
from PIL import ImageSequenceClipDraw
from PIL import ImageSequenceClipDraw
from PIL import ImageSequenceClip
