In [126]:
import torch
from torch.utils.data import  DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BlenderbotForCausalLM,AutoTokenizer
from datasets import Dataset,DatasetDict
from peft import LoraConfig,TaskType,get_peft_model

In [97]:
# Read the book text as a list of lines; each element keeps its trailing '\n'.
# errors='ignore' drops undecodable bytes instead of raising.
with open(
    '../..//dataStuff/bookStuff/osBookDoc.txt',
    mode='r',
    encoding='utf-8',
    errors='ignore',
) as book_file:
    s = list(book_file)

In [98]:
# Echo the raw lines as the cell output (the whole list is rendered, not a preview).
s

['\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 'OPERATING SYSTEM CONCEPTS\n',
 'NINTH EDITION\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 'OPERATING SYSTEM CONCEPTS\n',
 'ABRAHAM SILBERSCHATZ\n',
 'Yale University\n',
 '\n',
 'PETER BAER GALVIN\n',
 '\n',
 'Vice President and Executive Publisher\tDon Fowley\n',
 'Executive Editor\tBeth Lang Golub\n',
 'Editorial Assistant\tKatherine Willis\n',
 'Executive Marketing Manager\tChristopher Ruel\n',
 'Senior Production Editor\tKen Santor\n',
 'Cover and title page illustrations\tSusan Cyr\n',
 'Cover Designer\tMadelyn Lesure\n',
 'Text Designer\tJudy Allan\n',
 '\n',
 '\n',
 '\n',
 '\n',
 'This book was set in Palatino by the author using LaTeX and printed and bound by Courier- Kendallville. The cover was printed by Courier.\n',
 '\n',
 'Copyright  2013, 2012, 2008 John Wiley & Sons, Inc. All rights reserved.\n',
 '\n',
 '\n',
 'No part of this publication may be reproduced, stored in a retrieval system or transmi

In [99]:
# Discard the first 706 lines of the file and keep everything after them.
# The cell is not idempotent: running it again removes a further 706 lines,
# so execute it exactly once after loading the file.
lines_to_skip = 706
s = s[lines_to_skip:]

In [100]:
# Line count after the leading 706 lines were dropped.
len(s)

20409

In [101]:

# Keep every line that is not exactly '\n'. A single filtering pass avoids
# repeated list.remove() calls (each one rescans the list from the start)
# and needs no exception handling to detect "nothing left to remove".
# Lines that contain other whitespace besides the newline are kept.
s = [line for line in s if line != '\n']

In [102]:
# Line count once the newline-only lines have been removed.
len(s)

12045

In [103]:
# Show the cleaned lines (renders the full list as the cell output).
s

['   Internally, operating systems vary greatly in their makeup, since they are organized along many different lines. The design of a new operating system is a major task. It is important that the goals of the system be well defined before the design begins. These goals form the basis for choices among various algorithms and strategies.\n',
 '   Because an operating system is large and complex, it must be created piece by piece. Each of these pieces should be a well-delineated portion of the system, with carefully defined inputs, outputs, and functions.\n',
 'Introduction\n',
 'An operating system is a program that manages a computers hardware. It also provides a basis for application programs and acts as an intermediary between the computer user and the computer hardware. An amazing aspect of operating systems is how they vary in accomplishing these tasks. Mainframe operating systems are designed primarily to optimize utilization of hardware. Personal computer (PC) operating systems s

In [104]:
# Single-column mapping: every remaining book line becomes one "context" entry.
dataDict = dict(context=s)

In [105]:
# Tabular view of the lines: one row per line, one column named "context".
data = pd.DataFrame(data=dataDict)

In [106]:
# Display the one-column DataFrame built from the book lines.
data

Unnamed: 0,context
0,"Internally, operating systems vary greatly ..."
1,Because an operating system is large and co...
2,Introduction\n
3,An operating system is a program that manages ...
4,Before we can explore the details of compu...
...,...
12040,X\n
12041,"x86-64 architecture, 387\n"
12042,"Xen, 714\n"
12043,Z\n


In [107]:
# Shuffle and split the rows into training and validation frames.
# test_size=0.25 spells out the library default that applies when neither
# size is given; random_state pins the shuffle so the split is repeatable.
train, val = train_test_split(
    data,
    test_size=0.25,
    random_state=42,
)

In [108]:
# Load the model weights and the matching tokenizer from the same checkpoint.
# NOTE(review): the checkpoint's config lists "BlenderbotForConditionalGeneration"
# as its architecture, while it is loaded here through BlenderbotForCausalLM —
# confirm that using only this class is intended.
checkpoint = 'facebook/blenderbot-400M-distill'
model = BlenderbotForCausalLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [109]:
def prep(examples):
    """Tokenize a batch of book lines; meant to be passed to `Dataset.map`.

    Args:
        examples: Mapping with a "context" entry holding the text(s) to encode.

    Returns:
        Whatever the module-level `tokenizer` returns for those texts, with
        each sequence cut to at most 256 tokens.
    """
    # truncation=True makes the 256-token cap explicit; passing max_length on
    # its own relies on the tokenizer's implicit fallback and logs a warning.
    # NOTE(review): confirm 256 does not exceed the model's positional limit
    # (model.config.max_position_embeddings) before training.
    return tokenizer(examples["context"], max_length=256, truncation=True)

In [112]:
# Rebinds `data` from the pandas DataFrame to a `datasets.Dataset`; after this
# cell the DataFrame is no longer reachable under that name.
# NOTE(review): this cell's execution count (112) is later than the cell below
# it (110), and `data` is not referenced again in the visible cells — confirm
# the conversion is still needed.
data=Dataset.from_pandas(data)

In [110]:
# Convert each pandas split into a `datasets.Dataset` and bundle them under
# the split names "train" and "validation". The shuffled DataFrame index is
# carried along as the extra "__index_level_0__" column.
traindata = Dataset.from_pandas(train)
valdata = Dataset.from_pandas(val)

splits = {'train': traindata, 'validation': valdata}
data_dict = DatasetDict(splits)

In [111]:
# Show the splits with their column names and row counts.
data_dict

DatasetDict({
    train: Dataset({
        features: ['context', '__index_level_0__'],
        num_rows: 9033
    })
    validation: Dataset({
        features: ['context', '__index_level_0__'],
        num_rows: 3012
    })
})

In [117]:
# Tokenize both splits in batches of 32. The original columns are removed so
# that only the tokenizer's outputs remain in the resulting datasets.
source_columns = data_dict["train"].column_names
token_dataset = data_dict.map(
    prep,
    batched=True,
    batch_size=32,
    remove_columns=source_columns,
)

Map: 100%|██████████| 9033/9033 [00:02<00:00, 3524.37 examples/s]
Map: 100%|██████████| 3012/3012 [00:00<00:00, 3857.28 examples/s]


In [118]:
# Show the tokenized splits with their remaining features and row counts.
token_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 9033
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3012
    })
})

In [119]:
# Inspect the configuration of the loaded model.
model.config

BlenderbotConfig {
  "_name_or_path": "facebook/blenderbot-400M-distill",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": true,
  "architectures": [
    "BlenderbotForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1280,
  "decoder_attention_heads": 32,
  "decoder_ffn_dim": 5120,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 1,
  "do_blenderbot_90_layernorm": true,
  "dropout": 0.1,
  "encoder_attention_heads": 32,
  "encoder_ffn_dim": 5120,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 2,
  "encoder_no_repeat_ngram_size": 3,
  "eos_token_id": 2,
  "extra_layer_norm": false,
  "extra_pos_embeddings": 0,
  "force_bos_token_to_be_generated": false,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
 

In [128]:
# LoRA adapter settings for the causal-LM model.
lora_config = LoraConfig(
    r=64,            # rank of the low-rank update matrices
    lora_alpha=32,   # scaling numerator: the update is scaled by lora_alpha / r
    # Only names that exist in the wrapped model are listed. Entries pointing
    # at "model.shared" and "model.encoder.*" were dropped because they match
    # nothing: with r=64 and d_model=1280 the reported 327,680 trainable
    # parameters equal exactly two adapted 1280x1280 projections
    # (2 * 64 * (1280 + 1280)), i.e. only the two modules below.
    # NOTE(review): this adapts decoder layer 0 only. To cover every decoder
    # layer, list module-name suffixes such as "q_proj" / "v_proj" instead and
    # check which modules they match before training.
    target_modules=[
        "model.decoder.layers.0.self_attn.q_proj",
        "model.decoder.layers.0.self_attn.v_proj",
    ],
    lora_dropout=0.05,             # dropout applied inside the LoRA layers
    bias="none",                   # bias handling: "none", "all" or "lora_only"
    task_type=TaskType.CAUSAL_LM,  # wrap the model as a causal language model
)


In [129]:
# Wrap the base model with the LoRA adapters described by `lora_config` and
# rebind `model` to the wrapped object.
# NOTE(review): re-running this cell passes the already-wrapped model back
# into get_peft_model; reload the base model first if the cell must be re-run.
model=get_peft_model(model,lora_config)

In [130]:
# Report trainable vs. total parameter counts for the adapter-wrapped model.
model.print_trainable_parameters()

trainable params: 327,680 || all params: 325,608,960 || trainable%: 0.1006
