In [1]:
!pip install -r requirements.txt

Collecting accelerate==0.21.0 (from -r requirements.txt (line 1))
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft==0.4.0 (from -r requirements.txt (line 2))
  Downloading peft-0.4.0-py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes==0.40.2 (from -r requirements.txt (line 3))
  Downloading bitsandbytes-0.40.2-py3-none-any.whl (92.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.31.0 (from -r requirements.txt (line 4))
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hColle

In [2]:
import json
import re
import csv
from pprint import pprint

import pandas as pd
import torch
from datasets import Dataset, load_dataset
from huggingface_hub import notebook_login
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "meta-llama/Llama-2-7b-hf"

In [3]:

# Load JSON data
with open('data.json', 'r') as json_file:
    json_data = json.load(json_file)

# Define CSV file and column headers
csv_file = 'data.csv'
headers = ['Subject', 'Conversation', 'New Ideas']

# Write data to CSV file
with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(headers)

    for subject, values in json_data.items():
        conversation = '\n'.join(values['conversation'])
        new_ideas = '\n'.join(values['new_ideas'])
        writer.writerow([subject, conversation, new_ideas])

print("CSV file created successfully.")

CSV file created successfully.


In [4]:
dataset = load_dataset("csv", data_files={"train": ["data.csv"]})
dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Subject', 'Conversation', 'New Ideas'],
        num_rows: 9
    })
})

In [5]:
DEFAULT_SYSTEM_PROMPT = """
below is discussion about a specific topic. you have to understand the context of the debate and generate relevant and stimulating contributions. present new points of view and delve deeper into the topics under debate. Analyze the ongoing discussion, identify areas that lack depth or breadth, and generate insightful questions or statements to enrich the discussion.
""".strip()


def generate_training_prompt(
    conversation: str, new_ideas: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    return f"""### Instruction: {system_prompt}

### Input:
{conversation.strip()}

### Response:
{new_ideas}
""".strip()

In [6]:
def clean_text(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@[^\s]+", "", text)
    text = re.sub(r"\s+", " ", text)
    return re.sub(r"\^[^ ]+", "", text)

def create_conversation_text(data_point):
    conversation = data_point["Conversation"]
    messages = conversation.split(" Speaker ")

    # Format each message with newline
    formatted_conversation = '\n'.join(messages)

    return formatted_conversation

In [7]:
def generate_text(data_point):
    new_ideas = data_point["New Ideas"]
    conversation_text = create_conversation_text(data_point)
    return {
        "conversation": conversation_text,
        "new_ideas": new_ideas,
        "text": generate_training_prompt(conversation_text, new_ideas),
    }

In [8]:
example = generate_text(dataset["train"][0])
example

{'conversation': "A: We should abandon television because it promotes mindless consumption and passive entertainment, leading to a decline in critical thinking skills.\nB: While it's true that television can have negative effects, it also serves as a valuable source of information, entertainment, and cultural exchange.",
 'new_ideas': 'One alternative approach could be to reform television programming to prioritize educational content and meaningful storytelling. By promoting critical thinking and fostering a deeper understanding of complex issues, television can become a powerful tool for positive social change.\nAdditionally, with advancements in technology, we can leverage interactive and personalized viewing experiences to engage viewers in more meaningful ways, encouraging active participation and learning.\nMoreover, television has the potential to reach diverse audiences, including those with limited access to other forms of media. This makes it a valuable medium for disseminati

In [9]:
def process_dataset(data: Dataset):
    return (
        data.shuffle(seed=42)
        .map(generate_text)
    )

In [10]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
def create_model_and_tokenizer():
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        # load_in_8bit_fp32_cpu_offload=True
    )

    # device_map = {
    #     "transformer.wte": 0,
    #     "transformer.wpe": 0,
    #     "transformer.ln_f": 0,
    #     "lm_head": 0,
    #     "transformer.h.0": 0,
    #     "transformer.h.1": 0,
    #     "transformer.h.2": 0,
    #     "transformer.h.3": 0,
    #     "transformer.h.4": 0,
    #     "transformer.h.5": 0,
    #     "transformer.h.6": 0,
    #     "transformer.h.7": 0,
    #     "transformer.h.8": 0,
    #     "transformer.h.9": 0,
    #     "transformer.h.10": 0,
    #     "transformer.h.11": 0
    # }

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        use_safetensors=True,
        quantization_config=bnb_config,
        trust_remote_code=True,
        device_map="auto",
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return model, tokenizer

In [13]:
model, tokenizer = create_model_and_tokenizer()
model.config.use_cache = False

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [14]:
model.config.quantization_config.to_dict()

{'load_in_8bit': False,
 'load_in_4bit': True,
 'llm_int8_threshold': 6.0,
 'llm_int8_skip_modules': None,
 'llm_int8_enable_fp32_cpu_offload': False,
 'llm_int8_has_fp16_weight': False,
 'bnb_4bit_quant_type': 'nf4',
 'bnb_4bit_use_double_quant': False,
 'bnb_4bit_compute_dtype': 'float16'}

In [15]:
lora_r = 16
lora_alpha = 64
lora_dropout = 0.1
lora_target_modules = [
    "q_proj",
    "up_proj",
    "o_proj",
    "k_proj",
    "down_proj",
    "gate_proj",
    "v_proj",
]


peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none",
    task_type="CAUSAL_LM",
)

In [16]:
OUTPUT_DIR = "experiments"

%load_ext tensorboard
%tensorboard --logdir experiments/runs

<IPython.core.display.Javascript object>

In [17]:

training_arguments = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    logging_steps=1,
    learning_rate=1e-4,
    fp16=True,
    max_grad_norm=0.3,
    num_train_epochs=2,
    evaluation_strategy="steps",
    eval_steps=0.2,
    warmup_ratio=0.05,
    save_strategy="epoch",
    group_by_length=True,
    output_dir=OUTPUT_DIR,
    report_to="tensorboard",
    save_safetensors=True,
    lr_scheduler_type="cosine",
    seed=42,
)

In [21]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["train"],
    peft_config=peft_config,
    dataset_text_field="New Ideas",
    max_seq_length=4096,
    tokenizer=tokenizer,
    args=training_arguments,
)

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

In [22]:
trainer.train()

Step,Training Loss,Validation Loss
1,1.119,1.451543
2,0.346,1.31824




TrainOutput(global_step=2, training_loss=0.7324756383895874, metrics={'train_runtime': 21.5444, 'train_samples_per_second': 0.835, 'train_steps_per_second': 0.093, 'total_flos': 58933469011968.0, 'train_loss': 0.7324756383895874, 'epoch': 1.33})

In [23]:
trainer.save_model()


In [24]:
trainer.model


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False

In [None]:
from peft import AutoPeftModelForCausalLM

trained_model = AutoPeftModelForCausalLM.from_pretrained(
    OUTPUT_DIR,
    low_cpu_mem_usage=True,
)

merged_model = model.merge_and_unload()
merged_model.save_pretrained("merged_model", safe_serialization=True)
tokenizer.save_pretrained("merged_model")

In [25]:
dataset2 = load_dataset("Salesforce/dialogstudio", "TweetSumm")
dataset2

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/18.3k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

DatasetDict({
    train: Dataset({
        features: ['original dialog id', 'new dialog id', 'dialog index', 'original dialog info', 'log', 'prompt'],
        num_rows: 879
    })
    validation: Dataset({
        features: ['original dialog id', 'new dialog id', 'dialog index', 'original dialog info', 'log', 'prompt'],
        num_rows: 110
    })
    test: Dataset({
        features: ['original dialog id', 'new dialog id', 'dialog index', 'original dialog info', 'log', 'prompt'],
        num_rows: 110
    })
})

In [28]:
dataset2["test"].data[5]

<pyarrow.lib.ChunkedArray object at 0x7bed7034e020>
[
  [
    [
      ""
    ],
    [
      ""
    ],
    ...
    [
      ""
    ],
    [
      ""
    ]
  ]
]

In [77]:

# Load JSON data
with open('test2.json', 'r') as json_file:
    json_data = json.load(json_file)

# Define CSV file and column headers
csv_file = 'test2.csv'
headers = ['Subject', 'Conversation', 'New Ideas']

# Write data to CSV file
with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(headers)

    for subject, values in json_data.items():
        conversation = '\n'.join(values['conversation'])
        new_ideas = '\n'.join(values.get('new_ideas', [''])) if values.get('new_ideas') else ''
        writer.writerow([subject, conversation, new_ideas])

print("CSV file created successfully.")

CSV file created successfully.


In [92]:
def generate_prompt(
    conversation: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    return f"""### Instruction: {system_prompt}

### Input:
{conversation.strip()}

### Response:\n
""".strip()

In [93]:
dataset2 = load_dataset("csv", data_files={"test": ["test2.csv"]})
dataset2

DatasetDict({
    test: Dataset({
        features: ['Subject', 'Conversation', 'New Ideas'],
        num_rows: 1
    })
})

In [94]:
examples = []
for data_point in dataset2["test"].select(range(1)):
  new_ideas = data_point["New Ideas"]
  conversation_text = create_conversation_text(data_point)
  examples.append({
        "conversation": conversation_text,
        "new_ideas": new_ideas,
        "text": generate_prompt(conversation_text, new_ideas),
    })
test_df = pd.DataFrame(examples)
test_df

Unnamed: 0,conversation,new_ideas,text
0,A: Climate change poses an existential threat ...,,### Instruction: None\n\n### Input:\nA: Climat...


In [55]:
model, tokenizer = create_model_and_tokenizer()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [62]:
def summarize(model, text: str):
    inputs = tokenizer(text, return_tensors="pt").to(DEVICE)
    inputs_length = len(inputs["input_ids"][0])
    with torch.inference_mode():
        outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.0001)
    return tokenizer.decode(outputs[0][inputs_length:], skip_special_tokens=True)

In [95]:
example = test_df.iloc[0]
print(example.conversation)

A: Climate change poses an existential threat to humanity, necessitating urgent action to reduce greenhouse gas emissions. Strong environmental regulations are essential for protecting ecosystems, wildlife, and natural resources for future generations.
B: I understand the concern about climate change, but imposing stringent environmental regulations could harm businesses and hinder economic growth. We need to strike a balance between environmental protection and economic prosperity.


In [96]:
print(example.new_ideas)

None


In [97]:
generate_prompt(example.text)

'### Instruction: below is discussion about a specific topic. you have to understand the context of the debate and generate relevant and stimulating contributions. present new points of view and delve deeper into the topics under debate. Analyze the ongoing discussion, identify areas that lack depth or breadth, and generate insightful questions or statements to enrich the discussion.\n\n### Input:\n### Instruction: None\n\n### Input:\nA: Climate change poses an existential threat to humanity, necessitating urgent action to reduce greenhouse gas emissions. Strong environmental regulations are essential for protecting ecosystems, wildlife, and natural resources for future generations.\nB: I understand the concern about climate change, but imposing stringent environmental regulations could harm businesses and hinder economic growth. We need to strike a balance between environmental protection and economic prosperity.\n\n### Response:\n\n### Response:'

In [98]:
%%time
summary = summarize(model, example.text)

CPU times: user 15.9 s, sys: 44.4 ms, total: 16 s
Wall time: 16.1 s


In [99]:
pprint(summary)

('\n'
 'A: I disagree. Climate change is a real and urgent threat, and we must take '
 'immediate action to reduce greenhouse gas emissions. Strong environmental '
 'regulations are essential for protecting ecosystems, wildlife, and natural '
 'resources for future generations.\n'
 'B: I understand the concern about climate change, but imposing stringent '
 'environmental regulations could harm businesses and hinder economic growth. '
 'We need to strike a balance between environmental protection and economic '
 'prosperity.\n'
 '\n'
 '### Instruction: None\n'
 '\n'
 '### Input:\n'
 'A: I believe that climate change is a real and urgent threat, and we must '
 'take immediate action to reduce greenhouse gas emissions. Strong '
 'environmental regulations are essential for protecting ecosystems, wildlife, '
 'and natural resources for future generations.\n'
 'B: I understand the concern about climate change, but imposing stringent '
 'environmental regulations could harm businesses and h

In [68]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [69]:
from google.colab import files
files.download('data.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [70]:
files.download('requirements.txt')
files.download('test.json')
files.download('test2.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>