In [1]:
import json
import re
import csv
from pprint import pprint

import pandas as pd
import torch
from datasets import Dataset, load_dataset
from huggingface_hub import notebook_login
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "meta-llama/Llama-2-7b-hf"

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def convert_to_csv(file_name):

    # Load JSON data
    with open(f'{file_name}.json', 'r') as json_file:
        json_data = json.load(json_file)

    # Define CSV file and column headers
    csv_file = f'{file_name}.csv'
    headers = ['Conversation', 'new_ideas']

    # Write data to CSV file
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(headers)

        for subject, values in json_data.items():
            conversation = '\n'.join(values['conversation'])
            new_ideas = '\n'.join(values['new_ideas'])
            writer.writerow([subject, conversation, new_ideas])

    print("CSV file created successfully.")

In [31]:
def json_to_csv(json_file, csv_file):
    # Load JSON data
    with open(json_file, 'r') as file:
        data = json.load(file)

    # Define CSV column headers
    headers = ['conversation', 'new_ideas']

    # Write data to CSV file
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headers)
        writer.writeheader()

        # Iterate through conversation_list in JSON data
        for entry in data['conversation_list']:
            conversation = '\n'.join(entry['conversation']) if 'conversation' in entry else ''
            new_ideas = '\n'.join(entry['new_ideas']) if 'new_ideas' in entry else ''

            # Write each row to CSV
            writer.writerow({'conversation': conversation, 'new_ideas': new_ideas})

# Example usage:
json_file = 'data_train.json'
csv_file = 'data_train.csv'
json_to_csv(json_file, csv_file)
print("CSV file created successfully.")

CSV file created successfully.


In [32]:
json_file = 'data_val.json'
csv_file = 'data_val.csv'
json_to_csv(json_file, csv_file)
print("CSV file created successfully.")

CSV file created successfully.


In [33]:
dataset = load_dataset("csv", data_files={"train": ["data_train.csv"]})
dataset

Generating train split: 390 examples [00:00, 40315.93 examples/s]


DatasetDict({
    train: Dataset({
        features: ['conversation', 'new_ideas'],
        num_rows: 390
    })
})

In [34]:
dataset_val = load_dataset("csv", data_files={"validation": ["data_val.csv"]})
dataset_val

Generating validation split: 6 examples [00:00, 2557.50 examples/s]


DatasetDict({
    validation: Dataset({
        features: ['conversation', 'new_ideas'],
        num_rows: 6
    })
})

In [100]:
DEFAULT_SYSTEM_PROMPT = """
below is discussion about a specific topic. you have to understand the context of the debate and generate relevant and stimulating contributions. present new points of view and delve deeper into the topics under debate. Analyze the ongoing discussion, identify areas that lack depth or breadth, and generate insightful questions or statements to enrich the discussion.
""".strip()


def generate_training_prompt(
    conversation: str, new_ideas: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    return f"""### Instruction: {system_prompt}

### Input:
{conversation.strip()}

### Response:
{new_ideas}
""".strip()

In [106]:
# def clean_text(text):
#     text = re.sub(r"http\S+", "", text)
#     text = re.sub(r"@[^\s]+", "", text)
#     text = re.sub(r"\s+", " ", text)
#     return re.sub(r"\^[^ ]+", "", text)

def create_conversation_text(data_point):
    conversation = data_point["conversation"]
    messages = conversation.split(" Speaker ")

    # Format each message with newline
    formatted_conversation = '\n'.join(messages)

    return formatted_conversation

In [40]:
def generate_text(data_point):
    new_ideas = data_point["new_ideas"]
    conversation_text = create_conversation_text(data_point)
    return {
        "conversation": conversation_text,
        "new_ideas": new_ideas,
        "text": generate_training_prompt(conversation_text, new_ideas),
    }

In [41]:
example = generate_text(dataset["train"][0])
example

{'conversation': "A: We should abandon television because it promotes mindless consumption and passive entertainment, leading to a decline in critical thinking skills.\nB: While it's true that television can have negative effects, it also serves as a valuable source of information, entertainment, and cultural exchange.",
 'new_ideas': 'One alternative approach could be to reform television programming to prioritize educational content and meaningful storytelling. By promoting critical thinking and fostering a deeper understanding of complex issues, television can become a powerful tool for positive social change.\nAdditionally, with advancements in technology, we can leverage interactive and personalized viewing experiences to engage viewers in more meaningful ways, encouraging active participation and learning.\nMoreover, television has the potential to reach diverse audiences, including those with limited access to other forms of media. This makes it a valuable medium for disseminati

In [42]:
def process_dataset(data: Dataset):
    return (
        data.shuffle(seed=42)
        .map(generate_text)
    )

In [10]:
# notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [43]:
def create_model_and_tokenizer():
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        # load_in_8bit_fp32_cpu_offload=True
    )

    # device_map = {
    #     "transformer.wte": 0,
    #     "transformer.wpe": 0,
    #     "transformer.ln_f": 0,
    #     "lm_head": 0,
    #     "transformer.h.0": 0,
    #     "transformer.h.1": 0,
    #     "transformer.h.2": 0,
    #     "transformer.h.3": 0,
    #     "transformer.h.4": 0,
    #     "transformer.h.5": 0,
    #     "transformer.h.6": 0,
    #     "transformer.h.7": 0,
    #     "transformer.h.8": 0,
    #     "transformer.h.9": 0,
    #     "transformer.h.10": 0,
    #     "transformer.h.11": 0
    # }

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        use_safetensors=True,
        quantization_config=bnb_config,
        trust_remote_code=True,
        device_map="auto",
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return model, tokenizer

In [44]:
model, tokenizer = create_model_and_tokenizer()
model.config.use_cache = False

Downloading shards: 100%|██████████| 2/2 [04:04<00:00, 122.34s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]


In [45]:
model.config.quantization_config.to_dict()

{'load_in_8bit': False,
 'load_in_4bit': True,
 'llm_int8_threshold': 6.0,
 'llm_int8_skip_modules': None,
 'llm_int8_enable_fp32_cpu_offload': False,
 'llm_int8_has_fp16_weight': False,
 'bnb_4bit_quant_type': 'nf4',
 'bnb_4bit_use_double_quant': False,
 'bnb_4bit_compute_dtype': 'float16'}

In [46]:
lora_r = 16
lora_alpha = 64
lora_dropout = 0.1
lora_target_modules = [
    "q_proj",
    "up_proj",
    "o_proj",
    "k_proj",
    "down_proj",
    "gate_proj",
    "v_proj",
]


peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none",
    task_type="CAUSAL_LM",
)

In [47]:
OUTPUT_DIR = "experiments"

%load_ext tensorboard
%tensorboard --logdir experiments/runs

ModuleNotFoundError: No module named 'tensorboard'

In [48]:

training_arguments = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    logging_steps=1,
    learning_rate=1e-4,
    fp16=True,
    max_grad_norm=0.3,
    num_train_epochs=2,
    evaluation_strategy="steps",
    eval_steps=0.2,
    warmup_ratio=0.05,
    save_strategy="epoch",
    group_by_length=True,
    output_dir=OUTPUT_DIR,
    report_to="tensorboard",
    save_safetensors=True,
    lr_scheduler_type="cosine",
    seed=42,
)

In [51]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset_val["validation"],
    peft_config=peft_config,
    dataset_text_field="new_ideas",
    max_seq_length=4096,
    tokenizer=tokenizer,
    args=training_arguments,
)

Map: 100%|██████████| 390/390 [00:00<00:00, 10383.72 examples/s]


In [52]:
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




Step,Training Loss,Validation Loss
10,2.0929,1.994213
20,1.0727,1.739812
30,1.7206,1.653468
40,1.5346,1.642692




TrainOutput(global_step=48, training_loss=1.5462827235460281, metrics={'train_runtime': 129.3215, 'train_samples_per_second': 6.031, 'train_steps_per_second': 0.371, 'total_flos': 1810961475747840.0, 'train_loss': 1.5462827235460281, 'epoch': 1.96})

In [53]:
trainer.save_model()


In [54]:
trainer.model


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False

In [56]:
from peft import AutoPeftModelForCausalLM

trained_model = AutoPeftModelForCausalLM.from_pretrained(
    OUTPUT_DIR,
    low_cpu_mem_usage=True,
)

merged_model = model.merge_and_unload()
merged_model.save_pretrained("merged_model", safe_serialization=True)
tokenizer.save_pretrained("merged_model")

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.06it/s]


AttributeError: 'LlamaForCausalLM' object has no attribute 'merge_and_unload'

In [77]:
json_to_csv("test2.json", "test2.csv")

In [142]:


def generate_prompt(
    conversation: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    return f"""
### Instruction: {system_prompt}

### Input:
{conversation.strip()}

### Response:\n
""".strip()

In [143]:
dataset_test = load_dataset("csv", data_files={"test": ["test2.csv"]})
dataset_test

DatasetDict({
    test: Dataset({
        features: ['conversation', 'new_ideas'],
        num_rows: 1
    })
})

In [144]:
examples = []
for data_point in dataset_test["test"].select(range(1)):
  new_ideas = data_point["new_ideas"]
  conversation_text = create_conversation_text(data_point)
  examples.append({
        "conversation": conversation_text,
        "new_ideas": new_ideas,
        "text": generate_prompt(conversation_text),
    })
test_df = pd.DataFrame(examples)
test_df

Unnamed: 0,conversation,new_ideas,text
0,A: Climate change poses an existential threat ...,,### Instruction: below is discussion about a s...


In [145]:
pprint(test_df["text"][0])

('### Instruction: below is discussion about a specific topic. you have to '
 'understand the context of the debate and generate relevant and stimulating '
 'contributions. present new points of view and delve deeper into the topics '
 'under debate. Analyze the ongoing discussion, identify areas that lack depth '
 'or breadth, and generate insightful questions or statements to enrich the '
 'discussion.\n'
 '\n'
 '### Input:\n'
 'A: Climate change poses an existential threat to humanity, necessitating '
 'urgent action to reduce greenhouse gas emissions. Strong environmental '
 'regulations are essential for protecting ecosystems, wildlife, and natural '
 'resources for future generations.\n'
 'B: I understand the concern about climate change, but imposing stringent '
 'environmental regulations could harm businesses and hinder economic growth. '
 'We need to strike a balance between environmental protection and economic '
 "prosperity. Isn't there a risk that excessive regulations co

In [133]:
model, tokenizer = create_model_and_tokenizer()

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]


In [153]:
def generate_ideas(model, text: str):
    print(text)
    inputs = tokenizer(text, return_tensors="pt").to(DEVICE)
    inputs_length = len(inputs["input_ids"][0])
    with torch.inference_mode():
        outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.0001)
    return tokenizer.decode(outputs[0][inputs_length:], skip_special_tokens=True)

In [154]:
example = test_df.iloc[0]
print(example.conversation)

A: Climate change poses an existential threat to humanity, necessitating urgent action to reduce greenhouse gas emissions. Strong environmental regulations are essential for protecting ecosystems, wildlife, and natural resources for future generations.
B: I understand the concern about climate change, but imposing stringent environmental regulations could harm businesses and hinder economic growth. We need to strike a balance between environmental protection and economic prosperity. Isn't there a risk that excessive regulations could stifle innovation and competitiveness?


In [155]:
print(example.new_ideas)

None


In [156]:
generate_prompt(example.text)

"### Instruction: below is discussion about a specific topic. you have to understand the context of the debate and generate relevant and stimulating contributions. present new points of view and delve deeper into the topics under debate. Analyze the ongoing discussion, identify areas that lack depth or breadth, and generate insightful questions or statements to enrich the discussion.\n\n### Input:\n### Instruction: below is discussion about a specific topic. you have to understand the context of the debate and generate relevant and stimulating contributions. present new points of view and delve deeper into the topics under debate. Analyze the ongoing discussion, identify areas that lack depth or breadth, and generate insightful questions or statements to enrich the discussion.\n\n### Input:\nA: Climate change poses an existential threat to humanity, necessitating urgent action to reduce greenhouse gas emissions. Strong environmental regulations are essential for protecting ecosystems, 

In [157]:
%%time
generated_ideas = generate_ideas(model, example.text)

### Instruction: below is discussion about a specific topic. you have to understand the context of the debate and generate relevant and stimulating contributions. present new points of view and delve deeper into the topics under debate. Analyze the ongoing discussion, identify areas that lack depth or breadth, and generate insightful questions or statements to enrich the discussion.

### Input:
A: Climate change poses an existential threat to humanity, necessitating urgent action to reduce greenhouse gas emissions. Strong environmental regulations are essential for protecting ecosystems, wildlife, and natural resources for future generations.
B: I understand the concern about climate change, but imposing stringent environmental regulations could harm businesses and hinder economic growth. We need to strike a balance between environmental protection and economic prosperity. Isn't there a risk that excessive regulations could stifle innovation and competitiveness?

### Response:
CPU time

In [152]:
pprint(generated_ideas)

('\n'
 'A: I agree that we need to strike a balance between environmental protection '
 'and economic prosperity. However, I believe that strong environmental '
 'regulations are essential for protecting ecosystems, wildlife, and natural '
 'resources for future generations.\n'
 '\n'
 'B: I understand the concern about climate change, but imposing stringent '
 'environmental regulations could harm businesses and hinder economic growth. '
 'We need to strike a balance between environmental protection and economic '
 "prosperity. Isn't there a risk that excessive regulations could stifle "
 'innovation and competitiveness?\n'
 '\n'
 '### Input:\n'
 'A: The COVID-19 pandemic has had a devastating impact on the global economy, '
 'with millions of jobs lost and businesses struggling to stay afloat.\n'
 'B: I understand the concern about the economic impact of the pandemic, but '
 'we need to focus on the health and safety of our communities. The pandemic '
 'has highlighted the importance 

In [68]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [69]:
from google.colab import files
files.download('data.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [70]:
files.download('requirements.txt')
files.download('test.json')
files.download('test2.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>