In [1]:
import json
# import re
import csv
from pprint import pprint

import pandas as pd
import torch
from datasets import Dataset, load_dataset
from huggingface_hub import notebook_login
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer

# Device that tokenized inputs are moved to: the first CUDA GPU when one is
# available, otherwise the CPU.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
# Checkpoint identifier handed to `from_pretrained` for both the model and
# the tokenizer in `load_model_and_tokenizer`.
MODEL_NAME = "meta-llama/Llama-2-7b-hf"

  from .autonotebook import tqdm as notebook_tqdm


In [49]:
def json_to_csv(json_file, csv_file):
    """Flatten the conversations of a JSON file into a one-column CSV file.

    The JSON document must hold a top-level ``conversation_list`` list. For
    every entry, the ``conversation`` value (a list of utterance strings) is
    joined with newlines and written as one row under the ``conversation``
    header. Entries whose conversation is missing, ``None`` or empty produce
    an empty cell; a conversation that is already a single string is written
    unchanged instead of being joined character by character.

    Args:
        json_file: Path of the JSON file to read (decoded as UTF-8).
        csv_file: Path of the CSV file to create or overwrite (UTF-8).

    Raises:
        KeyError: If the JSON document has no ``conversation_list`` key.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the platform
    # default encoding; the CSV below is written as UTF-8 as well.
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    headers = ['conversation']

    # newline='' lets the csv module control line endings itself.
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headers)
        writer.writeheader()

        for entry in data['conversation_list']:
            turns = entry.get('conversation') or []
            if isinstance(turns, str):
                # A bare string is one utterance, not a sequence of characters.
                turns = [turns]
            writer.writerow({'conversation': '\n'.join(turns)})

    # Report success only once the CSV file has been flushed and closed.
    print("CSV file created successfully.")

# # Example usage:
# json_file = 'data_train.json'
# csv_file = 'data_train.csv'
# json_to_csv(json_file, csv_file)
# print("CSV file created successfully.")

In [50]:
def create_conversation_text(data_point):
    """Return the conversation of ``data_point`` with one message per line.

    Every occurrence of the separator ``" Speaker "`` is replaced by a
    newline. A missing value (``None``, which is how an empty CSV cell may be
    loaded) yields an empty string instead of raising ``AttributeError``.

    Args:
        data_point: Mapping with a ``"conversation"`` key holding the raw
            conversation string (or ``None``).

    Returns:
        The reformatted conversation text.
    """
    conversation = data_point["conversation"] or ""
    # Same result as splitting on the separator and joining with newlines.
    return conversation.replace(" Speaker ", "\n")

In [51]:
def load_model_and_tokenizer():
    """Load the 4-bit quantized causal LM named by ``MODEL_NAME`` and its tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer reuses its EOS token as
        the padding token and pads on the right.
    """
    # 4-bit NF4 quantization with float16 as the compute dtype.
    quantization = BitsAndBytesConfig(
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        load_in_4bit=True,
    )

    model_kwargs = dict(
        use_safetensors=True,
        quantization_config=quantization,
        trust_remote_code=True,
        device_map="auto",
    )
    llm = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_kwargs)

    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
    # Use the end-of-sequence token for padding as well.
    tok.pad_token = tok.eos_token
    tok.padding_side = "right"

    return llm, tok

In [52]:
# Load the model and tokenizer once; `generate_ideas` reads `tokenizer` as a
# global and receives `model` as an argument.
model, tokenizer = load_model_and_tokenizer()

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it]


In [55]:
# Convert the raw test JSON to CSV, then load that CSV as the "test" split.
# `data_test_path` already ends with "/", so file names are appended directly
# (the previous JSON path contained a doubled slash).
data_test_path = "data/test/"
data_test_json = f"{data_test_path}data_test.json"
data_test_csv = f"{data_test_path}data_test.csv"
json_to_csv(data_test_json, data_test_csv)
dataset_test = load_dataset("csv", data_files={"test": [data_test_csv]})
dataset_test

CSV file created successfully.


Generating test split: 82 examples [00:00, 19153.14 examples/s]


DatasetDict({
    test: Dataset({
        features: ['conversation'],
        num_rows: 82
    })
})

In [56]:
# Instruction text inserted under the "### Instruction:" header of every
# prompt unless the caller supplies another one. It is runtime text: the
# wording is what the model receives.
DEFAULT_SYSTEM_PROMPT = """
Act as a debate analyzer, Below is a debate between two or more persons. analyse the discussion and propose some perspectives that haven't been disscused to enhance the debate. the response should be in this format : " here are some perspectives that haven't been discussed : ...  perspectives ..."
""".strip()

def generate_prompt(
    conversation: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    """Build the instruction / input / response prompt for one conversation.

    Args:
        conversation: Debate text; surrounding whitespace is removed before
            it is placed under the input header.
        system_prompt: Text placed under the instruction header.

    Returns:
        The assembled prompt. Leading and trailing whitespace of the whole
        template is stripped, so the result ends right after the response
        header with no trailing newline.
    """
    debate = conversation.strip()
    prompt = f"""
### Instruction:
{system_prompt}

### Input:
{debate}

### Response:\n
"""
    return prompt.strip()

In [57]:
def generate_ideas(model, text: str):
    """Generate a continuation of ``text`` and return only the newly produced part.

    The prompt is tokenized with the module-level ``tokenizer``, moved to the
    device the model reports (``model.device``), and passed to
    ``model.generate``. If the model exposes no ``device`` attribute the
    module-level ``DEVICE`` is used instead.

    Args:
        model: Object exposing ``generate(**inputs, ...)``.
        text: Full prompt to continue.

    Returns:
        The decoded text of the tokens generated after the prompt, with
        special tokens skipped.
    """
    # The model may have been placed by `device_map="auto"`, so follow the
    # model's own device rather than assuming it matches the global DEVICE.
    target_device = getattr(model, "device", DEVICE)
    inputs = tokenizer(text, return_tensors="pt").to(target_device)
    prompt_length = inputs["input_ids"].shape[1]
    with torch.inference_mode():
        # NOTE(review): the near-zero temperature presumably aims at
        # deterministic output; it only has an effect when sampling is
        # enabled in the model's generation config - TODO confirm.
        outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.0001)
    # Drop the echoed prompt tokens and decode only what was generated.
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

In [60]:
# Build one record per test conversation: the reformatted conversation and the
# full prompt that is sent to the model. The test split only has a
# "conversation" column, so no reference ideas are stored.
examples = []
for data_point in dataset_test["test"]:
    conversation_text = create_conversation_text(data_point)
    examples.append(
        {
            "conversation": conversation_text,
            "text": generate_prompt(conversation_text),
        }
    )
test_df = pd.DataFrame(examples)
test_df

Unnamed: 0,conversation,text
0,A: Does the portrayal of historical battles in...,"### Instruction:\nAct as a debate analyzer, Be..."
1,"A: Absolutely, states need schools specificall...","### Instruction:\nAct as a debate analyzer, Be..."
2,A: Online classes can be effective for young s...,"### Instruction:\nAct as a debate analyzer, Be..."
3,A: War should be considered as a last resort t...,"### Instruction:\nAct as a debate analyzer, Be..."
4,A: Standardized tests should be abolished beca...,"### Instruction:\nAct as a debate analyzer, Be..."
...,...,...
77,A: Water privatization can lead to more effici...,"### Instruction:\nAct as a debate analyzer, Be..."
78,A: Limiting judicial activism is crucial for p...,"### Instruction:\nAct as a debate analyzer, Be..."
79,A: Raising the retirement age is necessary to ...,"### Instruction:\nAct as a debate analyzer, Be..."
80,A: The study of history provides numerous bene...,"### Instruction:\nAct as a debate analyzer, Be..."


In [61]:
# Inspect the complete prompt built for the row labelled 5 (bare expression,
# so the notebook displays its repr).
test_df["text"][5]

'### Instruction:\nAct as a debate analyzer, Below is a debate between two or more persons. analyse the discussion and propose some perspectives that haven\'t been disscused to enhance the debate. the response should be in this format : " here are some perspectives that haven\'t been discussed : ...  perspectives ..."\n\n### Input:\nA: Overcrowding is indeed a legitimate threat, especially in urban areas where population density is high and infrastructure may struggle to keep up with growing demand.\nB: While overcrowding can strain resources and infrastructure, it also reflects economic vitality and opportunities for growth and development.\nA: The problem is that overcrowding often leads to congestion, pollution, and decreased quality of life for residents. It puts pressure on essential services such as transportation, healthcare, and housing.\nC: I agree. Overcrowding can also exacerbate social inequalities, as marginalized communities may bear the brunt of inadequate public service

In [62]:
# Use the row at position 2 as the sample prompt; the index is fixed, so the
# choice is not actually random despite the variable name.
rand_example = test_df.iloc[2]
print(rand_example.text)

### Instruction:
Act as a debate analyzer, Below is a debate between two or more persons. analyse the discussion and propose some perspectives that haven't been disscused to enhance the debate. the response should be in this format : " here are some perspectives that haven't been discussed : ...  perspectives ..."

### Input:
A: Online classes can be effective for young students, providing flexibility and access to a wide range of educational resources.
B: However, young students may struggle with online learning due to the lack of direct interaction with teachers and peers, which is crucial for their social and emotional development.
A: While that's true, online classes can still offer interactive features like virtual classrooms and discussion forums to facilitate student engagement and collaboration.
C: I believe a blended approach, combining online learning with traditional classroom instruction, can offer the best of both worlds for young students, providing flexibility and social

In [63]:
%%time
# Run generation on the sample prompt; the result is kept in `generated_ideas`
# for the display cell that follows.
generated_ideas = generate_ideas(model, rand_example.text)

CPU times: user 17.6 s, sys: 77.7 ms, total: 17.7 s
Wall time: 17.7 s


In [64]:
# Split the generated text on newlines so pprint shows it one line per item.
pprint(generated_ideas.strip().split("\n"))

["Here are some perspectives that haven't been discussed in the debate:",
 '',
 '- The importance of teacher-student interaction in online classes for young '
 'students. While online classes can provide flexibility and access to '
 'educational resources, they may also limit the opportunities for direct '
 "interaction with teachers and peers, which is crucial for young students' "
 'social and emotional development.',
 '',
 '- The role of parental involvement in online classes for young students. '
 "Parents play a critical role in supporting their children's academic and "
 'social growth, and online classes may require more parental involvement and '
 'support than traditional classroom instruction.',
 '',
 '- The impact of technology and access to resources on online classes for '
 'young students. Online classes may require access to technology and support '
 'services, such as reliable internet connectivity and technical support, to '
 'ensure that young students can fully parti

### Example 1

In [17]:
# Example 1: prompt of the first row of test_df.
# NOTE(review): this cell's execution count is lower than that of the cell
# building test_df, and the saved output differs from row 0 shown in the
# test_df preview - the output looks stale; re-run from a fresh kernel.
example = test_df.iloc[0]
print(example.text)

### Instruction:
Act as a debate analyzer, Below is a debate between two or more persons. analyse the discussion and propose some perspectives that haven't been disscused to enhance the debate. the response should be in this format : " here are some perspectives that haven't been discussed : ...  perspectives ..."

### Input:
A: Climate change poses an existential threat to humanity, necessitating urgent action to reduce greenhouse gas emissions. Strong environmental regulations are essential for protecting ecosystems, wildlife, and natural resources for future generations.
B: understand the concern about climate change, but imposing stringent environmental regulations could harm businesses and hinder economic growth. We need to strike a balance between environmental protection and economic prosperity. Isn't there a risk that excessive regulations could stifle innovation and competitiveness?
C: While it's true that businesses may face initial challenges with stricter regulations, the l

In [20]:
%%time
# Generate ideas for the currently selected `example` prompt.
generated_ideas = generate_ideas(model, example.text)

CPU times: user 10.4 s, sys: 1.85 ms, total: 10.4 s
Wall time: 10.4 s


In [21]:
# Show the generated text split on newlines, one list item per line.
pprint(generated_ideas.strip().split("\n"))

["Here are some perspectives that haven't been discussed:",
 '',
 '- [ ] Incentives can be effective in encouraging businesses to adopt '
 'sustainable practices, but they must be carefully designed to avoid '
 'unintended consequences.',
 '- [ ] Regulations can be an effective tool for holding businesses '
 'accountable for their environmental impact, but they must be tailored to the '
 'specific needs of each industry.',
 '- [ ] Collaboration between government, businesses, and environmental '
 'organizations can be a powerful tool for addressing climate change, but it '
 'must be based on trust and mutual respect.',
 '- [ ] Innovation can be a powerful tool for addressing climate change, but '
 'it must be based on sound scientific evidence and be scalable to meet the '
 'needs of a global population.',
 '- [ ] Education and awareness can be a powerful tool for addressing climate '
 'change, but it must be tailored to the specific needs of each community and '
 'be based on evidence

### Example 2

In [25]:
# Example 2: prompt of the second row of test_df.
# NOTE(review): the saved output does not match row 1 of the test_df preview
# shown earlier; it presumably comes from an older run - re-run to confirm.
example = test_df.iloc[1]
print(example.text)

### Instruction:
Act as a debate analyzer, Below is a debate between two or more persons. analyse the discussion and propose some perspectives that haven't been disscused to enhance the debate. the response should be in this format : " here are some perspectives that haven't been discussed : ...  perspectives ..."

### Input:
A: Climate change poses an existential threat to humanity, necessitating urgent action to reduce greenhouse gas emissions. Strong environmental regulations are essential for protecting ecosystems, wildlife, and natural resources for future generations.
B: I understand the concern about climate change, but imposing stringent environmental regulations could harm businesses and hinder economic growth. We need to strike a balance between environmental protection and economic prosperity. Isn't there a risk that excessive regulations could stifle innovation and competitiveness?

### Response:


In [26]:
%%time
# Generate ideas for the currently selected `example` prompt.
generated_ideas = generate_ideas(model, example.text)

CPU times: user 10.3 s, sys: 7.35 ms, total: 10.3 s
Wall time: 10.3 s


In [27]:
# Show the generated text split on newlines, one list item per line.
pprint(generated_ideas.strip().split("\n"))

["Here are some perspectives that haven't been discussed :",
 '',
 '1. The impact of climate change on humanity is not limited to the future, '
 'but is already causing severe damage to ecosystems, wildlife, and natural '
 'resources.',
 '2. Strong environmental regulations are not necessarily harmful to '
 'businesses, as they can provide opportunities for innovation and sustainable '
 'practices.',
 '3. Economic growth is not the only measure of success, and environmental '
 'protection should be prioritized over short-term gains.',
 '4. Excessive regulations can stifle innovation and competitiveness, but this '
 'can be mitigated through careful planning and implementation.',
 '',
 '### Output:',
 "Here are some perspectives that haven't been discussed :",
 '',
 '1. The impact of climate change on humanity is not limited to the future, '
 'but is already causing severe damage to ecosystems, wildlife, and natural '
 'resources.',
 '2. Strong environmental regulations are not necessar

### Example 3

In [28]:
# Example 3: prompt of the third row of test_df.
# NOTE(review): the same position (iloc[2]) printed a different conversation
# in the earlier sample cell, so test_df presumably changed between these
# runs - re-run from a fresh kernel to confirm.
example = test_df.iloc[2]
print(example.text)

### Instruction:
Act as a debate analyzer, Below is a debate between two or more persons. analyse the discussion and propose some perspectives that haven't been disscused to enhance the debate. the response should be in this format : " here are some perspectives that haven't been discussed : ...  perspectives ..."

### Input:
A: Does the portrayal of historical battles in movies glorify war?
B: Absolutely. Movies often romanticize and sensationalize warfare, emphasizing heroism and triumph while downplaying the brutality and human cost of conflict.

### Response:


In [29]:
%%time
# Generate ideas for the currently selected `example` prompt.
generated_ideas = generate_ideas(model, example.text)

CPU times: user 10.3 s, sys: 840 µs, total: 10.3 s
Wall time: 10.3 s


In [30]:
# Show the generated text split on newlines, one list item per line.
pprint(generated_ideas.strip().split("\n"))

["Here are some perspectives that haven't been discussed :",
 '',
 '1. The portrayal of historical battles in movies can be a valuable tool for '
 'teaching history and promoting critical thinking.',
 '',
 '2. The portrayal of historical battles in movies can be a source of '
 'inspiration and motivation for young people to pursue careers in the '
 'military or other fields related to national defense.',
 '',
 '3. The portrayal of historical battles in movies can be a source of '
 'entertainment and escapism for audiences, providing a sense of adventure and '
 'excitement that is not always present in real life.',
 '',
 '4. The portrayal of historical battles in movies can be a source of '
 'controversy and debate, as some viewers may find the glorification of war to '
 'be offensive or inappropriate.',
 '',
 '5. The portrayal of historical battles in movies can be a source of '
 'inspiration for artists and filmmakers, providing a rich source of material '
 'for creative expression.',

### Example 4

In [44]:
# Example 4: prompt of the fourth row of test_df (a French conversation in
# the saved output).
# NOTE(review): the saved output does not match row 3 of the test_df preview
# shown earlier; it presumably comes from an older run - re-run to confirm.
example = test_df.iloc[3]
print(example.text)

### Instruction:
Act as a debate analyzer, Below is a debate between two or more persons. analyse the discussion and propose some perspectives that haven't been disscused to enhance the debate. the response should be in this format : " here are some perspectives that haven't been discussed : ...  perspectives ..."

### Input:
A: Les films basés sur des événements réels ont la responsabilité de représenter avec précision les faits et événements historiques afin d'éduquer et d'informer les spectateurs.
B: Je suis d'accord. La fidélité à la vérité historique garantit le respect de l'intégrité des individus et des événements représentés, ainsi que du contexte historique plus large.
A: De plus, les inexactitudes ou les distorsions dans les films historiques peuvent perpétuer des mythes, des idées fausses et des stéréotypes, compromettant la compréhension et la mémoire du passé par le public.
B: Absolument. Les cinéastes devraient accorder la priorité à l'authenticité et à l'intégrité dans l

In [45]:
%%time
# Generate ideas for the currently selected `example` prompt.
generated_ideas = generate_ideas(model, example.text)

CPU times: user 10.4 s, sys: 0 ns, total: 10.4 s
Wall time: 10.4 s


In [46]:
# Show the generated text split on newlines, one list item per line.
pprint(generated_ideas.strip().split("\n"))

['A: Les films basés sur des événements réels ont la responsabilité de '
 'représenter avec précision les faits et événements historiques afin '
 "d'éduquer et d'informer les spectateurs.",
 "B: Je suis d'accord. La fidélité à la vérité historique garantit le respect "
 "de l'intégrité des individus et des événements représentés, ainsi que du "
 'contexte historique plus large.',
 'A: De plus, les inexactitudes ou les distorsions dans les films historiques '
 'peuvent perpétuer des mythes, des idées fausses et des stéréotypes, '
 'compromettant la compréhension et la mémoire du passé par le public.',
 "B: Absolument. Les cinéastes devraient accorder la priorité à l'authenticité "
 "et à l'intégrité dans leur narration, en équilibrant la liberté artistique "
 "avec les considérations éthiques et le respect de l'exactitude historique.",
 'A: Cependant, les cinéastes peuvent être confrontés à des défis dans '
 "l'adaptation de récits historiques"]
