In [1]:
!pip install openpyxl pandas transformers accelerate bitsandbytes



In [1]:
import pandas as pd

df = pd.read_excel('Traits List.xlsx')
essential_traits = [
    'Active', 'Adventurous', 'Affectionate', 'Ambitious', 'Angry', 'Anxious', 'Arrogant',
    'Attentive', 'Bossy', 'Brave', 'Calm', 'Capable', 'Careful', 'Cautious', 'Charismatic',
    'Cheerful', 'Clever', 'Clumsy', 'Cold-Hearted', 'Compassionate', 'Confident', 'Considerate',
    'Cooperative', 'Courageous', 'Cowardly', 'Critical', 'Cruel', 'Curious', 'Daring',
    'Decisive', 'Dependable', 'Determined', 'Diligent', 'Dishonest', 'Disrespectful',
    'Eager', 'Easygoing', 'Efficient', 'Eloquent', 'Embarrassed', 'Energetic', 'Enthusiastic',
    'Fair', 'Faithful', 'Fearless', 'Friendly', 'Funny', 'Generous', 'Gentle', 'Grateful',
    'Greedy', 'Grouchy', 'Gullible', 'Happy', 'Helpful', 'Honest', 'Hopeful', 'Humble',
    'Impulsive', 'Independent', 'Innocent', 'Intelligent', 'Jealous', 'Kind', 'Lazy',
    'Logical', 'Lonely', 'Loving', 'Loyal', 'Mean', 'Mature', 'Moody', 'Naïve', 'Nervous',
    'Obedient', 'Optimistic', 'Organized', 'Pessimistic', 'Polite', 'Popular', 'Positive',
    'Proud', 'Reliable', 'Respectful', 'Responsible', 'Rude', 'Sarcastic', 'Selfish',
    'Sensitive', 'Shy', 'Smart', 'Sociable', 'Stubborn', 'Sweet', 'Talkative', 'Thoughtful',
    'Timid', 'Trustworthy', 'Wise', 'Witty'
]

df = df[df['Trait'].isin(essential_traits)].reset_index(drop=True)
df.head()

Unnamed: 0,Trait,Description
0,Active,Tending to move around often; full of energy
1,Adventurous,Willing to take risks; likes new adventures & ...
2,Affectionate,Showing affection; tender; loving
3,Ambitious,Eager to succeed; full of desire
4,Angry,Feeling or showing anger


In [2]:
df.shape

(91, 2)

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

model_name = "mistralai/Mistral-7B-Instruct-v0.1"

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto"
)

system_prompt_template = """
You are given a personality trait and its description. Perform the following tasks:

1. Generate a system prompt that would cause an LLM to simulate this trait in its responses. It should follow this format:
System Prompt: [text that guides LLM behavior in line with the trait]

2. Translate the trait name into:
• Russian: [Trait in Russian]
• Ukrainian: [Trait in Ukrainian]

3. Classify the sentiment of the trait as one of:
• Positive
• Negative
• Neutral

Respond in this format exactly:

Trait: [original trait name]
System Prompt: [system prompt text]
Russian: [translated trait]
Ukrainian: [translated trait]
Sentiment: [Positive/Negative/Neutral]

Trait: {trait}
Description: {description}
"""

def generate_answer(trait, description):
    prompt = system_prompt_template.format(trait=trait, description=description)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to("cuda")
    output = model.generate(**inputs, max_new_tokens=250, do_sample=True, temperature=0.7)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

results = []

for _, row in df.iterrows():
    trait = row['Trait']
    description = row['Description']
    response = generate_answer(trait, description)
    try:
        results.append(response)
    except IndexError as e:
        print(e)
        continue

    # results.append({
    #     "Trait": trait_name,
    #     "System Prompt": system_prompt,
    #     "Russian": russian,
    #     "Ukrainian": ukrainian,
    #     "Sentiment": sentiment
    # })

# df_output = pd.DataFrame(results)
# df_output.head()
results

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 

["\nYou are given a personality trait and its description. Perform the following tasks:\n\n1. Generate a system prompt that would cause an LLM to simulate this trait in its responses. It should follow this format:\nSystem Prompt: [text that guides LLM behavior in line with the trait]\n\n2. Translate the trait name into:\n• Russian: [Trait in Russian]\n• Ukrainian: [Trait in Ukrainian]\n\n3. Classify the sentiment of the trait as one of:\n• Positive\n• Negative\n• Neutral\n\nRespond in this format exactly:\n\nTrait: [original trait name]\nSystem Prompt: [system prompt text]\nRussian: [translated trait]\nUkrainian: [translated trait]\nSentiment: [Positive/Negative/Neutral]\n\nTrait: Active\nDescription: Tending to move around often; full of energy\nSystem Prompt: Your responses should be punctuated with exclamation marks and convey enthusiasm. Your language should be vivid and energetic.\nRussian: Активный\nUkrainian: Активний\nSentiment: Positive\n\nTrait: Analytical\nDescription: Tendi

In [5]:
import json

alpaca_df = pd.read_json("alpaca_data.json")
df_output = pd.read_csv("traits_with_prompts.csv")

N = 100

final_dataset = []
for _, row in df_output.iterrows():
    trait = row["Trait"]
    system_prompt = row["System Prompt"]
    instructions = alpaca_df.sample(N, random_state=42)["instruction"].tolist()
    
    for instr in instructions:
        full_prompt = f"<|system|>\n{system_prompt}\n<|user|>\n{instr}\n<|assistant|>\n"
        entry = {
            "trait": trait,
            "system_prompt": system_prompt,
            "alpaca_prompt": instr,
            "full_prompt": full_prompt
        }
        final_dataset.append(entry)

with open("trait_combined_dataset.json", "w", encoding="utf-8") as f:
    json.dump(final_dataset, f, ensure_ascii=False, indent=2)

final_dataset[0]

FileNotFoundError: File alpaca_data.json does not exist