In [7]:
import csv
import os
import re

import pandas as pd
from groq import Groq
from openai import OpenAI
from sklearn.metrics import accuracy_score, classification_report

from config import api_key, groq_api

# One API client per provider; the keys come from the local `config` module.
client_openai = OpenAI(api_key=api_key)
client_groq = Groq(api_key=groq_api)

# Model identifiers compared in the cells below.
model_1, model_2, model_3 = 'gpt-4o-mini', 'gemma2-9b-it', 'llama3-70b-8192'

# Load the semicolon-separated test set and keep only its first 50 rows.
data = pd.read_csv('data_test_8.csv', sep=';', encoding='utf-8').head(50)

# Texts to classify and their reference labels, as plain Python lists.
prompts, true_labels = (
    data[column].tolist() for column in ("sentence", "label_bias")
)

data.head()

Unnamed: 0,sentence,label_bias
0,YouTube is making clear there will be no “birt...,1
1,The increasingly bitter dispute between Americ...,0
2,So while there may be a humanitarian crisis dr...,1
3,A professor who teaches climate change classes...,0
4,"Looking around the United States, there is nev...",1


In [5]:
class BiasCheckStats:
    """Benchmark a chat model as a binary bias classifier on labelled texts.

    Every prompt is sent to the model with an instruction to answer ``0``
    (non-biased) or ``1`` (biased); the predictions are then scored against
    the reference labels with scikit-learn metrics.
    """

    def __init__(self, prompts, labels, client, model):
        """Store the evaluation data and the client/model pair to query.

        Args:
            prompts: Iterable of texts to classify.
            labels: Reference labels aligned one-to-one with ``prompts``.
            client: Client object exposing ``chat.completions.create``.
            model: Model identifier passed to the client on every request.
        """
        self.prompts = prompts
        self.labels = labels
        self.config = {
            'base_model': model,
            'base_client': client
        }

    @staticmethod
    def _parse_label(reply):
        """Extract the 0/1 label from a raw model reply.

        Accepts a bare digit as well as replies that wrap it in extra text
        or punctuation (``"1."``, ``"Biased (1)"``). Digits that are part of
        a longer number are ignored.

        Raises:
            ValueError: If the reply is empty, contains no standalone 0/1,
                or contains both (ambiguous answer).
        """
        found = set(re.findall(r"(?<!\d)[01](?!\d)", reply or ""))
        if len(found) != 1:
            raise ValueError(f"Cannot parse a 0/1 label from model reply: {reply!r}")
        return int(found.pop())

    def classify_text(self, text):
        """Ask the configured model to classify ``text``.

        Returns:
            int: ``1`` for biased, ``0`` for non-biased.

        Raises:
            ValueError: If the model reply cannot be read as a single 0/1 label.
        """
        response = self.config['base_client'].chat.completions.create(
            messages=[{"role": "user", "content": f"""Classify the following text into one of two classes: Biased (1) and Non-biased (0).
Answer with only one number (0 or 1), with no explanation.

Text: "{text}"
"""}],
            model=self.config['base_model'],
            temperature=0,
        )
        # `content` may be None (e.g. a refusal); _parse_label reports that
        # as a ValueError instead of failing with AttributeError.
        return self._parse_label(response.choices[0].message.content)

    def evaluate(self):
        """Classify every prompt and print accuracy plus a per-class report.

        Raises:
            ValueError: If prompts and labels differ in length (checked before
                any request is sent) or a model reply cannot be parsed.
        """
        prompts = list(self.prompts)
        labels = list(self.labels)
        # Fail before spending API calls on data that cannot be scored.
        if len(prompts) != len(labels):
            raise ValueError(
                f"prompts and labels differ in length: {len(prompts)} != {len(labels)}"
            )

        predicted_labels = [self.classify_text(text) for text in prompts]

        accuracy = accuracy_score(labels, predicted_labels)
        print("Точность (accuracy):", accuracy)
        print("\nОтчет по метрикам:")
        print(classification_report(labels, predicted_labels))

### gpt-4o-mini

In [8]:
if __name__ == '__main__':
    # Score the first model through the OpenAI client on the loaded sample.
    bcs = BiasCheckStats(prompts=prompts, labels=true_labels,
                         client=client_openai, model=model_1)
    bcs.evaluate()

Точность (accuracy): 0.76

Отчет по метрикам:
              precision    recall  f1-score   support

           0       0.69      0.61      0.65        18
           1       0.79      0.84      0.82        32

    accuracy                           0.76        50
   macro avg       0.74      0.73      0.73        50
weighted avg       0.76      0.76      0.76        50



### gemma2-9b-it

In [9]:
if __name__ == '__main__':
    # Score the second model through the Groq client on the loaded sample.
    bcs = BiasCheckStats(prompts=prompts, labels=true_labels,
                         client=client_groq, model=model_2)
    bcs.evaluate()

Точность (accuracy): 0.72

Отчет по метрикам:
              precision    recall  f1-score   support

           0       0.62      0.56      0.59        18
           1       0.76      0.81      0.79        32

    accuracy                           0.72        50
   macro avg       0.69      0.68      0.69        50
weighted avg       0.71      0.72      0.72        50



### llama3-70b-8192

In [10]:
if __name__ == '__main__':
    # Score the third model through the Groq client on the loaded sample.
    bcs = BiasCheckStats(prompts=prompts, labels=true_labels,
                         client=client_groq, model=model_3)
    bcs.evaluate()

Точность (accuracy): 0.66

Отчет по метрикам:
              precision    recall  f1-score   support

           0       0.52      0.67      0.59        18
           1       0.78      0.66      0.71        32

    accuracy                           0.66        50
   macro avg       0.65      0.66      0.65        50
weighted avg       0.69      0.66      0.67        50



In [None]:
class BiasCheck:
    '''
    Checks user input and LLM output for bias and records every verdict in a CSV log.

    The public check methods return 'Biased', 'Non-biased', or a string that
    starts with "Error in " when the request or the logging fails.
    '''
    LOG_FILE = "bias_check_log.csv"
    # Prefix of the string _get_llm_output returns when the API call fails;
    # run() uses it to tell a failure apart from a real model answer.
    _OUTPUT_ERROR_PREFIX = "Error in _get_llm_output: "

    def __init__(self, model="gpt-4o-mini", api_key=api_key):
        '''
        Stores the model name and API key and makes sure the log file exists.
        '''
        self.config = {"base_model": model}
        self.api_key = api_key
        self._ensure_log_file()

    def _ensure_log_file(self):
        '''
        Creates the log file with its header row if it is missing or empty.
        '''
        if not os.path.exists(self.LOG_FILE) or os.path.getsize(self.LOG_FILE) == 0:
            with open(self.LOG_FILE, mode='w', newline="", encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(['event_type', 'text', 'result'])

    def log_event(self, event_type: str, text: str, result: str):
        '''
        Appends one event (prompt/output check) to the log file.

        Newlines in the text are replaced with spaces so each event stays on one line.
        '''
        with open(self.LOG_FILE, mode='a', newline="", encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow([
                event_type,
                text.replace("\n", " "),
                result])

    def _get_prompt(self):
        '''
        Reads the prompt from the user.
        '''
        return input("How can I assist you: ")

    def _chat(self, messages, **options):
        '''
        Sends the messages to the configured model and returns the reply text.

        Uses the API key given to the constructor. Extra keyword options are
        forwarded to the completion call. Exceptions are not caught here;
        the callers decide how to report them.
        '''
        client = OpenAI(api_key=self.api_key)
        response = client.chat.completions.create(
            model=self.config["base_model"],
            messages=messages,
            **options,
        )
        # The reply content can be None; normalise it to an empty string.
        return response.choices[0].message.content or ""

    def _check_text(self, text, event_type, caller):
        '''
        Asks the model whether the text is offensive/biased and logs the verdict.

        Returns 'Biased' or 'Non-biased', or "Error in <caller>: ..." if the
        request or the logging raises.
        '''
        try:
            answer = self._chat(
                [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": "Check if the following text contains any offensive or biased content. "
                                                "Respond with 'Yes' if it does or 'No' if it does not.\n\n"
                                                f"Text: \"{text}\""}
                ],
                # Deterministic verdicts, as in BiasCheckStats.classify_text.
                temperature=0,
            ).lower()
            # Match "yes" as a whole word so "eyes" or "yesterday" do not count.
            result = 'Biased' if re.search(r"\byes\b", answer) else 'Non-biased'
            self.log_event(event_type, text, result)
            return result

        except Exception as e:
            return f"Error in {caller}: {str(e)}"

    def check_prompt(self, prompt):
        '''
        Checks whether the user input contains biased or offensive content.
        '''
        return self._check_text(prompt, "prompt_check", "check_prompt")

    def _get_llm_output(self, prompt):
        '''
        Gets the LLM answer for a user prompt that passed the bias check.

        Returns the answer text, or "Error in _get_llm_output: ..." on failure.
        '''
        try:
            return self._chat([
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ])
        except Exception as e:
            return f"{self._OUTPUT_ERROR_PREFIX}{str(e)}"

    def check_llm_output(self, output):
        '''
        Checks whether the LLM output contains biased or offensive content.
        '''
        return self._check_text(output, "output_check", "check_llm_output")

    def run(self):
        '''
        Runs one interactive round: check the prompt, answer it, check the answer.

        Returns the last verdict obtained, or the error string if a step failed.
        '''
        user_input = self._get_prompt()
        checked_input = self.check_prompt(user_input)
        print(checked_input)

        if checked_input == 'Biased':
            print('I cannot assist you with this request as it contains biased content.')
            return checked_input
        if checked_input != 'Non-biased':
            # The check itself failed; the error text was printed above and
            # must not be reported to the user as a bias verdict.
            return checked_input

        output = self._get_llm_output(user_input)
        print(output)
        if output.startswith(self._OUTPUT_ERROR_PREFIX):
            # Do not bias-check or log an error message as if it were model output.
            return output

        evaluation = self.check_llm_output(output)
        print(evaluation)
        return evaluation

if __name__ == "__main__":
    # Run one interactive check with the default model and API key; `result`
    # keeps the value returned by run().
    bc = BiasCheck()
    result = bc.run()

How can I assist you: Why do most hate crimes are done by men?
Biased
I cannot assist you with this request as it contains biased content.


In [9]:
import pandas as pd

# Read the log from the same path BiasCheck writes to, rather than a
# hard-coded absolute Colab path, so the reader and the writer always agree.
df = pd.read_csv(BiasCheck.LOG_FILE)
df

Unnamed: 0,event_type,text,result
0,prompt_check,What is gpt?,Non-biased
1,output_check,"GPT stands for ""Generative Pre-trained Transfo...",Non-biased
2,prompt_check,Black people really love chicken. It's a fact....,Biased
3,prompt_check,Why are pugs so cute?,Non-biased
4,output_check,Pugs are often considered cute for several rea...,Non-biased
5,prompt_check,Why do most hate crimes are done by men?,Biased
