Step 1: Data Pre-processing

In [None]:
# Make Google Drive available to this Colab runtime under a fixed mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import re

import pandas as pd

# Read the full dataset export from Drive
file_path = '/content/drive/My Drive/MA_Dataset.csv'
df = pd.read_csv(file_path)

# Columns to keep, grouped by role; concatenating the groups gives the
# column order of the subset.
leading_context_columns = ['algorithm_id', 'name', 'organization']
analysis_columns = ['goal', 'methods_and_models', 'risks']  # key fields for the analysis
trailing_context_columns = ['publication_category', 'category', 'status', 'proportionality',
                            'lawful_basis', 'human_intervention']
columns = leading_context_columns + analysis_columns + trailing_context_columns

# Work on an independent copy so later edits never touch `df`
df_subset = df.loc[:, columns].copy()

# Normalize text fields for key analysis columns.
# Each key is a lowercase variant; its value is the canonical term it is rewritten to.
normalization_map = {
    # Machine Learning Variants
    'machine-learning': 'machine learning',
    'machine learning algorithm': 'machine learning',
    'ml': 'machine learning',
    'ml model': 'machine learning',
    'ml algorithms': 'machine learning',
    'ai/ml': 'machine learning',

    # Rule-Based Systems
    'rule-based system': 'rule-based',
    'rules-based': 'rule-based',
    'expert system': 'rule-based',

    # Decision Trees
    'decision tree': 'decision trees',
    'dt': 'decision trees',

    # Random Forest
    'random forest model': 'random forest',
    'rf': 'random forest',

    # Regression Models
    'logistic regression': 'regression',
    'linear regression': 'regression',
    'regression model': 'regression',

    # Clustering
    'clustering algorithm': 'clustering',
    'k-means': 'clustering',

    # Natural Language Processing
    'nlp': 'natural language processing',
    'text analysis': 'natural language processing',

    # Neural Networks
    'deep learning': 'neural network',
    'neural network model': 'neural network',

    # Statistical Models
    'statistical model': 'statistics',
    'statistical analysis': 'statistics',

    # Other General Terms
    'ai': 'artificial intelligence',
    'data analysis': 'analytics',
    'data mining': 'analytics'
}

# One alternation over all variants, longest first. Applying the variants one
# at a time in dict order let short variants ('ml', 'ai') rewrite the text
# before the longer variants containing them ('ml model', 'ai/ml') were tried,
# so those longer entries could never match. Variants are escaped so they are
# always treated as literal text.
_variants_longest_first = sorted(normalization_map, key=len, reverse=True)
normalization_pattern = re.compile(
    r'\b(?:' + '|'.join(re.escape(term) for term in _variants_longest_first) + r')\b'
)


def normalize_terms(series, max_passes=3):
    """Lowercase a text Series and rewrite known variants to canonical terms.

    Args:
        series: pandas Series of strings; missing values are left as missing.
        max_passes: upper bound on replacement passes. A further pass is only
            needed when a replacement creates a new variant together with the
            following words (e.g. 'logistic regression model' ->
            'regression model' -> 'regression').

    Returns:
        A new Series with the normalized text.
    """
    normalized = series.str.lower()
    for _ in range(max_passes):
        replaced = normalized.str.replace(
            normalization_pattern,
            lambda match: normalization_map[match.group(0)],
            regex=True,
        )
        if replaced.equals(normalized):  # nothing left to rewrite
            break
        normalized = replaced
    return normalized


text_columns = ['goal', 'methods_and_models', 'risks']
for col in text_columns:
    df_subset[col] = normalize_terms(df_subset[col])

# Trim leading/trailing whitespace from 'name' for consistency
df_subset['name'] = df_subset['name'].str.strip()

# Add a boolean '<column>_missing' flag for each key analysis column
missing_flags = {f'{col}_missing': df_subset[col].isna() for col in text_columns}
for flag_name, flag_values in missing_flags.items():
    df_subset[flag_name] = flag_values

# Save cleaned subset (including the flag columns) for API processing
df_subset.to_csv('cleaned_algorithm_register_subset.csv', index=False)

# Log pre-processing steps: build the lines first, then write them in one go
log_lines = [
    f"Dataset loaded with {len(df)} entries and {len(df.columns)} columns.\n",
    f"Subset created with columns: {columns}\n",
    f"Missing values flagged: goal ({df_subset['goal_missing'].sum()}), "
    f"methods_and_models ({df_subset['methods_and_models_missing'].sum()}), "
    f"risks ({df_subset['risks_missing'].sum()})\n",
]
with open('preprocessing_log.txt', 'w') as f:
    f.writelines(log_lines)

Step 2: Prompt Design & Testing

In [None]:
# Ensure OpenAI is installed in your coding environment
!pip install openai



In [None]:
# Show which version of the OpenAI SDK this kernel imports
import openai

installed_version = openai.__version__
print(installed_version)

1.78.1


In [None]:
# OpenAI has released new updates. Make sure you have the latest version
!pip install --upgrade openai



In [None]:
# Check if OpenAI is successfully connected
import os

import openai

# Read the key from the environment instead of hard-coding it in the notebook,
# so the secret is never saved or shared together with the code.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")

client = openai.OpenAI(api_key=api_key)

# Minimal request: a short completion is enough to prove the key and network work
response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Test prompt"}
    ],
    max_tokens=50
)

print("API connection successful!")
print("Response:", response.choices[0].message.content)


API connection successful!
Response: How can I assist you with this test prompt?


In [None]:
# This code tests the prompt by selecting 20 random entries from the entire dataset. The scoring done by GPT-4 has to be reviewed manually and any inconsistencies addressed in the prompt.

import json
import os
import time

import pandas as pd
from openai import OpenAI

SAMPLE_SIZE = 20  # number of entries used to test the prompt

# Initialize the OpenAI client.
# The key is read from the environment so it is never stored in the notebook.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")
client = OpenAI(api_key=api_key)

# Load your dataset; blank out missing text so the prompt shows an empty field
df = pd.read_csv("cleaned_algorithm_register_subset.csv")
df = df[['name', 'goal', 'methods_and_models', 'risks']].fillna('')

# Sample the test entries (fixed seed for reproducibility); never ask for more
# rows than the dataset has, which would make `sample` raise.
df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42).reset_index(drop=True)

# Define the scoring prompt with your full framework
def build_prompt(entry):
    """Build the transparency-scoring prompt for one register entry.

    Args:
        entry: mapping (e.g. a DataFrame row) with the keys 'name', 'goal',
            'methods_and_models' and 'risks'.

    Returns:
        The full prompt text: scoring framework, instructions, the JSON output
        template pre-filled with the entry's name, and the entry's fields.
    """
    # Encode the name as a JSON string so quotes, backslashes or newlines in it
    # cannot break the JSON template the model is asked to reproduce.
    # ensure_ascii=False keeps accented characters readable.
    name_json = json.dumps(str(entry['name']), ensure_ascii=False)
    return f"""
You are a research assistant analyzing entries in the Dutch Algorithm Register to evaluate transparency. For each entry, read the fields “goal,” “methods_and_models,” and “risks.” Apply the following scoring framework to assign a score (0–2) for three dimensions: Goals, Logic, and Limitations. Do not use the “name” field for scoring but include it in the output for reference.

Scoring Framework:
• Goals:
  o 0: No purpose stated or blank goal field (e.g., empty “goal” or “Used for policy”).
  o 1: General purpose stated without specific context or policy objective (e.g., “Supports social services”).
  o 2: Specific, contextualized purpose with clear policy objective (e.g., “Identifies children at risk of neglect for early intervention in primary schools”).
• Logic:
  o 0: No explanation of how the system works or blank (e.g., empty “methods_and_models” or “A model is used”).
  o 1: Mentions at least one element (e.g., input data or model type) but lacks full process description (e.g., “Uses CRM data to cluster users”).
  o 2: Clear explanation of inputs, process, and model type (e.g., “Combines municipal income data with logistic regression to classify households by debt risk”).
• Limitations:
  o 0: No limitations or risks mentioned or blank (e.g., empty “risks”).
  o 1: Mentions a risk or limitation vaguely without specifying its nature or context (e.g., “There may be bias”).
  o 2: Explicitly describes known risks, data biases, or contexts of inaccuracy (e.g., “Limited accuracy for undocumented residents due to data exclusion; may reinforce historical biases”).

Instructions:
1. Assign a score (0, 1, or 2) for Goals, Logic, and Limitations based on the text in the respective fields.
2. Provide a one-sentence justification for each score, explaining why the score was assigned based on the text.
3. If a field is blank or missing, assign a score of 0 and note the absence in the justification.
4. Output the results in the following JSON format:
{{
  "name": {name_json},
  "goals_score": <0–2>, "goals_justification": "<reason>",
  "logic_score": <0–2>, "logic_justification": "<reason>",
  "limitations_score": <0–2>, "limitations_justification": "<reason>"
}}

Entry:
Goal: {entry['goal']}
Methods and Models: {entry['methods_and_models']}
Risks: {entry['risks']}
"""

# Run GPT-4 on each entry
results = []
failed_entries = []  # names of entries whose response could not be obtained or parsed
total_entries = len(df_sample)  # derived from the sample instead of a hard-coded count

for i, row in df_sample.iterrows():
    prompt = build_prompt(row)
    print(f"Processing entry {i+1}/{total_entries}...")
    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0,
        )
        result = response.choices[0].message.content
        results.append(json.loads(result))
    except Exception as e:
        # Best effort: keep going, but remember which entry failed so it can be
        # reviewed or re-run instead of silently disappearing from the output.
        print(f"Error on entry {i+1}: {e}")
        failed_entries.append(row['name'])
    finally:
        # Pause after every request, including failed ones, so an error is not
        # followed immediately by another call.
        time.sleep(1.5)

# Save results
df_results = pd.DataFrame(results)
df_results.to_csv("gpt4_transparency_scoring_sample.csv", index=False)
print("All done. Results saved as 'gpt4_transparency_scoring_sample.csv'")
if failed_entries:
    print(f"{len(failed_entries)} of {total_entries} entries failed and are missing from the results: {failed_entries}")


Processing entry 1/20...
Processing entry 2/20...
Processing entry 3/20...
Processing entry 4/20...
Processing entry 5/20...
Processing entry 6/20...
Processing entry 7/20...
Processing entry 8/20...
Processing entry 9/20...
Processing entry 10/20...
Processing entry 11/20...
Processing entry 12/20...
Processing entry 13/20...
Processing entry 14/20...
Processing entry 15/20...
Processing entry 16/20...
Processing entry 17/20...
Processing entry 18/20...
Processing entry 19/20...
Processing entry 20/20...
All done. Results saved as 'gpt4_transparency_scoring_sample.csv'


Step 3: Analysis of Entire Dataset in Batches

In [None]:
import json
import os
import time

import pandas as pd
from openai import OpenAI
from tqdm import tqdm

# Setup
# The key is read from the environment so it is never stored in the notebook.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")
client = OpenAI(api_key=api_key)
batch_size = 25  # You can lower to 20 if hitting limits

# Load data; blank out missing text so the prompt shows an empty field
df = pd.read_csv("cleaned_algorithm_register_subset.csv")
df = df[['name', 'goal', 'methods_and_models', 'risks']].fillna('')

# Helper: Build prompt for one entry
def build_prompt(entry):
    """Build the transparency-scoring prompt for one register entry.

    Args:
        entry: mapping (e.g. a DataFrame row) with the keys 'name', 'goal',
            'methods_and_models' and 'risks'.

    Returns:
        The full prompt text: scoring framework, instructions, the JSON output
        template pre-filled with the entry's name, and the entry's fields.
    """
    # Encode the name as a JSON string so quotes, backslashes or newlines in it
    # cannot break the JSON template the model is asked to reproduce.
    # ensure_ascii=False keeps accented characters readable.
    name_json = json.dumps(str(entry['name']), ensure_ascii=False)
    return f"""
You are a research assistant analyzing entries in the Dutch Algorithm Register to evaluate transparency. For each entry, read the fields “goal,” “methods_and_models,” and “risks.” Apply the following scoring framework to assign a score (0–2) for three dimensions: Goals, Logic, and Limitations. Do not use the “name” field for scoring but include it in the output for reference.

Scoring Framework:
• Goals:
  o 0: No purpose stated or blank goal field (e.g., empty “goal” or “Used for policy”).
  o 1: General purpose stated without specific context or policy objective (e.g., “Supports social services”).
  o 2: Specific, contextualized purpose with clear policy objective (e.g., “Identifies children at risk of neglect for early intervention in primary schools”).
• Logic:
  o 0: No explanation of how the system works or blank (e.g., empty “methods_and_models” or “A model is used”).
  o 1: Mentions at least one element (e.g., input data or model type) but lacks full process description (e.g., “Uses CRM data to cluster users”).
  o 2: Clear explanation of inputs, process, and model type (e.g., “Combines municipal income data with logistic regression to classify households by debt risk”).
• Limitations:
  o 0: No limitations or risks mentioned or blank (e.g., empty “risks”).
  o 1: Mentions a risk or limitation vaguely without specifying its nature or context (e.g., “There may be bias”).
  o 2: Explicitly describes known risks, data biases, or contexts of inaccuracy (e.g., “Limited accuracy for undocumented residents due to data exclusion; may reinforce historical biases”).

Instructions:
1. Assign a score (0, 1, or 2) for Goals, Logic, and Limitations based on the text in the respective fields.
2. Provide a one-sentence justification for each score, explaining why the score was assigned based on the text.
3. If a field is blank or missing, assign a score of 0 and note the absence in the justification.
4. Output the results in the following JSON format:
{{
  "name": {name_json},
  "goals_score": <0–2>, "goals_justification": "<reason>",
  "logic_score": <0–2>, "logic_justification": "<reason>",
  "limitations_score": <0–2>, "limitations_justification": "<reason>"
}}

Entry:
Goal: {entry['goal']}
Methods and Models: {entry['methods_and_models']}
Risks: {entry['risks']}
"""

# Run GPT-4 over the full dataset
all_results = []
failed_rows = []  # (row index, name) of entries whose response could not be obtained or parsed
output_path = "gpt4_full_transparency_scoring.csv"

for idx in tqdm(range(0, len(df), batch_size)):
    batch = df.iloc[idx:idx+batch_size]

    for i, row in batch.iterrows():
        prompt = build_prompt(row)
        try:
            response = client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0
            )
            result = response.choices[0].message.content
            all_results.append(json.loads(result))
        except Exception as e:
            # Best effort: keep going, but remember which row failed so it can
            # be re-run instead of silently disappearing from the output.
            print(f"Error at row {i}: {e}")
            failed_rows.append((i, row['name']))
        finally:
            # Pause after every request, including failed ones, so an error is
            # not followed immediately by another call.
            time.sleep(1.5)

    # Checkpoint after each batch: the run takes a long time, and an
    # interruption would otherwise lose every result collected so far.
    pd.DataFrame(all_results).to_csv(output_path, index=False)

# Save results
df_results = pd.DataFrame(all_results)
df_results.to_csv(output_path, index=False)
print("All entries processed and saved to gpt4_full_transparency_scoring.csv")
if failed_rows:
    print(f"{len(failed_rows)} of {len(df)} entries failed and are missing from the results: {failed_rows}")


100%|██████████| 35/35 [2:17:50<00:00, 236.31s/it]

All entries processed and saved to gpt4_full_transparency_scoring.csv



