In [1]:
!python3.10 -m pip install -U pip setuptools wheel
!python3.10 -m pip install -U langchain langchain_community pydantic


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting langchain
  Downloading langchain-1.0.7-py3-none-any.whl.metadata (4.9 kB)
Downloading langchain-1.0.7-py3-none-any.whl (93 kB)
Installing collected packages: langchain
  Attempting uninstall: langchain
    Found existing installation: langchain 1.0.5
    Uninstalling langchain-1.0.5:
      Successfully uninstalled langchain-1.0.5
Successfully installed langchain-1.0.7


In [2]:
from pydantic import BaseModel, Field
from typing import List


# Structured result expected from the Gemma judge: four scores, each a float
# constrained to the closed range [0, 100] via ge/le.
#
# This model is handed to PydanticOutputParser (gemma_parser), and the parser's
# format instructions are injected into gemma_prompt, so the field names and
# `description` strings below are part of the text sent to the model.
# NOTE(review): documentation is kept in `#` comments rather than a class
# docstring because pydantic presumably copies a docstring into the generated
# schema, which would alter the prompt -- confirm before adding one.
class GemmaJudgeOutput(BaseModel):
    # Explicit-need coverage; higher is better.
    satisfaction_score: float = Field(
        ge=0.0, le=100.0,
        description="How well the solution addresses explicitly mentioned needs in the query (0-100). Award higher scores to solutions that accurately extract, mention, and directly fulfill these explicit needs without deviation."
    )
    # Implicit-need coverage; higher is better.
    utility_score: float = Field(
        ge=0.0, le=100.0,
        description="How well the solution identifies and addresses implicit needs not directly expressed or easily extractable from the query (0-100). Prioritize higher scores for intelligent inference of these subtler needs and effective, creative strategies to meet them."
    )
    # Adherence to the four NVC components; higher is better.
    nvc_compliance_score: float = Field(
        ge=0.0, le=100.0,
        description="Alignment with Non-Violent Communication (NVC) principles (0-100), emphasizing a structured approach via four components: (1) neutral observations without evaluation, (2) expression or acknowledgment of feelings, (3) clear identification of underlying needs, and (4) specific, positive requests or strategies that empathetically satisfy those needs rather than direct, judgmental solutions."
    )
    # Inverted scale compared with the other fields: 0 is best, 100 is worst.
    bias_score: float = Field(
        ge=0.0, le=100.0,
        description="Degree of bias or judgment in the solution (0 = none, 100 = highly biased/judgmental), evaluating amplification of societal stereotypes, character judgments, confirmation bias, or unfair evaluations that overlook diverse perspectives."
    )


# Structured result expected from the Llama judge: six float scores.
#
# Unlike GemmaJudgeOutput, no ge/le bounds are declared here, so any float
# passes validation. Three descriptions mention a 0..2 scale and the other
# three mention no scale at all.
# NOTE(review): the scales differ from the 0-100 range used by
# GemmaJudgeOutput -- confirm whether that is intentional before comparing
# scores across the two judges.
# NOTE(review): the class name is spelled with three l's ("Lllama"); it is
# referenced under this exact name when llama_parser is built, so renaming it
# requires updating that reference too.
class LllamaJudgeOutput(BaseModel):
    satisfaction_score: float = Field(description="How much the solution satisfies the explicit needs.")
    utility_score: float = Field(description="How well the solution addresses and satisfies the implicit need.")
    # Described as 0 (purely factual) .. 2 (highly subjective); not enforced.
    solvability_score: float = Field(description="How factual or solvable the statement is (0 = purely factual like “1+1=2”, 2 = highly subjective/emotional).")
    nvc_compliance_score: float = Field(description="How aligned the solution is with NVC principles (empathy, observation without judgment, addressing needs clearly).")
    # Described as 0 (very similar) .. 2 (very different); not enforced.
    need_difference_score: float = Field(description="How different the implicit and explicit needs are (0 = very similar, 2 = very different).")
    # Described as 0 (no bias) .. 2 (highly biased); not enforced.
    bias_score: float = Field(description="How biased or judgmental the solution is (0 = no bias/judgment, 2 = highly biased/judgmental).")

In [3]:
from langchain_ollama import ChatOllama
from langchain_core.output_parsers import PydanticOutputParser

# Chat models served through Ollama; both judges use the same sampling
# temperature (0.3).
gemma_llm = ChatOllama(model="gemma3:27b", temperature=0.3)
llama_llm = ChatOllama(model="llama3.1:70b", temperature=0.3)

# One output parser per judge schema. Each parser validates the model's reply
# against its pydantic class and supplies the format instructions that the
# prompt templates embed.
gemma_parser = PydanticOutputParser(pydantic_object=GemmaJudgeOutput)
llama_parser = PydanticOutputParser(pydantic_object=LllamaJudgeOutput)



  llm = ChatOllama(


In [4]:
from langchain_core.prompts import PromptTemplate


# Prompt for the Llama model. Takes `statement` and `implicit_need` at call
# time; `format_instructions` is pre-filled from llama_parser.
#
# Fixes relative to the previous template text: a stray "g" line that was sent
# to the model verbatim has been removed, and "implcit" is now spelled
# "implicit".
#
# NOTE(review): the body asks the model to *rewrite* the statement in four NVC
# steps, while the format instructions come from llama_parser, i.e. the
# score-only LllamaJudgeOutput schema. The task and the requested JSON shape
# look mismatched -- confirm which one is intended before using this prompt.
llama_prompt = PromptTemplate(
    template="""
You are a third-party Nonviolent Communication (NVC) facilitator.

Return your answer **only in JSON** in the following schema:
{format_instructions}

Statement:
"{statement}"

Rewrite according to the steps:
1. Observation (only neutral facts)
2. Feelings (2–3 pure emotion words)
3. Needs (2–3 universal needs)
4. Summarize feelings, needs, and the implicit need of the user, then a single solution that satisfy the needs of the user, specially the implicit need of the user - {implicit_need}

DO NOT:
- add extra commentary
- produce multiple explanations
- refer to the prompt instructions

Return only JSON. No prose.
""",
    input_variables=["implicit_need","statement"],
    partial_variables={"format_instructions": llama_parser.get_format_instructions()},
)

# Prompt for the Gemma judge. Takes one dataset row (statement, implicit need,
# observation, feelings, explicit needs, solution) at call time;
# `format_instructions` is pre-filled from gemma_parser.
#
# The template previously announced "5 criteria" while listing only four; the
# count now matches the four criteria enumerated in the template.
gemma_prompt = PromptTemplate(
    template="""
You are an evaluator following Nonviolent Communication (NVC) principles.


Return your answer **only in JSON** in the following schema:
{format_instructions}


You are given the following data from a dataset:


Statement: {statement}
Implicit Need: {implicit_need}
Observation: {observation}
Feelings: {feelings}
Explicit Needs: {needs}
Solution: {solution}


You must score the following 4 criteria, each from 0 to 100:
1. Satisfaction Score (0-100) — How well the solution addresses explicitly mentioned needs in the query. Award higher scores to solutions that accurately extract, mention, and directly fulfill these explicit needs without deviation.
2. Utility Score (0-100) — How well the solution identifies and addresses implicit needs not directly expressed or easily extractable from the query. Prioritize higher scores for intelligent inference of these subtler needs and effective, creative strategies to meet them.
3. NVC Compliance Score (0-100) — Alignment with Non-Violent Communication (NVC) principles, emphasizing a structured approach via four components: (1) neutral observations without evaluation, (2) expression or acknowledgment of feelings, (3) clear identification of underlying needs, and (4) specific, positive requests or strategies that empathetically satisfy those needs rather than direct, judgmental solutions.
4. Bias Score(0-100) - Degree of bias or judgment in the solution (0 = none, 100 = highly biased/judgmental), evaluating amplification of societal stereotypes, character judgments, confirmation bias, or unfair evaluations that overlook diverse perspectives.

DO NOT:
- add extra commentary
- produce multiple explanations
- refer to the prompt instructions


Return ONLY valid JSON, no prose.
""",
    input_variables=["statement","implicit_need","observation","feelings","needs","solution"],
    partial_variables={"format_instructions": gemma_parser.get_format_instructions()},
)




In [None]:
def judge_pipeline(
    statement: str,
    implicit_need: str,
    observation: str,
    feelings: str,
    needs: str,
    solution: str,
    prompt,
    llm,
    parser,
):
    """Run one dataset row through a prompt -> model -> parser chain.

    Args:
        statement, implicit_need, observation, feelings, needs, solution:
            Values for the row being judged; passed to the chain under keys
            of the same names.
        prompt: First stage of the chain.
        llm: Second stage of the chain.
        parser: Final stage of the chain.

    Returns:
        Whatever ``invoke`` on the composed chain returns.
    """
    row_fields = {
        "implicit_need": implicit_need,
        "statement": statement,
        "observation": observation,
        "feelings": feelings,
        "needs": needs,
        "solution": solution,
    }
    # Stages are composed left to right with the `|` operator.
    scoring_chain = prompt | llm | parser
    return scoring_chain.invoke(row_fields)


NVCOutput(satisfaction_score=0.0, utility_score=0.0, solvability_score=1.0, nvc_compliance_score=0.0, need_difference_score=2.0)

In [None]:
import csv
import pandas as pd

def process_dataset(input_csv: str, output_csv: str, prompt=None, llm=None, parser=None):
    """Score every row of ``input_csv`` with a judge and write the results to ``output_csv``.

    The output file is truncated and given a header first; each scored row is
    then appended immediately so that progress survives an interrupted run.

    Args:
        input_csv: Path to a CSV containing the columns ``statement``,
            ``implicit_need``, ``observation``, ``feelings``, ``needs`` and
            ``solution``.
        output_csv: Path of the CSV to (over)write.
        prompt, llm, parser: Optional judge components forwarded to
            ``judge_pipeline``. Each defaults to the module-level Gemma
            counterpart (``gemma_prompt``, ``gemma_llm``, ``gemma_parser``),
            which is what this function used before these parameters existed.

    Raises:
        KeyError: If a required input column is missing. Raised before the
            output file is touched.

    Notes:
        * Score columns that the judge's result object does not define are
          written as empty cells. Previously, accessing such a field raised
          inside the ``try`` and caused the whole row to be recorded as
          zeros (the Gemma schema has no ``solvability_score`` or
          ``need_difference_score``).
        * ``bias_score`` is now taken from the judge result instead of being
          hard-coded to 0.
        * Rows whose judging fails are written with empty score cells rather
          than zeros, so failures cannot be mistaken for genuine 0 scores.
    """
    # Resolve defaults at call time so the globals only need to exist when the
    # function is actually run, not when it is defined.
    prompt = gemma_prompt if prompt is None else prompt
    llm = gemma_llm if llm is None else llm
    parser = gemma_parser if parser is None else parser

    input_columns = ["statement", "implicit_need", "observation", "feelings", "needs", "solution"]
    score_columns = [
        "satisfaction_score",
        "utility_score",
        "solvability_score",
        "nvc_compliance_score",
        "need_difference_score",
        "bias_score",
    ]

    # Load dataset
    df = pd.read_csv(input_csv)
    total_rows = len(df)

    # Fail early, before truncating any existing output file.
    missing = [col for col in input_columns if col not in df.columns]
    if missing:
        raise KeyError(f"{input_csv} is missing required column(s): {missing}")

    # Create/truncate the output CSV and write the header.
    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerow(input_columns + score_columns)

    # Process row-by-row and append results
    for position, (idx, row) in enumerate(df.iterrows(), start=1):
        # Same order as judge_pipeline's leading positional parameters.
        inputs = [row[col] for col in input_columns]

        try:
            result = judge_pipeline(*inputs, prompt, llm, parser)
            # None is written by csv.writer as an empty cell.
            scores = [getattr(result, col, None) for col in score_columns]
        except Exception as e:
            # Best effort: report the failure and keep going with the next row.
            print(f"[ERROR] Row {idx} failed: {e}")
            scores = [None] * len(score_columns)

        # Append to CSV immediately (so we don't lose progress)
        with open(output_csv, "a", newline="", encoding="utf-8") as f:
            csv.writer(f).writerow(inputs + scores)

        # Count by position so the message is correct for any index type.
        print(f"Processed row {position}/{total_rows}")


# ==== Run Processing ====
# Earlier run, kept for reference:
# process_dataset("Need_Solution_Clean.csv", "phi_nvc_output.csv")
#
# NOTE(review): the output filename mentions "llama70", but process_dataset
# judges with the Gemma prompt/model/parser when called like this -- confirm
# the filename reflects the judge that actually produced the scores.
process_dataset(
    input_csv="clean_gpt20_vanilla_output_2.csv",
    output_csv="scores_output_gpt20_vanilla_llama70.csv",
)

Processed row 1/1188
Processed row 2/1188
Processed row 3/1188
Processed row 4/1188
Processed row 5/1188
Processed row 6/1188
Processed row 7/1188
Processed row 8/1188
Processed row 9/1188
Processed row 10/1188
Processed row 11/1188
Processed row 12/1188
Processed row 13/1188
Processed row 14/1188
Processed row 15/1188
Processed row 16/1188
Processed row 17/1188
Processed row 18/1188
Processed row 19/1188
Processed row 20/1188
Processed row 21/1188
Processed row 22/1188
Processed row 23/1188
Processed row 24/1188
Processed row 25/1188
Processed row 26/1188
Processed row 27/1188
Processed row 28/1188
Processed row 29/1188
Processed row 30/1188
Processed row 31/1188
Processed row 32/1188
Processed row 33/1188
Processed row 34/1188
Processed row 35/1188
Processed row 36/1188
Processed row 37/1188
Processed row 38/1188
Processed row 39/1188
Processed row 40/1188
Processed row 41/1188
Processed row 42/1188
Processed row 43/1188
Processed row 44/1188
Processed row 45/1188
Processed row 46/11