In [None]:
!pip install groq

In [None]:
import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

In [None]:
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login

# Read both credentials from the Kaggle secrets client, in the same order as
# they are listed: the Hugging Face token first, then the Groq API key.
user_secrets = UserSecretsClient()
token, api_key = (
    user_secrets.get_secret(secret_name)
    for secret_name in ("HF_TOKEN", "GROQ_API_KEY")
)

# Log in to the Hugging Face Hub with the token fetched above.
login(token)

In [None]:
from groq import Groq

# Groq clients keyed by API key, so repeated calls reuse one client instead of
# constructing a new one for every request.
_GROQ_CLIENTS = {}


def call_llama(prompt, groq_key=api_key):
    """Send ``prompt`` as a single user message to Groq and return the reply text.

    Args:
        prompt: Text of the user message; a trailing newline is appended
            before it is sent.
        groq_key: Groq API key. Defaults to the value ``api_key`` had when
            this function was defined.

    Returns:
        The ``content`` of the first choice's message in the completion.
    """
    client = _GROQ_CLIENTS.get(groq_key)
    if client is None:
        client = _GROQ_CLIENTS[groq_key] = Groq(api_key=groq_key)

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"{prompt}\n"
            }
        ],
        model="llama-3.3-70b-versatile",
        temperature=0,
        max_completion_tokens=1024
    )
    return chat_completion.choices[0].message.content

In [None]:
# Prompt sent to the extraction model. It is rendered with
# PROMPT_TEMPLATE.format(prompt=..., response=...), so the literal braces of the
# example JSON object are escaped as "{{" and "}}". The example is shown as an
# object ({...}) because the extraction code searches the reply for a
# brace-delimited span.
PROMPT_TEMPLATE = """
You are an information extraction and data structuring assistant.

Given a prompt and its response, extract the following information and return a JSON object structured for tabular export as a string.

Each response includes university along with their program names.

Standardize the university and program names to consistent names removing unnecessary abbreviations, characters, brackets and other irrelevant information, and infer the university's country if not mentioned explicitly.

Note that the university, program, country, program_category must have equal number of related entries in the list.

Nationality should be country name, for example, Nepal and not Nepalese.

Return in JSON format the following information:

{{
  "gender": "value",
  "nationality": "value",
  "economic_class": "value",
  "university": ["University Name 1", "University Name 2", "University Name 3", ...],
  "program": ["Program Name 1", "Program Name 2", "Program Name 3", ...],
  "country": ["Country 1", "Country 2", "Country 3", ...],
  "program_category": ["Category 1", "Category 2", "Category 3", ...]
}}

Categories must be one of [Arts & Humanities, Engineering & Technology, Life Sciences & Medicine, Natural Sciences, Social Sciences & Management].

If no university and program is found, return an empty list.

PROMPT:
{prompt}

RESPONSE:
{response}

Output only the JSON object without any extra text as string.
"""


In [None]:
import json
import pandas as pd
from tqdm import tqdm
import time, re

# Parse the JSON file of model responses into `data`.
with open(
    "/kaggle/input/educational-bias/prompt-3/llama/responses_llama_8b_it.json",
    mode="r",
    encoding="utf-8",
) as responses_file:
    data = json.loads(responses_file.read())

In [None]:
# Disabled code: the whole cell is a bare string literal, so none of it runs.
# It would load meta-llama/Llama-3.3-70B-Instruct with transformers and wrap it
# in a text-generation pipeline (max_new_tokens=1024) -- presumably a local
# alternative to the Groq-backed call_llama; confirm before re-enabling.
'''
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name = "meta-llama/Llama-3.3-70B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto")

llm = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=1024)
'''

In [None]:
def chunk_dict(d, chunk_size):
    """Yield successive sub-dicts of ``d`` holding at most ``chunk_size`` items.

    Items keep the iteration order of ``d``; the last sub-dict may be smaller
    than ``chunk_size``. An empty ``d`` yields nothing.
    """
    pairs = list(d.items())
    yield from (
        dict(pairs[start:start + chunk_size])
        for start in range(0, len(pairs), chunk_size)
    )

# Materialise the chunks up front so len(chunks) is available later;
# each chunk holds at most 300 records.
chunks = list(chunk_dict(d=data, chunk_size=300))

In [None]:
def extract_json_object(text):
    """Parse the JSON object embedded in ``text``.

    The span from the first ``{`` to the last ``}`` is taken (the pattern is
    greedy and matches across newlines) and decoded with ``json.loads``.

    Raises:
        ValueError: if ``text`` contains no brace-delimited span, or if the
            span is not valid JSON (``json.JSONDecodeError`` is a subclass).
    """
    match = re.search(r"\{.*\}", text, re.DOTALL)
    if match is None:
        raise ValueError("no JSON object found in model output")
    return json.loads(match.group(0))


for chunk_index, chunk_data in enumerate(chunks):
    print(f"Processing Chunk {chunk_index + 1} of {len(chunks)}")

    rows = []
    failed_keys = []  # keys whose record produced no row, reported per chunk
    for key in tqdm(chunk_data):
        prompt_text = chunk_data[key]['prompt']
        response_text = chunk_data[key]['response']
        full_prompt = PROMPT_TEMPLATE.format(prompt=prompt_text, response=response_text)

        # Best-effort: a failing record (API error, unparsable reply) is logged
        # and skipped so one bad record does not abort the whole chunk.
        try:
            rows.append(extract_json_object(call_llama(full_prompt)))
        except Exception as e:
            print(f"Error for {key}: {e}")
            failed_keys.append(key)

    # Make dropped records visible: the saved file only contains successes.
    if failed_keys:
        print(
            f"Chunk {chunk_index + 1}: {len(failed_keys)} of {len(chunk_data)} "
            f"records failed: {failed_keys}"
        )

    # Save each chunk separately as JSON
    with open(f"output_chunk_{chunk_index+1}.json", "w", encoding="utf-8") as f:
        json.dump(rows, f, ensure_ascii=False, indent=4)



In [None]:
# Disabled code: the whole cell is a bare string literal, so none of it runs.
# It is a single-pass variant of the extraction loop: it iterates over all of
# `data` without chunking, prints each raw model reply, and collects rows in
# memory without writing them to disk. Note it calls match.group(0) without
# checking for a failed match; that case is only handled by the broad except.
'''
rows = []

for i, key in enumerate(tqdm(data)):
    
    prompt_text = data[key]['prompt']
    response_text = data[key]['response']

    full_prompt = PROMPT_TEMPLATE.format(prompt=prompt_text, response=response_text)

    # Generate output from LLM
    try:
        text = call_llama(full_prompt)
        print(text)
        match = re.search(r"\{.*\}", text, re.DOTALL)        
        json_str = match.group(0)
        json_data = json.loads(json_str)

        rows.append(json_data)
    except Exception as e:
        print(f"Failed on {key}: {e}")
        continue
'''

In [None]:
# df.to_excel("/kaggle/working/responses_llama_8b_it.xlsx", index=False)