In [1]:
import pandas as pd
import sys
import random
import json
from jinja2 import Environment, FileSystemLoader
from dotenv import load_dotenv
import os
import litellm


In [26]:
# Run configuration.
# MODEL is the model identifier handed to litellm.completion for every question.
# LANGUAGE selects both the prompt template (templates/ask-question-<LANGUAGE>.txt)
# and the question file (data/batik-v1-<LANGUAGE>.csv).
# NOTE(review): the outputs recorded in this notebook are Turkish, which suggests
# the last run used "tr" rather than the "en" shown here -- re-run to confirm.
MODEL = "gpt-4o"
LANGUAGE = "en" # Change to "tr" or "ar" for Turkish or Arabic

In [27]:
# Presumably supplies the provider API key that litellm reads from the
# environment -- TODO confirm which variables the .env file must define.
load_dotenv()  # take environment variables from .env.


True

In [28]:
# Jinja2 environment that loads prompt templates from the local 'templates' directory
# (the path is relative to the notebook's working directory).
tenv = Environment(loader=FileSystemLoader('templates'))

In [29]:
# Question prompt template for the configured LANGUAGE; rendered once per question
# by answer_question().
q_temp = tenv.get_template(f'ask-question-{LANGUAGE}.txt')

In [30]:
# Load the question set for the configured LANGUAGE. Later cells read the
# 'question', 'options' and 'correct' columns from this frame.
df = pd.read_csv(f'data/batik-v1-{LANGUAGE}.csv')
# NOTE(review): `cache` is never read or written by any cell visible in this
# notebook -- either wire it into answer_question() or remove it.
cache  = {}

In [31]:
# Preview the prompt scaffold: render with an empty question and no options
# to see the fixed instruction text the model will receive.
q_temp.render(question="")

'Aşağıdaki soruya en iyi cevabı seçin:\n\n\n\nseçenekler listesinden (her satırda bir seçenek)\n\n\n\nCevabınızı json formatında, tek bir alanla "answer" olarak çıkartın \nve yukarıdaki sorunun cevabını belirtin.'

In [32]:
def extract_prediction(row):
    """Pull the ``answer`` field out of a model reply and return it as a string.

    The reply text is read from ``row['json_prediction']``. It may wrap the JSON
    object in Markdown code fences or surrounding prose, so parsing starts at the
    first ``{`` and consumes exactly one JSON object from there.

    Args:
        row: Mapping-like row whose ``'json_prediction'`` entry is the reply text.

    Returns:
        ``str(answer)`` -- numeric answers such as ``6`` come back as ``'6'``.

    Raises:
        ValueError: If the reply contains no ``{`` or the text starting there is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
        KeyError: If the decoded object has no ``'answer'`` field.
    """
    raw = row['json_prediction']
    start = raw.find('{')
    if start == -1:
        raise ValueError(f'No JSON object found in model reply: {raw!r}')
    # raw_decode parses a single JSON value and ignores whatever follows it
    # (closing code fence, trailing prose). Unlike blanket str.replace/split
    # clean-up, it leaves the text inside the object alone, so an answer that
    # itself contains "json", backticks or braces is returned intact.
    raw_dict, end = json.JSONDecoder().raw_decode(raw, start)
    print('raw is', raw[start:end])
    return str(raw_dict['answer'])

In [33]:
def is_correct(row):
    """Return True when the predicted answer equals the reference answer.

    Both sides are compared after trimming surrounding whitespace and
    lower-casing; every 'Ayah' substring is removed from the reference first.
    """
    guess = row['predicted'].strip().lower()
    reference = row['correct'].replace('Ayah', '')
    return reference.strip().lower() == guess

In [34]:
def answer_question(model, question):
    """Ask `model` one multiple-choice question and return the reply message.

    `question` is a row-like mapping with a 'question' text and an 'options'
    string of comma-separated choices. The prompt is rendered from the
    module-level `q_temp` template and sent through `litellm.completion`; the
    message object of the first returned choice is printed and returned.
    """
    print(f'Answering question: {question["question"]}')
    choices = [choice.strip() for choice in question['options'].split(',')]
    chat = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {
            'role': 'user',
            'content': q_temp.render(question=question['question'], options=choices),
        },
    ]
    completion = litellm.completion(
        model=model,
        messages=chat,
        temperature=0.0,
        timeout=30.0,
        num_retries=5,
        metadata={'generation-name': 'ansari'},
        # JSON response mode is currently disabled:
        # response_format={"type": "json_object"},
    )
    reply = completion.choices[0].message
    print(f'Answer: {reply}')
    return reply

In [35]:
# Query the model once per row (one litellm call each) and keep the returned
# message object in a new 'prediction' column. Re-running this cell repeats
# every call.
df['prediction'] = df.apply(lambda x: answer_question(MODEL, x), axis=1)

Answering question: Aşağıdaki harflerden hangisi izhar harflerinden biri değildir?
Answer: Message(content='{\n  "answer": "kaf"\n}', role='assistant', tool_calls=None, function_call=None)
Answering question: Şeddeli bir harften sonra gelen med harfi kaç hareke uzatılır?
Answer: Message(content='{\n  "answer": 6\n}', role='assistant', tool_calls=None, function_call=None)
Answering question: Aşağıdaki harflerden hangisi kalkale harflerinden biri değildir?
Answer: Message(content='{\n  "answer": "cim"\n}', role='assistant', tool_calls=None, function_call=None)
Answering question: Aşağıdaki harflerden hangisi nun harfinden sonra gelen idğam harflerinden biri değildir?
Answer: Message(content='{\n  "answer": "fe"\n}', role='assistant', tool_calls=None, function_call=None)
Answering question: Erkeklerin altın takması İslam'da ne durumdadır?
Answer: Message(content='{\n  "answer": "Haram"\n}', role='assistant', tool_calls=None, function_call=None)
Answering question: Erkeklerin ipek giymesi 

In [36]:
def extract_prediction(row):
    """Pull the ``answer`` field out of a model reply and return it as a string.

    The reply text is read from ``row['prediction'].content``. It may wrap the
    JSON object in Markdown code fences or surrounding prose, so parsing starts
    at the first ``{`` and consumes exactly one JSON object from there.

    Args:
        row: Mapping-like row whose ``'prediction'`` entry exposes the reply
            text as a ``content`` attribute.

    Returns:
        ``str(answer)`` -- numeric answers such as ``6`` come back as ``'6'``.

    Raises:
        ValueError: If the reply contains no ``{`` or the text starting there is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
        KeyError: If the decoded object has no ``'answer'`` field.
    """
    raw = row['prediction'].content
    start = raw.find('{')
    if start == -1:
        raise ValueError(f'No JSON object found in model reply: {raw!r}')
    # raw_decode parses a single JSON value and ignores whatever follows it
    # (closing code fence, trailing prose). Unlike blanket str.replace/split
    # clean-up, it leaves the text inside the object alone, so an answer that
    # itself contains "json", backticks or braces is returned intact.
    raw_dict, end = json.JSONDecoder().raw_decode(raw, start)
    print('raw is', raw[start:end])
    return str(raw_dict['answer'])

In [37]:
# Reduce each stored reply to its plain-text answer ('predicted' column).
df['predicted'] = df.apply(extract_prediction, axis=1)

raw is {
  "answer": "kaf"
}
raw is {
  "answer": 6
}
raw is {
  "answer": "cim"
}
raw is {
  "answer": "fe"
}
raw is {
  "answer": "Haram"
}
raw is {
  "answer": "Haram"
}
raw is {
  "answer": "Haram"
}
raw is {
  "answer": "Haram"
}
raw is {
  "answer": "Haram"
}
raw is {
  "answer": "Hanefi"
}
raw is {
  "answer": "Cin"
}
raw is {
  "answer": "MS 570"
}
raw is {
  "answer": "Fil Yılı"
}
raw is {
  "answer": "Hatice"
}
raw is {
  "answer": "Akşam"
}
raw is {
  "answer": "10000 dolar"
}
raw is {
  "answer": "Kader Tevhidi"
}
raw is {
  "answer": "2:255"
}
raw is {
  "answer": "11"
}
raw is {
  "answer": "6236"
}
raw is {
  "answer": "Safer"
}
raw is {
  "answer": "2:282"
}
raw is {
  "answer": "2:282"
}
raw is {
  "answer": "25"
}
raw is {
  "answer": "İbrahim"
}
raw is {
  "answer": "ba-da-la"
}
raw is {
  "answer": "Yesrib"
}
raw is {
  "answer": "Kureyş"
}
raw is {
  "answer": "Sakif"
}
raw is {
  "answer": "Sümeyye bint Hayyat"
}
raw is {
  "answer": "Malik"
}
raw is {
  "answer":

In [38]:
def is_correct(row):
    """Tell whether a row's extracted prediction matches its reference answer.

    The comparison ignores case and surrounding whitespace, and drops any
    'Ayah' substring from the reference answer before comparing.
    """
    predicted_text = row['predicted']
    reference_text = row['correct']
    normalised_prediction = predicted_text.strip().lower()
    normalised_reference = reference_text.replace('Ayah', '').strip().lower()
    return normalised_reference == normalised_prediction

In [39]:
# Score every row: True where the extracted answer matches the reference.
df['correct_prediction'] = df.apply(is_correct, axis=1)

In [40]:
# Tally of correct (True) versus incorrect (False) answers.
df['correct_prediction'].value_counts()

correct_prediction
True     99
False     1
Name: count, dtype: int64

In [41]:
# The mean of the True/False column is the fraction of questions answered correctly.
print(f'Model {MODEL} accuracy: {df["correct_prediction"].mean()}')

Model gpt-4o accuracy: 0.99


In [44]:
# Keep only the rows the model got wrong, with the columns needed to inspect
# each miss. A negated boolean mask in a single .loc call replaces the
# `== False` comparison followed by a second, chained [] selection.
# Assumes 'correct_prediction' has bool dtype, as produced by is_correct().
incorrect_answers = df.loc[~df['correct_prediction'], ['question', 'options', 'correct', 'predicted']]

In [47]:
# Show just the text of the questions that were answered incorrectly.
incorrect_answers['question'].values

array(['Aşağıdaki harflerden hangisi kalkale harflerinden biri değildir?'],
      dtype=object)