In [39]:
import ollama
from ollama import chat
from ollama import ChatResponse
from pydantic import BaseModel
import json
import pandas as pd

In [40]:
df = pd.read_csv("../data/lisa_sheets.csv")

In [41]:
file_path = "../data/train_test_split/test_folders.json"

In [42]:
with open(file_path, "r", encoding="utf-8") as file:
    test_folders = json.load(file)

In [43]:
df_test = df[df.folder.isin(test_folders)]

In [44]:
class MCQQuestion(BaseModel):
    question: str
    option_a: str
    option_b: str
    option_c: str
    option_d: str
    correct_option: str

In [45]:
from ollama import generate

def generate_mcq(content, model_name, temperature):
    prompt = f"""
    Based on the following educational content, generate a multiple-choice question with four answer 
    options where only one is correct. The question should assess understanding of the main ideas, 
    and the options should be clear, informative, and relevant. Ensure that the distractors (incorrect options) 
    follow a logical but incorrect interpretation, based on common misconceptions or misunderstandings of the topic.
    Answer options must be as short as possible.

    **Educational Content**
    {content}
    """
    
    generate_params = {
        'model': model_name,
        'options': {'temperature': temperature, 'num_ctx': 8192, 'top_p': 1}, 
        'prompt': prompt,
        'format': MCQQuestion.model_json_schema()
    }
    
    # Get a response
    response = generate(**generate_params)
    
    return response['response']

In [46]:
%%time
df_test['generated_questions_0.1'] = df_test['content_gpt'].apply(
    lambda content: generate_mcq(content, model_name="llama3.2:1b-instruct-q8_0", temperature=0.1)
)

CPU times: user 5.07 s, sys: 978 ms, total: 6.05 s
Wall time: 1h 15min 26s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [47]:
%%time
df_test['generated_questions_0.5'] = df_test['content_gpt'].apply(
    lambda content: generate_mcq(content, model_name="llama3.2:1b-instruct-q8_0", temperature=0.5)
)

CPU times: user 4.84 s, sys: 795 ms, total: 5.64 s
Wall time: 6h 17min 55s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [48]:
%%time
df_test['generated_questions_0.7'] = df_test['content_gpt'].apply(
    lambda content: generate_mcq(content, model_name="llama3.2:1b-instruct-q8_0", temperature=0.7)
)

CPU times: user 4.03 s, sys: 519 ms, total: 4.55 s
Wall time: 1h 19min 16s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [49]:
from pydantic import ValidationError

In [50]:
def validate_mcq(mcq_json):
    try:
        return MCQQuestion.model_validate_json(mcq_json)
    except ValidationError as e:
        print(f"Validation failed: {e}")
        return None
        


In [51]:
df_test["validated_mcq_0.7"] = df_test['generated_questions_0.7'].apply(validate_mcq)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["validated_mcq_0.7"] = df_test['generated_questions_0.7'].apply(validate_mcq)


In [52]:
df_test["validated_mcq_0.5"] = df_test['generated_questions_0.5'].apply(validate_mcq)

Validation failed: 1 validation error for MCQQuestion
  Invalid JSON: EOF while parsing a string at line 1 column 1647587 [type=json_invalid, input_value='{"question":"What is the...one replacement therapy', input_type=str]
    For further information visit https://errors.pydantic.dev/2.10/v/json_invalid


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["validated_mcq_0.5"] = df_test['generated_questions_0.5'].apply(validate_mcq)


In [53]:
df_test["validated_mcq_0.1"] = df_test['generated_questions_0.1'].apply(validate_mcq)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["validated_mcq_0.1"] = df_test['generated_questions_0.1'].apply(validate_mcq)


In [54]:
df_test.to_csv('llama1b_mcqs.csv', index=False)

In [58]:
from pandas import DataFrame

def flatten_and_export_mcq(df: DataFrame, export_filename: str, mcq_column_name: str):
    result_df = df[['id']].copy()
    
    result_df['question'] = df[mcq_column_name].apply(lambda x: x.question)
    result_df['option_a'] = df[mcq_column_name].apply(lambda x: x.option_a)
    result_df['option_b'] = df[mcq_column_name].apply(lambda x: x.option_b)
    result_df['option_c'] = df[mcq_column_name].apply(lambda x: x.option_c)
    result_df['option_d'] = df[mcq_column_name].apply(lambda x: x.option_d)
    result_df['correct_option'] = df[mcq_column_name].apply(lambda x: x.correct_option)
    
    result_df.to_csv(export_filename, index=False)

In [68]:
flatten_and_export_mcq(df_test, '../data/base_models/llama1b/temp0.7.csv', 'validated_mcq_0.7')
flatten_and_export_mcq(df_test, '../data/base_models/llama1b/temp0.5.csv', 'validated_mcq_0.5')
flatten_and_export_mcq(df_test, '../data/base_models/llama1b/temp0.1.csv', 'validated_mcq_0.1')

In [67]:
# One-liner version
df_test[df_test['validated_mcq_0.5'].isna()].index.tolist()

[]

In [66]:
empty_mcq = MCQQuestion(
    question="",
    option_a="",
    option_b="",
    option_c="",
    option_d="",
    correct_option=""
)

# Set the value at the specified index
df_test.at[3742, 'validated_mcq_0.5'] = empty_mcq