In [1]:
%%writefile requirements.txt
datasets==2.20.0
pandas==2.2.2
mistralai==0.4.1
python-dotenv==1.0.1


Overwriting requirements.txt


In [2]:
!python -m pip install -r requirements.txt



In [3]:
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
from datasets import load_dataset

mt_bench_data = load_dataset("m-a-p/MusicTheoryBench")
mt_bench_dev = mt_bench_data["dev"].to_pandas()
mt_bench_test = mt_bench_data["test"].to_pandas()

print(f"Number of rows in eval dataset is {len(mt_bench_test)}")
mt_bench_dev.head()


  from .autonotebook import tqdm as notebook_tqdm


Number of rows in eval dataset is 367


Unnamed: 0,id,instruction,stem,options,subject,answer,split,abc_score,analysis
0,13,Read the following questions from the four opt...,Use the example below to answer the question t...,"{'A': 'Major seventh in third inversion', 'B':...",knowledge,A,dev,L:1/4\nM:4/4\nK:D\n [FGBd]4 |] %1,The chord in the given example has G as its ro...
1,41,Read the following questions from the four opt...,Use the example below to answer the question t...,"{'A': 'C-sharp', 'B': 'E-sharp', 'C': 'A-sharp...",knowledge,C,dev,L:1/4\nM:4/4\nK:E\n ^a4 |] %1,"In the treble clef, the lines from bottom to t..."
2,47,Read the following questions from the four opt...,Use the example below to answer the question t...,"{'A': 'B-flat', 'B': 'D', 'C': 'B', 'D': 'D-fl...",knowledge,D,dev,"L:1/4\nM:4/4\nK:Cb\n D,4 |] %1",The note in the example is D-flat. The key sig...
3,88,Read the following questions from the four opt...,Use the example below to answer the question t...,"{'A': 'ii 6 / 4 – V – vi 6 - iii ', 'B': 'I 6 ...",knowledge,B,dev,"L:1/4\nM:4/4\nK:E\n [G,B,E] [A,CE] [F,B,D] [F,...",The chords in the example are: E major in firs...
4,100,Read the following questions from the four opt...,Use the example below to answer the question t...,"{'A': 'viio', 'B': 'V', 'C': 'ii', 'D': 'iv'}",knowledge,A,dev,L:1/4\nM:4/4\nK:F#\n [EGB]4 |] %1,The chord in the given example is an E# fully ...


In [5]:
mt_bench_dev.fillna("", inplace=True)
mt_bench_test.fillna("", inplace=True)

In [6]:
def get_options_text(options_dict):
  text = ""
  for i, value in enumerate(options_dict.values()):
    text += f"{i+1}: {value}\n"
  return text

mt_bench_dev["options_text"] = mt_bench_dev["options"].apply(get_options_text)
mt_bench_test["options_text"] = mt_bench_test["options"].apply(get_options_text)

print(mt_bench_dev["options_text"][0])

1: Major seventh in third inversion
2: Dominant seventh in second inversion
3: Major/minor seventh in third inversion
4: Minor seventh in second inversion



In [7]:
output_tag = """
**Additional Instruction:**
1. Strictly adhere to the output format. No yapping, no chit-chat, just the output format.
2. Strict use double quotes for the keys in the output JSON.

**Output Format:**
{
  "analysis": <The explanation goes here, typeLstr>,
  "answer": <index of the correct option, type:int>
}
"""

mt_bench_dev["input_text"] = "**Instruction:**\n" + mt_bench_dev["instruction"] + "\n\n**ABC Score:**\n" + mt_bench_dev["abc_score"] + "\n\n**Question:**\n" + mt_bench_dev["stem"] + "\n\n**Options**\n" + mt_bench_dev["options_text"] + "\n" + output_tag
mt_bench_test["input_text"] = "**Instruction:**\n" + mt_bench_test["instruction"] + "\n\n**ABC Score:**\n" + mt_bench_test["abc_score"] + "\n\n**Question:**\n" + mt_bench_test["stem"] + "\n\n**Options**\n" + mt_bench_test["options_text"] + "\n" + output_tag

print(mt_bench_dev["input_text"][0])

**Instruction:**
Read the following questions from the four options (A, B, C and D) given in each question. Choose the best option

**ABC Score:**
L:1/4
M:4/4
K:D
 [FGBd]4 |] %1

**Question:**
Use the example below to answer the question that follows.
Which of the following best describes the seventh chord in the above example?

**Options**
1: Major seventh in third inversion
2: Dominant seventh in second inversion
3: Major/minor seventh in third inversion
4: Minor seventh in second inversion


**Additional Instruction:**
1. Strictly adhere to the output format. No yapping, no chit-chat, just the output format.
2. Strict use double quotes for the keys in the output JSON.

**Output Format:**
{
  "analysis": <The explanation goes here, typeLstr>,
  "answer": <index of the correct option, type:int>
}



In [8]:
import os
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

api_key = os.environ["MISTRAL_API_KEY"]
model = "open-mistral-7b"

client = MistralClient(api_key=api_key)

chat_response = client.chat(
    model=model,
    messages=[ChatMessage(role="user", content=mt_bench_dev["input_text"][0])]
)

output_text = chat_response.choices[0].message.content
print(output_text)

{
  "analysis": "The ABC notation provided represents a chord with four notes: F, A, C, and Bd (flat). This chord is in the key of F major, and the seventh note is Bd (flat). Since the chord is in the F major key, it is a dominant seventh chord. In a dominant seventh chord, the seventh note is a minor seventh above the root. In this case, Bd (flat) is a minor seventh above F. The chord is also in third inversion because the third (A) is the highest note. However, the specific inversion is determined by the bass note, which in this case is not given. But since A is the third note in the chord, the chord is in third inversion relative to its root (F).",
  "answer": 4
}


In [9]:
import json
output_text = output_text.replace("`", "")
json_resp = json.loads(output_text)
print(json_resp["answer"])

4


In [10]:
def generate_answer(input_text):

    chat_response = client.chat(
        model=model,
        messages=[ChatMessage(role="user", content=input_text)]
    )

    output_text = chat_response.choices[0].message.content
    
    # Replace ' with " to convert the output to a valid json format
    try:    
        json_resp = json.loads(output_text)
        answer, analysis = json_resp["answer"], json_resp["analysis"]
    except Exception as e:
        print(f"Model output: {output_text}")
        print("Failed due to error: ", e)
        answer, analysis = -1, "Failed due to error"
            
    return answer, analysis

In [11]:
mt_bench_dev_pred = []
mt_bench_dev_analysis = []

for input_text in mt_bench_dev["input_text"]:
    answer, analysis = generate_answer(input_text)
    mt_bench_dev_pred.append(answer)
    mt_bench_dev_analysis.append(analysis)

mt_bench_dev["prediction"] = mt_bench_dev_pred
mt_bench_dev["analysis"] = mt_bench_dev_analysis

In [12]:
from tqdm.auto import tqdm

mt_bench_test_pred = []
mt_bench_test_analysis = []
   e3nnn    
for input_text in tqdm(mt_bench_test["input_text"]):
    answer, analysis = generate_answer(input_text)
    mt_bench_test_pred.append(answer)
    mt_bench_test_analysis.append(analysis)

mt_bench_test["prediction"] = mt_bench_test_pred  
mt_bench_test["analysis"] = mt_bench_test_analysis

 86%|████████▌ | 314/367 [07:05<01:30,  1.70s/it]

Model output: {
  "analysis": "Passing chromatic tones are tones that move from one note to another note that is chromatically adjacent. In this case, to find passing chromatic tones, we should look for notes where the melody moves by a semitone. Let's examine the options provided:",
  "answer": 4
}

Explanation:
1: d^cB moves by a whole tone, not a semitone.
2: F_GE moves by a whole tone, not a semitone.
3: ABc moves by a whole tone, not a semitone.
4: E^DE moves by a semitone (E to D), which makes it a passing chromatic tone.
Failed due to error:  Extra data: line 6 column 1 (char 288)


100%|██████████| 367/367 [08:28<00:00,  1.39s/it]


In [17]:
answer_to_idx = {
    "A": 1,
    "B": 2,
    "C": 3,
    "D": 4,
}
mt_bench_dev["Answer_Idx"] = mt_bench_dev["answer"].map(answer_to_idx)
mt_bench_test["Answer_Idx"] = mt_bench_test["answer"].map(answer_to_idx)

In [18]:
dev_accuracy = 100*(mt_bench_dev["prediction"] == mt_bench_dev["Answer_Idx"]).sum() / len(mt_bench_dev)
print(f"Accuracy on the dev set is {dev_accuracy:.2f}%")

accuracy = 100*(mt_bench_test["prediction"] == mt_bench_test["Answer_Idx"]).sum() / len(mt_bench_test)
print(f"Accuracy on the test set is {accuracy:.2f}%")

Accuracy on the dev set is 20.00%
Accuracy on the test set is 43.60%


In [16]:
# from tqdm.auto import tqdm

# def get_accuracy_on_dataset(dataset):
#     options_text = []
#     for options_dict in dataset["options"]:
#         text = ""
#         for i, value in enumerate(options_dict.values()):
#             text += f"{i+1}: {value}\n"
#         options_text.append(text)
#     dataset["options_text"] = options_text

#     dataset["input_text"] = "**Instruction:**\n" + dataset["instruction"] + "\n\n**ABC Score:**\n" + dataset["abc_score"] + "\n\n**Question:**\n" + dataset["stem"] + "\n\n**Options**\n" + dataset["options_text"] + "\n" + output_tag
    
#     dataset_pred = []
#     dataset_analysis = []
#     for input_text in tqdm(dataset["input_text"]):
#         answer, analysis = generate_answer(input_text)
#         dataset_pred.append(answer)
#         dataset_analysis.append(analysis)

#     dataset["prediction"] = dataset_pred
#     dataset["analysis"] = dataset_analysis

#     dataset["Answer_Idx"] = dataset["answer"].map(answer_to_idx)
#     accuracy = 100*(dataset["prediction"] == dataset["Answer_Idx"]).sum() / len(dataset)

#     return accuracy

# test_dataset = mt_bench_test.copy()
# accuracy = get_accuracy_on_dataset(test_dataset)
# print(f"Accuracy on the test set is {accuracy:.2f}%")