## Audio Quality Dataset

In [303]:
import pandas as pd
import numpy as np
import json

In [304]:
# Read the reviewed audio-quality CSV (path is relative to the working directory).
csv_path = 'student_audio_quality_review.csv'
df = pd.read_csv(csv_path)

In [305]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Standard,Grade,type,answer_text,questionattemptid,total_rubric_score,Audio Link,Transcription,whisper_words,whisper_segments,whisper_language,Audio Audible,Transcript Acceptable
0,ca,1,speaking,https://summitk12-quiz-audios.s3-accelerate.am...,125420729,0,https://drive.google.com/file/d/1l0sWVznEn_f1o...,The kids are cleaning.,"[{""word"": ""The"", ""start"": 0.62, ""end"": 0.8}, {...","[{""id"": 0, ""seek"": 0, ""start"": 0, ""end"": 3.14,...",Icelandic,Yes,Yes
1,ca,2,speaking,https://summitk12-quiz-audios.s3-accelerate.am...,132724395,0,https://drive.google.com/file/d/1gXqLvabNA9C1P...,. .,"[{""word"": ""."", ""start"": 0, ""end"": 1.08}, {""wor...","[{""id"": 0, ""seek"": 0, ""start"": 0, ""end"": 4.4, ...",English,No,No
2,ca,3,speaking,https://summitk12-quiz-audios.s3-accelerate.am...,154839753,0,https://drive.google.com/file/d/16DS157VCuQikP...,The children are in a forest on a log.,"[{""word"": ""The"", ""start"": 1.86, ""end"": 2.04}, ...","[{""id"": 0, ""seek"": 0, ""start"": 0, ""end"": 7.2, ...",Icelandic,Yes,Yes
3,ca,4,speaking,https://summitk12-quiz-audios.s3-accelerate.am...,95970423,0,https://drive.google.com/file/d/10P9XXQpP93WPz...,Það er mamið. Það er kynnt.,"[{""word"": ""\u00dea\u00f0"", ""start"": 0.02, ""end...","[{""id"": 0, ""seek"": 0, ""start"": 0, ""end"": 1.96,...",Icelandic,No,No
4,ca,5,speaking,https://summitk12-quiz-audios.s3-accelerate.am...,16975931,0,https://drive.google.com/file/d/1FVZwuJm5ks7N2...,It's music class.,"[{""word"": ""It's"", ""start"": 0.76, ""end"": 1.3}, ...","[{""id"": 0, ""seek"": 0, ""start"": 0, ""end"": 2.64,...",Icelandic,Yes,Yes


In [306]:
# Report the frame's dimensions together with its column names.

print(
    f"Rows: {df.shape[0]}",
    f"Columns: {df.shape[1]} - {list(df.columns)}",
    sep="\n",
)

Rows: 1028
Columns: 13 - ['Standard', 'Grade', 'type', 'answer_text', 'questionattemptid', 'total_rubric_score', 'Audio Link', 'Transcription', 'whisper_words', 'whisper_segments', 'whisper_language', 'Audio Audible', 'Transcript Acceptable']


In [307]:
# Count missing values per column.

df.isna().sum()

Standard                   0
Grade                      0
type                       0
answer_text                0
questionattemptid          0
total_rubric_score         0
Audio Link                 0
Transcription              0
whisper_words              0
whisper_segments           0
whisper_language           0
Audio Audible            196
Transcript Acceptable    196
dtype: int64

In [308]:
# Drop all rows where Audio Audible is null.
# .copy() makes the filtered frame an independent object, so the column
# assignments in the following cells cannot trigger SettingWithCopyWarning
# on pandas versions that do not use copy-on-write.

df = df.dropna(subset=['Audio Audible']).copy()

In [309]:
# Encode both reviewer label columns as integers: 'Yes' -> 1, 'No' -> 0.
# Any value outside the mapping becomes NaN (Series.map behaviour).

yes_no_to_int = {'Yes': 1, 'No': 0}
for label_col in ['Audio Audible', 'Transcript Acceptable']:
    df[label_col] = df[label_col].map(yes_no_to_int)

In [310]:
# A row's 'Ground Truth' is 1 only when both encoded labels are 1 (element-wise AND).

audible = df['Audio Audible']
acceptable = df['Transcript Acceptable']
df['Ground Truth'] = audible & acceptable

In [311]:
# Preview the first five rows with the encoded labels and 'Ground Truth'.
df.head(n=5)

Unnamed: 0,Standard,Grade,type,answer_text,questionattemptid,total_rubric_score,Audio Link,Transcription,whisper_words,whisper_segments,whisper_language,Audio Audible,Transcript Acceptable,Ground Truth
0,ca,1,speaking,https://summitk12-quiz-audios.s3-accelerate.am...,125420729,0,https://drive.google.com/file/d/1l0sWVznEn_f1o...,The kids are cleaning.,"[{""word"": ""The"", ""start"": 0.62, ""end"": 0.8}, {...","[{""id"": 0, ""seek"": 0, ""start"": 0, ""end"": 3.14,...",Icelandic,1,1,1
1,ca,2,speaking,https://summitk12-quiz-audios.s3-accelerate.am...,132724395,0,https://drive.google.com/file/d/1gXqLvabNA9C1P...,. .,"[{""word"": ""."", ""start"": 0, ""end"": 1.08}, {""wor...","[{""id"": 0, ""seek"": 0, ""start"": 0, ""end"": 4.4, ...",English,0,0,0
2,ca,3,speaking,https://summitk12-quiz-audios.s3-accelerate.am...,154839753,0,https://drive.google.com/file/d/16DS157VCuQikP...,The children are in a forest on a log.,"[{""word"": ""The"", ""start"": 1.86, ""end"": 2.04}, ...","[{""id"": 0, ""seek"": 0, ""start"": 0, ""end"": 7.2, ...",Icelandic,1,1,1
3,ca,4,speaking,https://summitk12-quiz-audios.s3-accelerate.am...,95970423,0,https://drive.google.com/file/d/10P9XXQpP93WPz...,Það er mamið. Það er kynnt.,"[{""word"": ""\u00dea\u00f0"", ""start"": 0.02, ""end...","[{""id"": 0, ""seek"": 0, ""start"": 0, ""end"": 1.96,...",Icelandic,0,0,0
4,ca,5,speaking,https://summitk12-quiz-audios.s3-accelerate.am...,16975931,0,https://drive.google.com/file/d/1FVZwuJm5ks7N2...,It's music class.,"[{""word"": ""It's"", ""start"": 0.76, ""end"": 1.3}, ...","[{""id"": 0, ""seek"": 0, ""start"": 0, ""end"": 2.64,...",Icelandic,1,1,1


In [312]:
# Function to extract mean values
def extract_means(segments):
    """Average three per-segment metrics across a list of segment records.

    Parameters
    ----------
    segments : list of dict
        Segment records (e.g. one parsed ``whisper_segments`` entry). Each
        record may carry the keys ``"avg_logprob"``, ``"compression_ratio"``
        and ``"no_speech_prob"``; a record missing a key is skipped for that
        metric only. The list is traversed once per metric, so it must be
        re-iterable (a list, not a one-shot generator).

    Returns
    -------
    tuple of float
        ``(mean avg_logprob, mean compression_ratio, mean no_speech_prob)``.
        A metric for which no record supplies a value is returned as NaN.
    """
    metric_keys = ("avg_logprob", "compression_ratio", "no_speech_prob")

    def mean_func(values):
        # An empty `segments` list, or records lacking the key, leave nothing
        # to average; return NaN instead of raising ZeroDivisionError so a
        # single such row does not abort a whole DataFrame.apply pass.
        if not values:
            return float("nan")
        return sum(values) / len(values)

    return tuple(
        mean_func([dct[key] for dct in segments if key in dct])
        for key in metric_keys
    )

In [313]:
# Decode each row's whisper_segments JSON string into Python objects,
# held in a temporary column.
raw_segments = df["whisper_segments"]
df["whisper_segments_parsed"] = raw_segments.apply(json.loads)

In [314]:
# Run 'extract_means' on every parsed row, then spread each returned
# 3-tuple across three new columns.
mean_cols = ["mean_avg_logprob", "mean_compression_ratio", "mean_no_speech_prob"]
row_means = df["whisper_segments_parsed"].apply(extract_means)
df[mean_cols] = row_means.apply(pd.Series)

In [315]:
# Remove the temporary parsed-JSON column in place; only the derived means are kept.
df.drop(labels=["whisper_segments_parsed"], axis="columns", inplace=True)

In [316]:
# Preview the first five rows including the three mean_* metric columns.
df.head(n=5)

Unnamed: 0,Standard,Grade,type,answer_text,questionattemptid,total_rubric_score,Audio Link,Transcription,whisper_words,whisper_segments,whisper_language,Audio Audible,Transcript Acceptable,Ground Truth,mean_avg_logprob,mean_compression_ratio,mean_no_speech_prob
0,ca,1,speaking,https://summitk12-quiz-audios.s3-accelerate.am...,125420729,0,https://drive.google.com/file/d/1l0sWVznEn_f1o...,The kids are cleaning.,"[{""word"": ""The"", ""start"": 0.62, ""end"": 0.8}, {...","[{""id"": 0, ""seek"": 0, ""start"": 0, ""end"": 3.14,...",Icelandic,1,1,1,-0.046498,0.733333,1.837492e-10
1,ca,2,speaking,https://summitk12-quiz-audios.s3-accelerate.am...,132724395,0,https://drive.google.com/file/d/1gXqLvabNA9C1P...,. .,"[{""word"": ""."", ""start"": 0, ""end"": 1.08}, {""wor...","[{""id"": 0, ""seek"": 0, ""start"": 0, ""end"": 4.4, ...",English,0,0,0,-2.041714,0.272727,5.108253e-10
2,ca,3,speaking,https://summitk12-quiz-audios.s3-accelerate.am...,154839753,0,https://drive.google.com/file/d/16DS157VCuQikP...,The children are in a forest on a log.,"[{""word"": ""The"", ""start"": 1.86, ""end"": 2.04}, ...","[{""id"": 0, ""seek"": 0, ""start"": 0, ""end"": 7.2, ...",Icelandic,1,1,1,-0.189879,0.904762,2.074286e-10
3,ca,4,speaking,https://summitk12-quiz-audios.s3-accelerate.am...,95970423,0,https://drive.google.com/file/d/10P9XXQpP93WPz...,Það er mamið. Það er kynnt.,"[{""word"": ""\u00dea\u00f0"", ""start"": 0.02, ""end...","[{""id"": 0, ""seek"": 0, ""start"": 0, ""end"": 1.96,...",Icelandic,0,0,0,-0.640734,0.941176,2.977847e-10
4,ca,5,speaking,https://summitk12-quiz-audios.s3-accelerate.am...,16975931,0,https://drive.google.com/file/d/1FVZwuJm5ks7N2...,It's music class.,"[{""word"": ""It's"", ""start"": 0.76, ""end"": 1.3}, ...","[{""id"": 0, ""seek"": 0, ""start"": 0, ""end"": 2.64,...",Icelandic,1,1,1,-0.351254,0.68,3.434895e-10


In [None]:
# TODO: compute the min and max of each of the three mean_* columns and store them in variables

In [317]:
# TODO: pass each column's min and max to np.linspace to build its candidate thresholds (10 values each?)

In [318]:
# loop through all combinations of thresholds (nested loop)

In [319]:
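# TODO: create a method that returns 1 when mean_avg_logprob > its threshold,
# mean_compression_ratio < its threshold and mean_no_speech_prob < its threshold; otherwise 0
# call this method on every row for each combination of thresholds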

In [320]:
# for logprob_thresh in avg_logprob_range:
#     for comp_thresh in compression_ratio_range:
#         for nospeech_thresh in no_speech_prob_range:
            
#             # Classification logic using current thresholds
#             def classify(row):
#                 if (
#                     row["mean_avg_logprob"] > logprob_thresh and
#                     row["mean_compression_ratio"] < comp_thresh and
#                     row["mean_no_speech_prob"] < nospeech_thresh
#                 ):
#                     return 1  # Predict "yes"
#                 return 0  # Predict "no"
            
#             # Apply the logic
#             df["output_tmp"] = df.apply(classify, axis=1)

#             # Calculate accuracy vs the ground truth
#             acc = accuracy_score(df["Ground Truth"], df["output_tmp"])

#             # Update best result if this is the best so far
#             if acc > best_accuracy:
#                 best_accuracy = acc
#                 best_thresholds = (logprob_thresh, comp_thresh, nospeech_thresh)

In [None]:
# # Apply best thresholds to get final output
# def final_classify(row):
#     if (
#         row["mean_avg_logprob"] > best_thresholds[0] and
#         row["mean_compression_ratio"] < best_thresholds[1] and
#         row["mean_no_speech_prob"] < best_thresholds[2]
#     ):
#         return 1
#     return 0

# df["predicted_output"] = df.apply(final_classify, axis=1)
