## Audio Quality Dataset - Logistic Regression

In [998]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import pandas as pd
import numpy as np
import json

In [999]:
# Read the reviewed audio-quality records from CSV into a DataFrame
dataset_path = 'data/student_audio_quality_review.csv'
df = pd.read_csv(dataset_path)

In [1000]:
# Keep only the rows in which both review labels are present
label_columns = ['Transcript Acceptable', 'Audio Audible']
df = df.dropna(subset=label_columns)

In [1001]:
# Map the 'Yes'/'No' review labels to 1/0.
# Both columns use the same mapping and the same int cast so they end up with
# a consistent dtype. Series.map turns any label missing from the dict into
# NaN; the cast then fails right here instead of silently leaving a float
# column with NaN in it.
yes_no_to_binary = {'Yes': 1, 'No': 0}
df['Transcript Acceptable'] = df['Transcript Acceptable'].map(yes_no_to_binary).astype(int)
df['Audio Audible'] = df['Audio Audible'].map(yes_no_to_binary).astype(int)

In [1002]:
# Ground truth is the element-wise AND of the two binary review columns
transcript_ok = df['Transcript Acceptable']
audio_ok = df['Audio Audible']
df['ground_truth'] = transcript_ok & audio_ok

In [1003]:
# Function to extract mean values
def extract_means(segments):
    """Average three per-segment metrics over a list of segment dicts.

    Args:
        segments: List of dicts. Each dict may carry the keys
            "avg_logprob", "compression_ratio" and "no_speech_prob"; a dict
            that lacks a key is left out of that key's average.

    Returns:
        Tuple ``(mean_avg_logprob, mean_compression_ratio,
        mean_no_speech_prob)``. An entry is NaN when no segment supplies the
        corresponding key, which includes the case of an empty ``segments``.
    """
    def _mean_of(key):
        # Collect the metric only from the segments that actually have it.
        values = [dct[key] for dct in segments if key in dct]
        # np.mean([]) would also give NaN, but it emits RuntimeWarnings;
        # return NaN directly so empty input stays quiet.
        return np.mean(values) if values else np.nan

    return (
        _mean_of("avg_logprob"),
        _mean_of("compression_ratio"),
        _mean_of("no_speech_prob"),
    )

In [1004]:
# Decode each row's JSON string and keep the result in a temporary column
df["whisper_segments_parsed"] = df["whisper_segments"].apply(json.loads)

In [1005]:
# Run 'extract_means' on every parsed row, then spread each returned
# 3-tuple across the three new mean columns
mean_columns = ["mean_avg_logprob", "mean_compression_ratio", "mean_no_speech_prob"]
segment_means = df["whisper_segments_parsed"].apply(extract_means)
df[mean_columns] = segment_means.apply(pd.Series)

In [1006]:
# Remove the temporary parsed-JSON column
df = df.drop(columns=["whisper_segments_parsed"])

In [1007]:
# Feature matrix: the three averaged segment metrics
feature_columns = ["mean_avg_logprob", "mean_compression_ratio", "mean_no_speech_prob"]
X = df[feature_columns]

In [1008]:
# Target vector: the combined ground-truth label
y = df.loc[:, 'ground_truth']

In [1009]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [1010]:
# Standardise the features: fit the scaler on the training split only,
# then apply that same fitted transform to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [1011]:
# The scaler returned plain arrays; wrap them in DataFrames again with the feature names.
# No index is passed, so both frames get a default 0..n-1 index.
scaled_columns = ['mean_avg_logprob', 'mean_compression_ratio', 'mean_no_speech_prob']
X_train = pd.DataFrame(X_train, columns=scaled_columns)
X_test = pd.DataFrame(X_test, columns=scaled_columns)

In [1012]:
# Fit a logistic-regression classifier with default settings on the scaled training split
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [1013]:
# Hard class predictions (0 or 1) for the held-out test rows
y_pred = model.predict(X=X_test)

In [1014]:
# Accuracy on the test split, printed to four decimals
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {test_accuracy:.4f}")

Accuracy: 0.7904


In [1015]:
# Confusion matrix for the test split
# (scikit-learn convention: rows are true classes, columns are predicted classes)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", cm, sep="\n")

Confusion Matrix:
[[65 24]
 [11 67]]


In [1016]:
# Per-class precision / recall / F1 summary for the test split
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.86      0.73      0.79        89
           1       0.74      0.86      0.79        78

    accuracy                           0.79       167
   macro avg       0.80      0.79      0.79       167
weighted avg       0.80      0.79      0.79       167

