In [24]:
# 📘 Grammar Score Prediction from Audio - SHL Internship Task
# ------------------------------------------------------------

# ✅ Section 1: Imports & Setup
import os
import pandas as pd
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr

# ✅ Section 2: Load Data
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# ✅ Section 3: Audio Feature Extraction
AUDIO_DIR = "audios/train"  # update if different

# If your column is 'filename' instead of 'file_name', clean it accordingly
train_df['filename'] = train_df['filename'].apply(lambda x: os.path.basename(str(x).split("\\")[-1].strip()))

def extract_features(file_path):
    y, sr = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfccs_mean = np.mean(mfccs.T, axis=0)
    return mfccs_mean

# Extract features for train files
train_features = []
skipped_files = 0
for i, fname in enumerate(train_df['filename']):
    if i % 50 == 0:
        print(f"Processing training file {i+1}/{len(train_df)}")
    path = os.path.join(AUDIO_DIR, fname)
    if not os.path.exists(path):
        print(f"File not found: {path}")
        skipped_files += 1
        continue
    try:
        features = extract_features(path)
        train_features.append(features)
    except Exception as e:
        print(f"Error processing {fname}: {e}")
        skipped_files += 1

print(f"\n✅ Extracted features from {len(train_features)} files. Skipped {skipped_files} files.")

if len(train_features) == 0:
    raise ValueError("No training features extracted. Please check file paths and filenames.")

X = np.array(train_features)
y = train_df.loc[:len(train_features)-1, 'label'].values  # match y length with X

# ✅ Section 4: Train-Test Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# ✅ Section 5: Modeling
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# ✅ Section 6: Evaluation
val_preds = model.predict(X_val)
rmse = mean_squared_error(y_val, val_preds, squared=False)
pearson_corr = pearsonr(y_val, val_preds)[0]

print(f"\n✅ RMSE on validation: {rmse:.4f}")
print(f"✅ Pearson Correlation: {pearson_corr:.4f}")

# ✅ Section 7: Test Predictions
TEST_AUDIO_DIR = "audios/test"  # update if needed
test_df['file_name'] = test_df['file_name'].apply(lambda x: os.path.basename(str(x).strip()))
test_features = []
for i, fname in enumerate(test_df['file_name']):
    if i % 50 == 0:
        print(f"Processing test file {i+1}/{len(test_df)}")
    path = os.path.join(TEST_AUDIO_DIR, fname)
    if not os.path.exists(path):
        print(f"File not found: {path}")
        continue
    try:
        features = extract_features(path)
        test_features.append(features)
    except Exception as e:
        print(f"Error processing {fname}: {e}")
        continue

X_test = np.array(test_features)
test_preds = model.predict(X_test)

# ✅ Section 8: Submission
submission = pd.DataFrame({
    'file_name': test_df['file_name'][:len(test_preds)],
    'label': test_preds
})

submission.to_csv("submission.csv", index=False)
print("\n📦 Submission file saved as submission.csv")

# ✅ Section 9: Visualization
plt.figure(figsize=(6, 6))
sns.scatterplot(x=y_val, y=val_preds)
plt.xlabel("True Labels")
plt.ylabel("Predicted Labels")
plt.title("True vs Predicted Grammar Scores")
plt.grid()
plt.show()

# ✅ Section 10: Report Summary
print("""
--- Report Summary ---
- Preprocessing: Extracted MFCC features (13 dimensions per file).
- Model: Random Forest Regressor.
- Metric on Validation:
    RMSE: {:.4f}
    Pearson Correlation: {:.4f}
- Test predictions saved in submission.csv.

Tip: Try using Wav2Vec2 or HuBERT embeddings for better performance.
""".format(rmse, pearson_corr))


Train shape: (444, 2)
Test shape: (204, 1)
Processing training file 1/444
Processing training file 51/444
Processing training file 101/444
Processing training file 151/444
Processing training file 201/444
Processing training file 251/444
Processing training file 301/444
Processing training file 351/444
Processing training file 401/444

✅ Extracted features from 444 files. Skipped 0 files.


TypeError: got an unexpected keyword argument 'squared'