<a href="https://colab.research.google.com/github/aadipatodia/Speech-Emotion-Recognization/blob/main/EmotionSpeech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install librosa soundfile numpy scikit-learn pandas

In [None]:
import zipfile
import os

# Unpack every uploaded call-recording archive into one shared dataset folder.
base_dir = '/content/dataset'
os.makedirs(base_dir, exist_ok=True)

zip_files = ['satisfied_calls.zip', 'unsatisfied_calls.zip', 'average_calls.zip']
for zip_file in zip_files:
    archive_path = f'/content/{zip_file}'
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(base_dir)

In [None]:
"""
AIM : To process audio files within a specific directory structure. It primarily focuses on converting .ulaw.wav files to standard .wav
      format and then using librosa to inspect the converted (or original) .wav files
"""

import os
import librosa                   # It's used here to load audio files and get their properties like sample rate and duration
from pydub import AudioSegment   # for converting audio file formats.

base_dir = '/content/dataset'
# Source sub-folders of base_dir, one per call-quality category
# (spelling matches the folder names on disk).
folders = ['Average call quailty', 'Satisfied Call Quality', 'Not-Satisfied Call Quailty']

for folder in folders:
    # Source folder, e.g. /content/dataset/Average call quailty
    data_dir = os.path.join(base_dir, folder)
    # Output folder: spaces -> underscores, lower-cased, "_wav" appended.
    wav_dir_name = f"{folder.replace(' ', '_').lower()}_wav"
    wav_dir = os.path.join(base_dir, wav_dir_name)
    os.makedirs(wav_dir, exist_ok=True)  # safe to re-run: existing folder is fine

    # Pass 1: re-export every .ulaw.wav file as a plain .wav in wav_dir.
    for file in os.listdir(data_dir):
        if not file.endswith('.ulaw.wav'):
            continue
        input_path = os.path.join(data_dir, file)
        output_path = os.path.join(wav_dir, file.replace('.ulaw.wav', '.wav'))
        try:
            # A failed conversion is reported and does not stop the batch.
            AudioSegment.from_file(input_path, format='wav').export(output_path, format='wav')
            print(f"Converted {input_path} to {output_path}")
        except Exception as e:
            print(f"Error converting {input_path}: {e}")

    # Pass 2: load each .wav now present in wav_dir and report its properties.
    for file in os.listdir(wav_dir):
        if not file.endswith('.wav'):
            continue
        path = os.path.join(wav_dir, file)
        try:
            y, sr = librosa.load(path, sr=None)  # sr=None keeps the file's own sample rate
            duration = librosa.get_duration(y=y, sr=sr)
            print(f"File: {path}, Sample rate: {sr} Hz, Duration: {duration} seconds")
        except Exception as e:
            print(f"Error loading {path}: {e}")

In [None]:
!ls /content/dataset

In [None]:
import pandas as pd
import os
import re # Import regex for advanced string matching

# Define base directory
base_dir = '/content/dataset'
csv_file_path = '/content/CallRecords.csv'

record_columns = ["CALL RECORDING NUMBER", "CALL STATUS"]
audio_suffixes = ('.wav', '.ulaw')  # both formats count as recordings

# Records are collected in a plain list and turned into a DataFrame once at
# the end; growing a DataFrame with pd.concat inside the loops copies the
# whole frame on every file.
call_records = []

## Feature 1, 2, 3: Read filenames from specific folders and populate DataFrame
# Mapping of folder names to call statuses
folder_to_status_map = {
    'average_call_quailty_wav': 'Average',
    'satisfied_call_quality_wav': 'Satisfied',
    'not-satisfied_call_quailty_wav': 'Unsatisfied'
}

print("--- Populating CallRecords.csv from specific folders ---")
for folder, status in folder_to_status_map.items():
    folder_path = os.path.join(base_dir, folder)
    if not os.path.exists(folder_path):
        print(f"Warning: Directory '{folder_path}' not found. Skipping.")
        continue
    print(f"Processing folder: {folder_path} for status: {status}")
    for filename in os.listdir(folder_path):
        if filename.endswith(audio_suffixes):
            call_records.append({"CALL RECORDING NUMBER": filename, "CALL STATUS": status})

## Feature 4: Read filenames from 'audio_data' and determine status from filename
# Status code embedded in the filename (e.g. 'recording-xxx-avg.wav') -> status
status_code_to_status = {'avg': 'Average', 'sat': 'Satisfied', 'unsat': 'Unsatisfied'}

# NOTE(review): this looks for audio_data under base_dir (/content/dataset),
# while the Drive-import cell writes to /content/audio_data - confirm which
# location is intended.
audio_data_dir = os.path.join(base_dir, 'audio_data')
if os.path.exists(audio_data_dir):
    print(f"Processing folder: {audio_data_dir} for status extraction from filenames")
    for filename in os.listdir(audio_data_dir):
        if not filename.endswith(audio_suffixes):
            continue
        match = re.search(r'-(avg|sat|unsat)\.', filename)
        call_status = status_code_to_status.get(match.group(1)) if match else None
        if call_status:
            call_records.append({"CALL RECORDING NUMBER": filename, "CALL STATUS": call_status})
        else:
            print(f"Warning: Could not determine status for file: {filename} in {audio_data_dir}. Skipping.")
else:
    print(f"Warning: Directory '{audio_data_dir}' not found. Skipping.")

# Build the combined DataFrame in one step; passing columns= keeps the header
# row even when no recordings were found.
df_combined = pd.DataFrame(call_records, columns=record_columns)

# Write the combined DataFrame to CSV
df_combined.to_csv(csv_file_path, index=False)
print(f"\nSuccessfully created/updated '{csv_file_path}' with {len(df_combined)} records.")

# --- Continue with the original code (modified to use the newly generated CSV) ---

# Load the CSV that was just written so the rest of the cell works from disk.
df_csv = pd.read_csv(csv_file_path)

# Canonicalise the headers (trim whitespace, upper-case) ...
normalized_headers = df_csv.columns.str.strip().str.upper()
df_csv.columns = normalized_headers
# ... and apply the same clean-up to the values of the CALL STATUS column.
df_csv['CALL STATUS'] = df_csv['CALL STATUS'].str.strip().str.upper()

# Fail early, listing what is available, if the recording column is missing.
column_name = 'CALL RECORDING NUMBER'
available_columns = df_csv.columns.tolist()
print("\nNormalized column names:", available_columns)
if column_name not in available_columns:
    raise KeyError(f"Column '{column_name}' not found. Available columns: {available_columns}")

def normalize_filename(filename_str):
    """Return the recording name with exactly one trailing '.wav' extension.

    The value is converted to ``str`` first (CSV cells may be numeric). Any
    trailing '.wav' / '.ulaw' extensions are stripped, in any combination
    (e.g. 'x.ulaw.wav'), and a single '.wav' is appended.

    Only *trailing* extensions are removed: a '.wav' or '.ulaw' that occurs in
    the middle of the name is part of the name and is left untouched.
    """
    filename_str = str(filename_str)
    stripped = True
    while stripped:
        stripped = False
        for suffix in ('.wav', '.ulaw'):
            if filename_str.endswith(suffix):
                filename_str = filename_str[:-len(suffix)]
                stripped = True
    return filename_str + '.wav'

df_csv[column_name] = df_csv[column_name].apply(normalize_filename)

# Corrects potential misspellings and standardizes status names
df_csv['CALL STATUS'] = df_csv['CALL STATUS'].replace('SATISIFED', 'SATISFIED')
df_csv['CALL STATUS'] = df_csv['CALL STATUS'].replace('NOT SATISFIED', 'UNSATISFIED')

status_to_parent = { # This links the CSV data to the physical file organization.
    'SATISFIED': 'satisfied_call_quality_wav',
    'UNSATISFIED': 'not-satisfied_call_quailty_wav',
    'AVERAGE': 'average_call_quailty_wav'
}

# Identical copy of status_to_parent. It is not read anywhere in this cell;
# the name is kept in case other cells reference it.
status_to_actual_folder = dict(status_to_parent)

# Directory listings are cached so each folder is scanned once, not once per
# CSV row. A missing folder is cached as an empty listing.
wav_files_by_dir = {}


def _wav_files_in(directory):
    """Return the cached list of .wav filenames in *directory* ([] if absent)."""
    if directory not in wav_files_by_dir:
        if os.path.exists(directory):
            wav_files_by_dir[directory] = [
                name for name in os.listdir(directory) if name.endswith('.wav')
            ]
        else:
            wav_files_by_dir[directory] = []
    return wav_files_by_dir[directory]


# Create labels dictionary
# Keys are paths of the form '<base_dir name>/<folder>/<file>'; values are the
# normalized call status from the CSV.
labels = {}

print("\n--- Starting file matching ---")
for _, row in df_csv.iterrows():
    status = row['CALL STATUS']
    record_filename_csv = row[column_name] # e.g., 'recording-173-... .wav'
    # Base name used for matching: drop only the trailing '.wav'.
    if record_filename_csv.endswith('.wav'):
        record_num_prefix = record_filename_csv[:-len('.wav')]
    else:
        record_num_prefix = record_filename_csv

    # Search the folder that belongs to the status first (if the status is
    # known), then 'audio_data', whose files also carry statuses.
    parent_folder_name = status_to_parent.get(status)
    possible_dirs = []
    if parent_folder_name:
        possible_dirs.append(os.path.join(base_dir, parent_folder_name))
    possible_dirs.append(os.path.join(base_dir, 'audio_data'))

    # First file (in directory order) whose name contains the base name wins.
    # NOTE(review): substring matching means 'rec-1' also matches 'rec-10.wav'.
    first_hit = next(
        (
            (data_dir, file_in_dir)
            for data_dir in possible_dirs
            for file_in_dir in _wav_files_in(data_dir)
            if record_num_prefix in file_in_dir
        ),
        None,
    )
    found_match = first_hit is not None

    if found_match:
        data_dir, file_in_dir = first_hit
        # Construct the path relative to the base_dir for consistency
        filename_for_label = os.path.join(os.path.basename(base_dir), os.path.basename(data_dir), file_in_dir)
        labels[filename_for_label] = status
    else:
        print(f"No match found for CSV entry '{record_filename_csv}' with status '{status}' in any expected directory.")

# Display a sample
print("\nSample labels:", list(labels.items())[:5])
print(f"\nTotal labels created: {len(labels)}")

In [None]:
import pickle
# Persist the path -> status mapping so later steps can reload it from disk.
labels_pickle_path = '/content/labels.pkl'
with open(labels_pickle_path, mode='wb') as f:
    pickle.dump(labels, f)

In [None]:
import os
import librosa
import numpy as np
import soundfile as sf
import pickle

def extract_features(file_path):
    """Build one fixed-length feature vector for the audio file at *file_path*.

    The file is loaded at its own sample rate (``sr=None``). Three librosa
    representations are computed - 40 MFCCs, chroma STFT and a mel
    spectrogram - each is averaged over its frame axis, and the three
    averages are concatenated into a single 1-D array in that order.
    """
    signal, sample_rate = librosa.load(file_path, sr=None)
    frame_averages = [
        np.mean(frames.T, axis=0)
        for frames in (
            librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40),
            librosa.feature.chroma_stft(y=signal, sr=sample_rate),
            librosa.feature.melspectrogram(y=signal, sr=sample_rate),
        )
    ]
    return np.hstack(frame_averages)

base_dir_raw = '/content/dataset'
segmented_dir = '/content/segmented_audio'
os.makedirs(segmented_dir, exist_ok=True)

# Maps each .wav path (relative to base_dir_raw) to the category folder it
# was found under.
labels = {}
categories = ['average_call_quailty_wav', 'satisfied_call_quality_wav', 'not-satisfied_call_quailty_wav']

for category in categories:
    category_path = os.path.join(base_dir_raw, category)
    if not os.path.isdir(category_path):
        print(f"Category directory not found: {category_path}")
        continue
    # Walk the category folder (including sub-folders) and record every .wav.
    for root, _, files in os.walk(category_path):
        labels.update({
            os.path.relpath(os.path.join(root, file), base_dir_raw): category
            for file in files
            if file.endswith('.wav')
        })

# One dict per written segment; pickled at the end of this step.
processed_segment_info = []

for filename, status in labels.items():
    file_path = os.path.join(base_dir_raw, filename)
    if not os.path.exists(file_path):
        print(f"File {file_path} not found")
        continue

    try:
        y, sr = librosa.load(file_path, sr=None)
        duration = librosa.get_duration(y=y, sr=sr)
        if duration < 45:
            # Need 20 s from the start plus 25 s from the end.
            print(f"Skipping {filename}: Duration {duration:.2f} seconds is less than 45 seconds")
            continue

        # Sample counts for the opening 20 s and the closing 25 s.
        head_length = int(20 * sr)
        tail_length = int(25 * sr)
        tail_start = int((duration - 25) * sr)
        segments = {
            'start': y[:head_length],
            'end': y[tail_start:tail_start + tail_length],
        }

        # Output goes into a folder named after the file's immediate parent.
        original_subdir = os.path.basename(os.path.dirname(file_path))
        output_category_dir = os.path.join(segmented_dir, original_subdir)
        os.makedirs(output_category_dir, exist_ok=True)

        base_name = os.path.splitext(os.path.basename(filename))[0]

        for part_number, (segment_type, segment) in enumerate(segments.items(), start=1):
            segment_path = os.path.join(output_category_dir, f"{base_name}_part{part_number}.wav")
            sf.write(segment_path, segment, sr)
            print(f"Saved {segment_path} for {filename} (Duration: {duration:.2f}s)")

            processed_segment_info.append({
                'original_file': filename,
                'original_status': status,
                'segment_type': segment_type,
                'segment_path': segment_path,
                'sample_rate': sr,
                'duration_original_file': duration
            })
    except Exception as e:
        # One unreadable file is reported and must not abort the batch.
        print(f"Error processing {file_path}: {e}")

segments_info_path = '/content/segments_info.pkl'
with open(segments_info_path, 'wb') as f:
    pickle.dump(processed_segment_info, f)

print("Segment information successfully pickled to /content/segments_info.pkl")

# Feature extraction part
# Root folder holding the segmented clips; features are extracted from the
# files found here.
base_dir_segmented = '/content/segmented_audio'
feature_data = []    # one numerical feature vector per segment
feature_labels = []  # category label for each feature vector, same order

# Fallback for running this part without the segmentation part: rebuild the
# path -> category mapping from the pickled segment info.
if 'labels' not in locals():
    try:
        with open('/content/segments_info.pkl', 'rb') as f:
            loaded_segment_info = pickle.load(f)
    except FileNotFoundError as err:
        # Raise instead of calling exit(): an exception surfaces the problem
        # to the caller without ending the whole session.
        raise FileNotFoundError(
            "Error: segments_info.pkl not found. 'labels' variable is missing. "
            "Please run the segmentation part first or ensure the pickle file exists."
        ) from err
    labels = {info['original_file']: info['original_status'] for info in loaded_segment_info}
    print("Labels loaded from /content/segments_info.pkl")

for original_file_relative_path, category_name in labels.items():
    file_base = os.path.splitext(os.path.basename(original_file_relative_path))[0]

    # The segmentation part writes clips into a folder named after the
    # file's immediate parent directory, which differs from category_name
    # for files in nested sub-folders. Look in that same folder; fall back to
    # the category when the path has no directory component.
    segment_subdir = (
        os.path.basename(os.path.dirname(original_file_relative_path)) or category_name
    )
    segment_paths = [
        os.path.join(base_dir_segmented, segment_subdir, f"{file_base}_part{part}.wav")
        for part in (1, 2)
    ]

    for segment_path in segment_paths:
        if os.path.exists(segment_path):
            features = extract_features(segment_path)
            feature_data.append(features)
            feature_labels.append(category_name)
            print(f"Extracted features from {segment_path} (Label: {category_name})")
        else:
            print(f"Segment {segment_path} not found")

feature_data = np.array(feature_data)
feature_labels = np.array(feature_labels)

# Save both arrays as .npy files for the training step.
np.save('/content/feature_data.npy', feature_data)
np.save('/content/feature_labels.npy', feature_labels)
print("\nFeatures saved to /content/feature_data.npy and /content/feature_labels.npy")

print(f"\nTotal features extracted: {len(feature_data)}")
print(f"Feature shape: {feature_data.shape}")

# Also keep data and labels together in a single pickle.
with open('/content/features.pkl', 'wb') as f:
    pickle.dump({'data': feature_data, 'labels': feature_labels}, f)

<h2> Purpose: Train a neural network classifier using the extracted features to predict call emotions, prioritizing the end segment’s performance.</h2>

In [None]:
from sklearn.neural_network import MLPClassifier # It's used for supervised learning tasks, specifically classification,
# where it learns to map input features to output classes by passing data through multiple layers of interconnected "neurons."
from sklearn.model_selection import train_test_split  # This function is used to divide your dataset into two subsets: a training set and a testing set.
from sklearn.metrics import accuracy_score
# It calculates the proportion of correctly classified instances (predictions) out of the total number of instances
from sklearn.preprocessing import StandardScaler
# : This is a data pre-processing tool used for standardization.
# It transforms your data so that it has a mean of 0 and a standard deviation of 1
import numpy as np
import joblib # for parallel computing

# Load features
feature_data = np.load('/content/feature_data.npy')
feature_labels = np.load('/content/feature_labels.npy')

# Split data into training and testing sets BEFORE scaling, so the scaler's
# statistics come from the training rows only. Fitting the scaler on the full
# data set would leak test-set information into training and inflate the
# reported accuracy.
# NOTE(review): the start and end segments of one call are separate rows and
# can land on opposite sides of this split - consider a grouped split.
X_train, X_test, y_train, y_test = train_test_split(
    feature_data, feature_labels, test_size=0.30, random_state=42
)

# Normalize features: fit on the training split, apply the same transform to
# the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize and train the model
model = MLPClassifier(
    hidden_layer_sizes=(100,),  # one hidden layer with 100 neurons
    max_iter=300,
    learning_rate_init=0.001,
    random_state=42
)
model.fit(X_train, y_train)

# Predict and evaluate on the held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Save the model and scaler (joblib is used here to write them to disk); the
# scaler must be reused to transform features at prediction time.
joblib.dump(model, '/content/emotion_model.joblib')
joblib.dump(scaler, '/content/scaler.joblib')
print("Model and scaler saved to /content/emotion_model.joblib and /content/scaler.joblib")

In [None]:
!pip install --upgrade librosa -q

In [None]:
import os
import pandas as pd
import shutil
from google.colab import drive
from urllib.parse import urlparse, parse_qs


# Step 1: Mount Google Drive (force_remount refreshes an existing mount)
drive.mount('/content/drive', force_remount=True)
print("Google Drive mounted.")

# Step 2: Local folder that will receive the copied audio files
folder_path = '/content/audio_data'
os.makedirs(folder_path, exist_ok=True)
print(f"Folder created at: {folder_path}")

# Step 3: Drive folder containing the uploaded form files; stop if it is absent
drive_folder = '/content/drive/MyDrive/ResponsesEmotionalSpeech/Please upload your audio file: (File responses)'
drive_folder_present = os.path.exists(drive_folder)
if not drive_folder_present:
    raise FileNotFoundError(f"Drive folder {drive_folder} not found. Please ensure the path is correct and files are present.")

# Step 4: Download the form responses as CSV from the linked spreadsheet
spreadsheet_id = '1tnRV1iIJwg2Y5sipWUceh6ZudVr3fPwGpCtUL8KJISM'
csv_url = f'https://docs.google.com/spreadsheets/d/{spreadsheet_id}/export?format=csv&gid=825495712'
print(f"Attempting to fetch CSV from: {csv_url}")

try:
    df = pd.read_csv(csv_url)
    print("Form responses fetched successfully:")
    print(df)
    # Debug aid: print each column name quoted so stray whitespace is visible.
    print("\nDataFrame Column Names (Exact):")
    for col in df.columns:
        print(f"'{col}'")
except Exception as e:
    # Explain the likely cause, then re-raise so the cell still fails.
    print(f"Error fetching spreadsheet: {str(e)}. This may be due to an invalid URL or access restrictions.")
    print("Please ensure the spreadsheet is shared with your Google account (edit access) and the ID/gid are correct.")
    raise

# Step 5: List the files uploaded to the Drive folder
drive_files = os.listdir(drive_folder)
print(f"\nFiles found in Drive folder '{drive_folder}': {drive_files}")

if not drive_files:
    print(f"Warning: The specified Google Drive folder '{drive_folder}' is empty. No audio files to process.")

# Trim whitespace around every column name so the lookups below match.
df.columns = df.columns.str.strip()

# Spreadsheet column holding the filename typed by the respondent
FILENAME_COLUMN = 'Please type the exact filename of your audio recording (e.g., my_speech.wav):'

def generate_possible_filenames(original_name, drive_files_list):
    """Return candidate Drive filenames for a user-supplied filename.

    Besides *original_name* itself, candidates are derived from every entry
    of *drive_files_list* that looks like '<name> - <uploader>[ (n)]<ext>'
    with the same name part and extension: the plain
    '<name> - <uploader><ext>' plus the numbered variants ' (1)' .. ' (9)'.

    Args:
        original_name: filename as typed by the user, e.g. 'my_speech.wav'.
        drive_files_list: filenames present in the Drive folder.

    Returns:
        A list without duplicates in a deterministic order: *original_name*
        first, then the derived names in the order they were generated.
        Callers that take the first existing candidate therefore behave the
        same on every run.
    """
    name_part, ext_part = os.path.splitext(original_name)
    possible_names = [original_name]

    for drive_file in drive_files_list:
        if drive_file == original_name:
            continue
        if not (drive_file.startswith(name_part) and drive_file.endswith(ext_part)):
            continue
        if ' - ' not in drive_file:
            continue

        parts = drive_file.split(' - ')
        if parts[0] != name_part:
            continue

        # Uploader name without the extension and without a ' (n)' counter.
        uploader_name = os.path.splitext(parts[1])[0].split(' (')[0]
        possible_names.append(f"{name_part} - {uploader_name}{ext_part}")
        possible_names.extend(
            f"{name_part} - {uploader_name} ({i}){ext_part}" for i in range(1, 10)
        )

    # dict.fromkeys removes duplicates while keeping first-seen order (a set
    # would return the names in arbitrary order).
    return list(dict.fromkeys(possible_names))

# Build the destination filename: base name plus a status suffix
def rename_file(original_name, status):
    """Return *original_name* reduced to its base name plus a status suffix.

    If the name (without extension) contains a '-', everything from the first
    '-' onward is dropped and the remainder is stripped of whitespace, so
    'audio - Aadi (1).wav' becomes 'audio.wav'. A recognised *status*
    ('Average', 'Satisfied', 'Unsatisfied') then appends '-avg', '-sat' or
    '-unsat' before the extension; any other status adds nothing.
    """
    stem, extension = os.path.splitext(original_name)

    # Keep only the text before the first hyphen (when there is one).
    head, hyphen, _ = stem.partition('-')
    if hyphen:
        stem = head.strip()

    status_suffixes = {'Average': 'avg', 'Satisfied': 'sat', 'Unsatisfied': 'unsat'}
    suffix = status_suffixes.get(status, '')

    return f"{stem}-{suffix}{extension}" if suffix else f"{stem}{extension}"

# Copy each form submission's audio file from the Drive folder into
# folder_path, renamed so the filename carries the submitted status.
drive_file_set = set(drive_files)  # O(1) membership tests inside the loop

for index, row in df.iterrows():
    form_audio_url = row['Please upload your audio file:']
    status = row.get('Choose one of the following:', 'Not provided')

    user_provided_filename = row.get(FILENAME_COLUMN)

    if pd.isna(user_provided_filename) or str(user_provided_filename).strip() == '':
        print(f"Skipping row {index}: User-provided filename is missing or empty in the spreadsheet.")
        continue

    if pd.isna(form_audio_url):
        print(f"Skipping row {index}: Audio file URL is missing.")
        continue

    # Spreadsheet cells may be numeric or padded with whitespace; the
    # matching below needs a clean string.
    user_provided_filename = str(user_provided_filename).strip()

    potential_filenames = generate_possible_filenames(user_provided_filename, drive_files)

    # Try the exact name first, then the derived names in sorted order, so
    # the same file is picked on every run regardless of the order in which
    # the candidates were returned.
    # NOTE(review): if several uploaders used the same base filename this
    # still cannot tell them apart - the form row carries no uploader name.
    ordered_candidates = [user_provided_filename] + sorted(
        name for name in potential_filenames if name != user_provided_filename
    )
    matched_file = next(
        (name for name in ordered_candidates if name in drive_file_set), None
    )

    if matched_file:
        drive_file_path = os.path.join(drive_folder, matched_file)
        # Generate new filename based on status
        new_filename = rename_file(matched_file, status)
        dest_path = os.path.join(folder_path, new_filename)

        if os.path.exists(drive_file_path):
            shutil.copy(drive_file_path, dest_path)
            print(f"Copied '{matched_file}' to '{folder_path}' as '{new_filename}' with status: {status}")
        else:
            print(f"Error: Matched file '{matched_file}' was supposed to be at '{drive_file_path}' but was not found. Sync issue?")
    else:
        print(f"Could not find a matching file in Drive for form entry (row {index}).")
        print(f"    User-provided filename: '{user_provided_filename}'")
        print(f"    Tried matching against potential names: {potential_filenames}")
        print(f"    Files found in Drive folder: {drive_files}")

print("\nContents of audio_data folder:")
print(os.listdir(folder_path))

In [None]:
# Quietly upgrade librosa, the audio-analysis library used below
!pip install --upgrade librosa -q

import librosa
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

# Load the classifier and the feature scaler saved by the training step;
# both files must exist before anything is loaded.
model_path = '/content/emotion_model.joblib'
scaler_path = '/content/scaler.joblib'
if not (os.path.exists(model_path) and os.path.exists(scaler_path)):
    raise FileNotFoundError("Model or scaler files not found. Please ensure they are saved from Step 6.")
model, scaler = joblib.load(model_path), joblib.load(scaler_path)

# Turn an in-memory audio signal into one feature vector
def extract_features(y, sr):
    """Return the concatenated frame-averaged MFCC, chroma and mel features.

    *y* is the audio signal and *sr* its sample rate, as returned by
    ``librosa.load``. 40 MFCCs, the chroma STFT and the mel spectrogram are
    each averaged over their frame axis and stacked in that order.
    """
    frame_averages = [
        np.mean(frames.T, axis=0)
        for frames in (
            librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40),
            librosa.feature.chroma_stft(y=y, sr=sr),
            librosa.feature.melspectrogram(y=y, sr=sr),
        )
    ]
    return np.hstack(frame_averages)

# Map the classifier's raw class labels to readable labels.
# The labels saved in feature_labels.npy are the training folder names, so
# those names are the keys a prediction actually hits. Without them,
# satisfied/unsatisfied predictions are never translated and every such file
# is reported as a mismatch.
emotion_map = {
    'satisfied_call_quality_wav': 'SATISFIED',
    'not-satisfied_call_quailty_wav': 'UNSATISFIED',
    'average_call_quailty_wav': 'AVERAGE',
    # Keys from the earlier version of this map, kept for backward compatibility.
    'satisfied_wav': 'SATISFIED',
    'unsatisfied_wav': 'UNSATISFIED',
}

# Readable emotion -> label used in the training data. Samples collected for
# retraining are stored with these labels so the retrained model keeps the
# same class vocabulary as the original one.
training_label_for = {
    'SATISFIED': 'satisfied_call_quality_wav',
    'UNSATISFIED': 'not-satisfied_call_quailty_wav',
    'AVERAGE': 'average_call_quailty_wav',
}

# Map filename suffixes to expected emotions
suffix_map = {
    'avg': 'AVERAGE',
    'sat': 'SATISFIED',
    'unsat': 'UNSATISFIED'
}


def to_readable_emotion(raw_label):
    """Translate a raw model prediction into a readable emotion label.

    Falls back to the raw label when it has no entry in emotion_map.
    """
    key = str(raw_label).lower().replace('_part1', '').replace('_part2', '')
    return emotion_map.get(key, raw_label)


def expected_emotion_from_name(audio_file):
    """Return the emotion encoded as '-avg'/'-sat'/'-unsat' in the filename, or None."""
    name_part = os.path.splitext(audio_file)[0]
    return next(
        (emotion for suffix, emotion in suffix_map.items() if f'-{suffix}' in name_part),
        None,
    )


def classify_clip(classifier, clip, sr):
    """Return (readable emotion, unscaled feature vector) for one audio clip.

    The features are scaled with the loaded scaler before prediction; the
    unscaled vector is returned so it can be added to the training data.
    """
    raw_features = extract_features(clip, sr)
    scaled_features = scaler.transform(raw_features.reshape(1, -1))
    return to_readable_emotion(classifier.predict(scaled_features)[0]), raw_features


def analyse_audio_file(classifier, audio_path):
    """Classify the first 20 s and the last 25 s of one audio file.

    Returns (start_emotion, end_emotion, end_features). A part that is too
    short to analyse yields its "Too short ..." message instead of an
    emotion; end_features is the unscaled feature vector of the last 25 s,
    or None when the file is shorter than 25 s.
    """
    y, sr = librosa.load(audio_path, sr=None)
    duration = librosa.get_duration(y=y, sr=sr)

    start_emotion = "Too short for 20s analysis!"
    if duration >= 20:
        start_emotion, _ = classify_clip(classifier, y[:int(20 * sr)], sr)

    end_emotion, end_features = "Too short for 25s analysis!", None
    if duration >= 25:
        y_end = y[int(-25 * sr):] if duration > 25 else y
        end_emotion, end_features = classify_clip(classifier, y_end, sr)

    return start_emotion, end_emotion, end_features


# Process all audio files in the folder
audio_folder = '/content/audio_data'
if not os.path.exists(audio_folder):
    raise FileNotFoundError(f"Audio folder not found at {audio_folder}. Please ensure the folder exists and contains .wav files.")

audio_files = [f for f in os.listdir(audio_folder) if f.endswith('.wav')]
if not audio_files:
    print(f"No .wav files found in {audio_folder}.")
else:
    # Initial predictions
    initial_results = {}
    feature_data = np.load('/content/feature_data.npy') if os.path.exists('/content/feature_data.npy') else np.array([])
    feature_labels = np.load('/content/feature_labels.npy') if os.path.exists('/content/feature_labels.npy') else np.array([])

    # Unscaled feature vectors / training labels of misclassified end segments.
    new_feature_rows = []
    new_feature_labels = []

    for audio_file in audio_files:
        audio_path = os.path.join(audio_folder, audio_file)
        try:
            start_emotion, end_emotion, end_features = analyse_audio_file(model, audio_path)
        except Exception as e:
            # One unreadable file must not stop the batch.
            print(f"Error processing {audio_file}: {str(e)}")
            print("-" * 50)
            continue

        initial_results[audio_file] = {'start': start_emotion, 'end': end_emotion}
        print(f"File: {audio_file}")
        print(f"First 20 seconds emotion: {start_emotion}")
        print(f"Last 25 seconds emotion: {end_emotion}")

        # Compare with the emotion encoded in the filename. Only files whose
        # end segment was actually classified are compared.
        expected_emotion = expected_emotion_from_name(audio_file)
        if expected_emotion and end_features is not None:
            if end_emotion != expected_emotion:
                print(f"Mismatch detected! Expected: {expected_emotion}, Predicted: {end_emotion}")
                new_feature_rows.append(end_features)
                new_feature_labels.append(training_label_for[expected_emotion])
            else:
                print(f"Classification matches expected emotion: {expected_emotion}")
        print("-" * 50)

    # Retrain only when at least one corrected sample was collected.
    if new_feature_rows:
        if feature_data.size:
            feature_data = np.vstack([feature_data, *new_feature_rows])
        else:
            feature_data = np.vstack(new_feature_rows)
        if feature_labels.size:
            feature_labels = np.concatenate([feature_labels, new_feature_labels])
        else:
            feature_labels = np.array(new_feature_labels)

        # Train on features scaled with the same scaler that is applied at
        # prediction time; training on unscaled features and predicting on
        # scaled ones would make the retrained model's predictions meaningless.
        X_train, X_test, y_train, y_test = train_test_split(
            scaler.transform(feature_data), feature_labels, test_size=0.1, random_state=42
        )

        model = MLPClassifier(
            hidden_layer_sizes=(120,),
            max_iter=1000,  # Increased iterations for better convergence
            learning_rate_init=0.0005,
            random_state=42
        )
        model.fit(X_train, y_train)
        print(f"Retrained model held-out accuracy: {model.score(X_test, y_test):.2f}")

        # Re-evaluate all files with the retrained model
        print("\nRe-evaluating all files with retrained model:")
        for audio_file in audio_files:
            audio_path = os.path.join(audio_folder, audio_file)
            try:
                start_emotion, end_emotion, end_features = analyse_audio_file(model, audio_path)
            except Exception as e:
                print(f"Error re-evaluating {audio_file}: {str(e)}")
                print("-" * 50)
                continue

            print(f"File: {audio_file}")
            print(f"Updated First 20 seconds emotion: {start_emotion}")
            print(f"Updated Last 25 seconds emotion: {end_emotion}")

            expected_emotion = expected_emotion_from_name(audio_file)
            if expected_emotion and end_features is not None:
                if end_emotion != expected_emotion:
                    print(f"Retrained model still has mismatch! Expected: {expected_emotion}, Predicted: {end_emotion}")
                else:
                    print(f"Retrained model matches expected emotion: {expected_emotion}")
            print("-" * 50)

        # Save updated data and model. The feature file holds unscaled
        # features; the saved scaler is the one the retrained model expects.
        np.save('/content/feature_data_updated.npy', feature_data)
        np.save('/content/feature_labels_updated.npy', feature_labels)
        joblib.dump(model, '/content/emotion_model_epic_retrained.joblib')
        joblib.dump(scaler, '/content/scaler_epic_retrained.joblib')
        print("Model retrained and saved as /content/emotion_model_epic_retrained.joblib with updated data.")
    else:
        print("No new data for retraining. Model remains unchanged.")