In [7]:
import os
import subprocess
import pandas as pd
from tqdm import tqdm

# Configuration
OPENSMILE_PATH = "opensmile/build/progsrc/smilextract/SMILExtract"  # Path to SMILExtract
CONFIG_PATH = "opensmile/config/is09-13/IS09_emotion.conf"          # Config file
INPUT_DIR = "hw3_speech_files"                              # Folder with WAV files
OUTPUT_CSV = "opensmile_features.csv"                       # Output CSV

def extract_features(wav_path, output_path):
    """Extract features from single WAV file"""
    cmd = [
        OPENSMILE_PATH,
        "-C", CONFIG_PATH,
        "-I", wav_path,
        "-O", output_path,
        "-l", "0"  # Disable logging
    ]
    try:
        subprocess.run(cmd, check=True, capture_output=True)
        return True
    except subprocess.CalledProcessError:
        return False

def main():
    # Create temporary output directory
    os.makedirs("temp_features", exist_ok=True)
    
    # Process all WAV files
    all_features = []
    failed_files = []
    
    wav_files = [f for f in os.listdir(INPUT_DIR) if f.endswith(".wav")]
    
    for wav_file in tqdm(wav_files, desc="Processing files"):
        wav_path = os.path.join(INPUT_DIR, wav_file)
        temp_output = os.path.join("temp_features", f"{os.path.splitext(wav_file)[0]}.csv")
        
        if extract_features(wav_path, temp_output):
            try:
                # Read the temporary CSV
                df = pd.read_csv(temp_output)
                # Get emotion from filename (assuming format: speaker_session_emotion_...)
                emotion = wav_file.split("_")[2]
                # Add filename and emotion
                features = df.iloc[0].to_dict()  # Use iloc[-1] for last frame if needed
                features.update({
                    "filename": wav_file,
                    "emotion": emotion
                })
                all_features.append(features)
            except Exception as e:
                failed_files.append(wav_file)
        else:
            failed_files.append(wav_file)
    
    # Combine all features into DataFrame
    if all_features:
        df_features = pd.DataFrame(all_features)
        df_features.to_csv(OUTPUT_CSV, index=False)
        print(f"Successfully processed {len(all_features)} files. Results saved to {OUTPUT_CSV}")
    else:
        print("No files were processed successfully.")
    
    if failed_files:
        print(f"Failed to process {len(failed_files)} files:")
        for f in failed_files:
            print(f" - {f}")

if __name__ == "__main__":
    main()

Processing files: 100%|██████████| 2324/2324 [01:47<00:00, 21.70it/s]

No files were processed successfully.
Failed to process 2324 files:
 - mf_001_interest_2590.84_Eight-hundred-two.wav
 - mf_001_anxiety_1171.28_Six-hundred-one.wav
 - cc_001_pride_2501.34_March-twenty-fifth.wav
 - cc_001_boredom_2278.62_Six-hundred-six.wav
 - jg_001_panic_443.70_Fifty-seven.wav
 - mm_001_cold-anger_1680.47_seventy-five.wav
 - cl_001_neutral_45.92_Two-thousand-two.wav
 - gg_001_contempt_2270.73_Six-hundred-three.wav
 - cl_001_elation_834.87_Eighteen-hundred.wav
 - mk_001_interest_2139.89_November-twelfth.wav
 - cl_001_contempt_1612.03_Eight-hundred-nine.wav
 - gg_001_interest_1824.60_Two-thousand-five.wav
 - gg_001_anxiety_811.89_June-twenty-first.wav
 - gg_001_happy_1688.29_Six-thousand-five.wav
 - jg_001_shame_1732.19_July-eighteenth.wav
 - gg_001_disgust_619.74_Six-hundred-twelve.wav
 - jg_001_sadness_1177.27_June-twenty-eighth.wav
 - gg_001_happy_1655.03_May-twentieth.wav
 - mk_001_boredom_2305.64_June-twenty-fourth.wav
 - mf_001_anxiety_1121.12_March-twenty-ninth.wa




In [8]:
import os
import subprocess
import pandas as pd
from tqdm import tqdm

# Configuration
OPENSMILE_PATH = "opensmile/build/progsrc/smilextract/SMILExtract"
CONFIG_PATH = "opensmile/config/is09-13/IS09_emotion.conf"
INPUT_DIR = "hw3_speech_files"
OUTPUT_CSV = "all_features.csv"

def process_file(wav_file):
    """Process a single WAV file and return its features"""
    temp_output = f"temp_{os.urandom(4).hex()}.arff"
    wav_path = os.path.join(INPUT_DIR, wav_file)
    
    cmd = [
        OPENSMILE_PATH,
        "-C", CONFIG_PATH,
        "-I", wav_path,
        "-O", temp_output,
        "-loglevel", "0",
        "-noconsoleoutput", "1"
    ]
    
    try:
        subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        
        # Parse ARFF output
        with open(temp_output) as f:
            lines = f.readlines()
        
        # Extract data section
        data_start = lines.index("@data\n") + 1
        features = lines[data_start].strip().split(",")
        
        # Get emotion from filename (adjust index as needed)
        emotion = wav_file.split("_")[2]
        
        # Get attribute names
        attributes = []
        for line in lines:
            if line.startswith("@attribute"):
                attr_name = line.split()[1]
                attributes.append(attr_name)
        
        # Combine into dictionary
        result = {"filename": wav_file, "emotion": emotion}
        result.update(dict(zip(attributes[1:], features[1:])))  # Skip 'name' attribute
        
        return result
        
    except Exception as e:
        print(f"Error processing {wav_file}: {str(e)}")
        return None
    finally:
        if os.path.exists(temp_output):
            os.remove(temp_output)

def main():
    # Get all WAV files
    wav_files = [f for f in os.listdir(INPUT_DIR) if f.endswith(".wav")]
    
    # Process all files
    results = []
    for wav_file in tqdm(wav_files, desc="Processing files"):
        result = process_file(wav_file)
        if result:
            results.append(result)
    
    # Save to CSV
    if results:
        df = pd.DataFrame(results)
        df.to_csv(OUTPUT_CSV, index=False)
        print(f"\nSuccessfully processed {len(results)} files. Results saved to {OUTPUT_CSV}")
        
        # Show sample output
        print("\nSample output:")
        print(df.head())
    else:
        print("No files were processed successfully.")

if __name__ == "__main__":
    main()

Processing files: 100%|██████████| 2324/2324 [01:42<00:00, 22.59it/s]


Successfully processed 2324 files. Results saved to all_features.csv

Sample output:
                                        filename   emotion
0  mf_001_interest_2590.84_Eight-hundred-two.wav  interest
1     mf_001_anxiety_1171.28_Six-hundred-one.wav   anxiety
2    cc_001_pride_2501.34_March-twenty-fifth.wav     pride
3     cc_001_boredom_2278.62_Six-hundred-six.wav   boredom
4            jg_001_panic_443.70_Fifty-seven.wav     panic





In [3]:
import os
import subprocess
import pandas as pd
import tempfile

# Paths
wav_dir = "hw3_speech_files"
opensmile_bin = "opensmile/build/progsrc/smilextract/SMILExtract"
config_file = "opensmile/config/is09-13/IS09_emotion.conf"
output_csv = "all_features_combined.csv"

# Prepare to collect all features
all_data = []
column_names = None

for wav_file in os.listdir(wav_dir):
    if wav_file.endswith(".wav"):
        wav_path = os.path.join(wav_dir, wav_file)

        # Use a temporary file to capture OpenSMILE output
        with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmpfile:
            tmp_csv_path = tmpfile.name

        # Run OpenSMILE on the current file
        subprocess.run([
            opensmile_bin,
            "-C", config_file,
            "-I", wav_path,
            "-O", tmp_csv_path
        ], check=True)

        # Parse OpenSMILE ARFF-style CSV
        with open(tmp_csv_path, 'r') as f:
            lines = f.readlines()

        # Extract headers
        # Extract column headers
        if column_names is None:
            column_names = [
                line.split()[1]
                for line in lines
                if line.strip().startswith("@attribute")
            ]
            column_names.insert(0, "filename")

        # Find first line that looks like feature values (comma-separated numbers)
        feature_line = next(
            (line.strip() for line in lines if line.strip() and not line.startswith("@")),
            None
        )

        if feature_line is None:
            print(f"⚠️ No feature data found for {wav_file}, skipping.")
            continue
        
        values = feature_line.split(',')
        values.insert(0, wav_file)  # add filename as first column

        all_data.append(values)

        # Clean up temporary file
        os.remove(tmp_csv_path)

# Save final merged CSV
df = pd.DataFrame(all_data, columns=column_names)
df.to_csv(output_csv, index=False)

print(f"✅ All features extracted and saved to {output_csv}")


(MSG) [2] SMILExtract: openSMILE starting!
(MSG) [2] SMILExtract: config file is: opensmile/config/is09-13/IS09_emotion.conf
(MSG) [2] cComponentManager: successfully registered 102 component types.
(MSG) [2] instance 'lldsink': No filename given, disabling this sink component.
(MSG) [2] instance 'lldhtksink': No filename given, disabling this sink component.
(MSG) [2] instance 'lldarffsink': No filename given, disabling this sink component.
(MSG) [2] instance 'csvsink': No filename given, disabling this sink component.
(MSG) [2] instance 'htksink': No filename given, disabling this sink component.
(MSG) [2] cComponentManager: successfully finished createInstances (25 component instances were finalised, 1 data memories were finalised)
(MSG) [2] cComponentManager: starting single thread processing loop
(MSG) [2] cComponentManager: Processing finished! System ran for 105 ticks.
(MSG) [2] SMILExtract: openSMILE starting!
(MSG) [2] SMILExtract: config file is: opensmile/config/is09-13/IS09

ValueError: 1 columns passed, passed data had 387 columns

In [4]:
import os
import subprocess
import pandas as pd
import tempfile

wav_dir = "hw3_speech_files"
opensmile_bin = "opensmile/build/progsrc/smilextract/SMILExtract"
config_file = "opensmile/config/is09-13/IS09_emotion.conf"
output_csv = "all_features_combined.csv"

all_data = []
column_names = None

for wav_file in os.listdir(wav_dir):
    if wav_file.endswith(".wav"):
        wav_path = os.path.join(wav_dir, wav_file)

        with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmpfile:
            tmp_csv_path = tmpfile.name

        subprocess.run([
            opensmile_bin,
            "-C", config_file,
            "-I", wav_path,
            "-O", tmp_csv_path
        ], check=True)

        with open(tmp_csv_path, 'r') as f:
            lines = f.readlines()

        # Try to extract @attribute headers (ARFF-style)
        attributes = [
            line.split()[1]
            for line in lines if line.strip().startswith("@attribute")
        ]

        # Get first line that looks like a feature vector
        feature_line = next(
            (line.strip() for line in lines if line.strip() and not line.startswith("@")),
            None
        )

        if feature_line is None:
            print(f"⚠️ No feature data found in {wav_file}, skipping.")
            continue

        values = feature_line.split(',')

        # If no attribute lines, create dummy headers
        if column_names is None:
            if attributes:
                column_names = ["filename"] + attributes
            else:
                column_names = ["filename"] + [f"feature_{i}" for i in range(len(values))]

        # Check for mismatch and skip broken rows
        if len(values) != len(column_names) - 1:
            print(f"⚠️ Skipping {wav_file}: {len(values)} features vs {len(column_names) - 1} headers.")
            continue

        values.insert(0, wav_file)
        all_data.append(values)

        os.remove(tmp_csv_path)

# Create and save combined DataFrame
df = pd.DataFrame(all_data, columns=column_names)
df.to_csv(output_csv, index=False)
print(f"✅ All features extracted and saved to {output_csv}")


(MSG) [2] SMILExtract: openSMILE starting!
(MSG) [2] SMILExtract: config file is: opensmile/config/is09-13/IS09_emotion.conf
(MSG) [2] cComponentManager: successfully registered 102 component types.
(MSG) [2] instance 'lldsink': No filename given, disabling this sink component.
(MSG) [2] instance 'lldhtksink': No filename given, disabling this sink component.
(MSG) [2] instance 'lldarffsink': No filename given, disabling this sink component.
(MSG) [2] instance 'csvsink': No filename given, disabling this sink component.
(MSG) [2] instance 'htksink': No filename given, disabling this sink component.
(MSG) [2] cComponentManager: successfully finished createInstances (25 component instances were finalised, 1 data memories were finalised)
(MSG) [2] cComponentManager: starting single thread processing loop
(MSG) [2] cComponentManager: Processing finished! System ran for 105 ticks.
(MSG) [2] SMILExtract: openSMILE starting!
(MSG) [2] SMILExtract: config file is: opensmile/config/is09-13/IS09

✅ All features extracted and saved to all_features_combined.csv


In [6]:
import os
import subprocess
import pandas as pd
from tqdm import tqdm

# Configuration
OPENSMILE_PATH = "opensmile/build/progsrc/smilextract/SMILExtract"
CONFIG_PATH = "opensmile/config/is09-13/IS09_emotion.conf"
WAV_DIR = "hw3_speech_files"
OUTPUT_CSV = "is09_features.csv"

def extract_features(wav_path):
    """Extract features with robust CSV handling"""
    temp_output = f"temp_{os.path.basename(wav_path)}.csv"
    
    cmd = [
        OPENSMILE_PATH,
        "-C", CONFIG_PATH,
        "-I", wav_path,
        "-O", temp_output,
        "-loglevel", "0",
        "-noconsoleoutput", "1",
        "-appendcsv", "0"
    ]
    
    try:
        subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        
        # Robust CSV reading with error handling
        try:
            # First try standard read
            df = pd.read_csv(temp_output)
        except pd.errors.ParserError:
            # If failed, manually parse problematic CSV
            with open(temp_output) as f:
                lines = f.readlines()
            
            # Find data section
            data_idx = next(i for i, line in enumerate(lines) if line.startswith("@data")) + 1
            header = [line.split()[1] for line in lines if line.startswith("@attribute")]
            data = [line.strip().split(',') for line in lines[data_idx:]]
            
            # Create DataFrame
            df = pd.DataFrame(data, columns=header)
        
        # Add metadata
        filename = os.path.basename(wav_path)
        emotion = filename.split('_')[2]  # Adjust index as needed
        df.insert(0, 'filename', filename)
        df.insert(1, 'emotion', emotion)
        
        return df
        
    except Exception as e:
        print(f"Error processing {wav_path}: {str(e)}")
        return None
    finally:
        if os.path.exists(temp_output):
            os.remove(temp_output)

def main():
    wav_files = [os.path.join(WAV_DIR, f) for f in os.listdir(WAV_DIR) if f.endswith('.wav')]
    all_features = []
    
    for wav_file in tqdm(wav_files, desc="Processing WAV files"):
        features = extract_features(wav_file)
        if features is not None:
            all_features.append(features)
    
    if all_features:
        # Handle varying column counts
        all_columns = set()
        for df in all_features:
            all_columns.update(df.columns)
        
        # Reindex all DataFrames
        processed_dfs = []
        for df in all_features:
            missing_cols = all_columns - set(df.columns)
            for col in missing_cols:
                df[col] = None  # Add missing columns
            processed_dfs.append(df[list(all_columns)])  # Reorder columns
        
        final_df = pd.concat(processed_dfs, ignore_index=True)
        final_df.to_csv(OUTPUT_CSV, index=False)
        print(f"\nSuccess! Features saved to {OUTPUT_CSV}")
        print(f"Total files processed: {len(processed_dfs)}")
        print(f"Feature dimensions: {final_df.shape[1]-2} features per file")
    else:
        print("No files were processed successfully.")

if __name__ == "__main__":
    main()

Processing WAV files: 100%|██████████| 2324/2324 [01:11<00:00, 32.64it/s]



Success! Features saved to is09_features.csv
Total files processed: 2324
Feature dimensions: 386 features per file


In [7]:
import pandas as pd

# Load the CSV file
input_csv = "is09_features.csv"
output_csv = "cleaned_features.csv"

# Read the CSV, skipping empty rows
df = pd.read_csv(input_csv)

# Remove completely empty rows (where all values are NA)
df_cleaned = df.dropna(how='all')

# Alternative: Remove rows where specific key columns are empty
# df_cleaned = df.dropna(subset=['important_column1', 'important_column2'])

# Save the cleaned data
df_cleaned.to_csv(output_csv, index=False)

print(f"Original rows: {len(df)}")
print(f"Cleaned rows: {len(df_cleaned)}")
print(f"Saved cleaned data to {output_csv}")

Original rows: 4648
Cleaned rows: 4648
Saved cleaned data to cleaned_features.csv
