# Audio Data Collection and Voiceprint Verification Model
## Formative 2: Multimodal Data Preprocessing Assignment

## 1. Import Required Libraries

In [None]:
import librosa
import librosa.display
import soundfile as sf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Confirm the imports above ran and report which librosa version is installed.
print(f"Libraries imported | Librosa version: {librosa.__version__}")

## 2. Directory Structure Setup

In [None]:
# Project root; every dataset and model path used later is derived from it.
# NOTE(review): machine-specific absolute path - adjust when running elsewhere.
base_dir = Path(r'c:\Users\evotech\Documents\MACHINE LEARNING PROJECTS\Group_5_Multimodal-Data-Preprocessing-Assignment')
audio_dir = base_dir / 'Dataset' / 'audio_samples'

# One sub-folder per sample category.
original_dir = audio_dir / 'original'
augmented_dir = audio_dir / 'augmented'
unauthorized_dir = audio_dir / 'unauthorized'

# Create whatever is missing (parents included); existing folders are left as is.
for _folder in (original_dir, augmented_dir, unauthorized_dir):
    _folder.mkdir(parents=True, exist_ok=True)

print(
    "Directories ready:\n"
    f"   Original: {original_dir}\n"
    f"   Augmented: {augmented_dir}\n"
    f"   Unauthorized: {unauthorized_dir}"
)

## 3. Load Audio Samples

In [None]:
def _split_member_phrase(stem):
    """Split a recording's file stem into ``(member, phrase)``.

    The text before the first ``_`` is the member and everything after it is
    the phrase. Stems without an underscore fall back to the first ``-`` as
    the separator. A stem containing neither separator is returned for both
    values.
    """
    for separator in ('_', '-'):
        if separator in stem:
            member, _, phrase = stem.partition(separator)
            return member, phrase
    return stem, stem


# Sort once so the load order, and any later slice of audio_files, is deterministic.
audio_files = sorted(original_dir.glob('*.wav'))
audio_data = []

print(f"Found {len(audio_files)} audio files:")
print("=" * 70)

for audio_file in audio_files:
    # sr=None asks librosa to keep the file's own sampling rate.
    y, sr = librosa.load(audio_file, sr=None)
    duration = librosa.get_duration(y=y, sr=sr)

    member, phrase = _split_member_phrase(audio_file.stem)

    # Keep the decoded signal in memory so later cells need not reload the file.
    audio_data.append({
        'filename': audio_file.name,
        'member': member,
        'phrase': phrase,
        'sample_rate': sr,
        'duration': duration,
        'samples': len(y),
        'audio': y,
        'path': str(audio_file)
    })

    print(f"{audio_file.name} | {member} | {sr} Hz | {duration:.2f}s")

print(f"\nLoaded {len(audio_data)} audio samples")

## 4. Audio Visualization

In [None]:
def visualize_audio(audio_path, member_name, phrase):
    """Plot one recording's waveform and spectrogram, then print summary stats.

    Args:
        audio_path: Path of the audio file; loaded with ``librosa.load(sr=None)``.
        member_name: Speaker label shown in the plot titles and printed text.
        phrase: Phrase label shown in the plot titles and printed text.

    Returns:
        None. The figure is shown with ``plt.show()`` and the duration, peak
        amplitude, RMS energy and mean zero-crossing flag are printed.
    """
    signal, rate = librosa.load(audio_path, sr=None)
    fig, (wave_ax, spec_ax) = plt.subplots(2, 1, figsize=(14, 8))

    # Top panel: time-domain waveform.
    librosa.display.waveshow(signal, sr=rate, ax=wave_ax, color='blue', alpha=0.7)
    wave_ax.set_title(f'Waveform: {member_name} - "{phrase}"', fontsize=14, fontweight='bold')
    wave_ax.set_xlabel('Time (seconds)')
    wave_ax.set_ylabel('Amplitude')
    wave_ax.grid(True, alpha=0.3)

    # Bottom panel: STFT magnitude in dB, referenced to the loudest bin.
    magnitude = np.abs(librosa.stft(signal))
    D = librosa.amplitude_to_db(magnitude, ref=np.max)
    img = librosa.display.specshow(D, sr=rate, x_axis='time', y_axis='hz', ax=spec_ax, cmap='viridis')
    spec_ax.set_title(f'Spectrogram: {member_name} - "{phrase}"', fontsize=14, fontweight='bold')
    spec_ax.set_xlabel('Time (seconds)')
    spec_ax.set_ylabel('Frequency (Hz)')
    fig.colorbar(img, ax=spec_ax, format='%+2.0f dB')

    plt.tight_layout()
    plt.show()

    # Summary statistics are computed only after the figure has been shown.
    duration_seconds = len(signal) / rate
    peak_amplitude = np.max(np.abs(signal))
    rms_energy = np.sqrt(np.mean(signal ** 2))
    crossing_rate = np.mean(librosa.zero_crossings(signal))

    print(f"Analysis for {member_name} - '{phrase}':")
    print(f"   Duration: {duration_seconds:.2f}s | Max Amplitude: {peak_amplitude:.4f}")
    print(f"   RMS Energy: {rms_energy:.4f} | ZCR: {crossing_rate:.4f}\n")

In [None]:
# Show the plots and printed statistics for every loaded original recording.
print("AUDIO VISUALIZATIONS")
print("=" * 70)

for data in audio_data:
    visualize_audio(
        audio_path=data['path'],
        member_name=data['member'],
        phrase=data['phrase'],
    )
    # Divider followed by a blank line between recordings.
    print("=" * 70, end="\n\n")

## 5. Audio Augmentation

In [None]:
def pitch_shift(y, sr, n_steps=2):
    """Return ``y`` pitch-shifted with ``librosa.effects.pitch_shift``.

    Args:
        y: Audio signal to shift.
        sr: Sampling rate of ``y``.
        n_steps: Number of steps to shift by, passed straight to librosa.

    Returns:
        The shifted signal as returned by librosa.
    """
    shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)
    return shifted

def time_stretch(y, rate=1.2):
    """Return ``y`` time-stretched with ``librosa.effects.time_stretch``.

    Args:
        y: Audio signal to stretch.
        rate: Stretch factor passed straight to librosa; the default 1.2 is
            the value this notebook labels "faster".

    Returns:
        The stretched signal as returned by librosa.
    """
    stretched = librosa.effects.time_stretch(y, rate=rate)
    return stretched

def add_noise(y, noise_factor=0.005, rng=None):
    """Return ``y`` with additive Gaussian noise.

    Args:
        y: Audio signal (array-like of any shape). The noise is drawn with the
            same shape, so multi-dimensional input is handled as well as 1-D.
        noise_factor: Scale applied to the standard-normal noise before it is
            added to ``y``.
        rng: Optional object exposing ``standard_normal(shape)`` (for example
            ``numpy.random.default_rng(seed)``) for reproducible noise. When
            omitted, NumPy's global random state is used, so seeding with
            ``np.random.seed`` keeps working.

    Returns:
        ``y + noise_factor * noise``.
    """
    shape = np.shape(y)
    if rng is None:
        noise = np.random.standard_normal(shape)
    else:
        noise = rng.standard_normal(shape)
    return y + noise_factor * noise

def change_volume(y, factor=0.7):
    """Return ``y`` multiplied by ``factor``.

    Args:
        y: Audio signal.
        factor: Gain to multiply the signal by (default 0.7).

    Returns:
        The scaled signal; no clipping is applied.
    """
    scaled = factor * y
    return scaled

# Confirm that the augmentation helpers in this cell have been defined.
print("Augmentation functions ready")

In [None]:
print("Applying Augmentations...")
print("=" * 70)

# One record per augmented clip, filled in the loop below.
augmented_data = []

for data in audio_data:
    y = data['audio']
    sr = data['sample_rate']
    member = data['member']
    phrase = data['phrase']
    base_filename = f"{member}_{phrase}"

    # Build the four variants of this recording, in a fixed order.
    pitched = pitch_shift(y, sr, n_steps=2)
    stretched = time_stretch(y, rate=1.2)
    noisy = add_noise(y, noise_factor=0.005)
    quieter = change_volume(y, factor=0.7)
    augmentations = [
        (pitched, 'pitch_up'),
        (stretched, 'faster'),
        (noisy, 'noise'),
        (quieter, 'volume_low'),
    ]

    # Write each variant to disk and keep it in memory for feature extraction.
    for aug_audio, aug_type in augmentations:
        aug_path = augmented_dir / f"{base_filename}_{aug_type}.wav"
        sf.write(aug_path, aug_audio, sr)
        augmented_data.append(dict(
            original_file=data['filename'],
            member=member,
            phrase=phrase,
            augmentation=aug_type,
            audio=aug_audio,
            sample_rate=sr,
            path=str(aug_path),
        ))

    print(f"[OK] {data['filename']}: 4 augmentations created")

original_count = len(audio_data)
augmented_count = len(augmented_data)
print(f"\nTotal: {original_count} original + {augmented_count} augmented = {original_count + augmented_count} samples")

## 6. Feature Extraction

In [None]:
def extract_audio_features(y, sr, n_mfcc=13):
    """Summarise an audio signal as a flat dict of mean/std features.

    Args:
        y: Audio signal, as returned by ``librosa.load``.
        sr: Sampling rate of ``y``.
        n_mfcc: Number of MFCC coefficients to compute and summarise. The
            default of 13 yields the ``mfcc_1`` .. ``mfcc_13`` columns used
            elsewhere in this notebook. A model and scaler must be trained and
            applied with the same value, because it changes the set of keys.

    Returns:
        dict mapping feature name to value, in this insertion order:
        ``mfcc_<i>_mean`` / ``mfcc_<i>_std`` for each coefficient, then
        ``<name>_mean`` / ``<name>_std`` for spectral_rolloff, rms_energy,
        zero_crossing_rate, spectral_centroid, chroma and spectral_bandwidth.
    """
    features = {}

    # Per-coefficient statistics across frames.
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    for i in range(n_mfcc):
        features[f'mfcc_{i+1}_mean'] = np.mean(mfccs[i])
        features[f'mfcc_{i+1}_std'] = np.std(mfccs[i])

    # Remaining descriptors are each reduced to one mean and one std over the
    # whole array. The order below fixes the dict (and DataFrame column) order.
    summarised = (
        ('spectral_rolloff', librosa.feature.spectral_rolloff(y=y, sr=sr)),
        ('rms_energy', librosa.feature.rms(y=y)),
        ('zero_crossing_rate', librosa.feature.zero_crossing_rate(y)),
        ('spectral_centroid', librosa.feature.spectral_centroid(y=y, sr=sr)),
        ('chroma', librosa.feature.chroma_stft(y=y, sr=sr)),
        ('spectral_bandwidth', librosa.feature.spectral_bandwidth(y=y, sr=sr)),
    )
    for name, values in summarised:
        features[f'{name}_mean'] = np.mean(values)
        features[f'{name}_std'] = np.std(values)

    return features

# Confirm that extract_audio_features has been defined.
print("Feature extraction function ready")

In [None]:
print("Extracting Features...")
print("=" * 70)

# Feature rows accumulate here: originals first, augmented clips after.
all_features = []

# Original recordings, all labelled as authorized.
for data in audio_data:
    features = extract_audio_features(data['audio'], data['sample_rate'])
    features['filename'] = data['filename']
    features['member'] = data['member']
    features['phrase'] = data['phrase']
    features['augmentation'] = 'original'
    features['is_authorized'] = 1
    all_features.append(features)

# Augmented clips: same label, with the augmentation type recorded and the
# written file's name used as the filename.
for data in augmented_data:
    features = extract_audio_features(data['audio'], data['sample_rate'])
    features['filename'] = Path(data['path']).name
    features['member'] = data['member']
    features['phrase'] = data['phrase']
    features['augmentation'] = data['augmentation']
    features['is_authorized'] = 1
    all_features.append(features)

features_df = pd.DataFrame(all_features)

row_count, column_count = features_df.shape
print(f"Features extracted: {row_count} samples x {column_count} columns")

In [None]:
# Preview the first rows of the feature table.
print("Feature Sample:")
print("=" * 70)
preview = features_df.head(10)
display(preview)

print("\nFeature Statistics:")
# Numeric feature columns are the ones whose name contains one of these tokens.
feature_tokens = ('mfcc', 'spectral', 'rms', 'zero', 'chroma')
feature_cols = [
    col for col in features_df.columns
    if any(token in col for token in feature_tokens)
]
display(features_df[feature_cols].describe())

## 7. Process Unauthorized Samples

In [None]:
unauthorized_files = list(unauthorized_dir.glob('*.wav'))

print(f"Found {len(unauthorized_files)} unauthorized samples")
print("=" * 70)

if not unauthorized_files:
    # Nothing to add; features_df keeps only the authorized rows.
    print("WARNING: No unauthorized samples found")
    print(f"   Add to: {unauthorized_dir}")
else:
    # NOTE(review): rows are appended to the shared all_features list, so
    # re-running this cell adds the same files again.
    for audio_file in unauthorized_files:
        y, sr = librosa.load(audio_file, sr=None)
        features = extract_audio_features(y, sr)
        features['filename'] = audio_file.name
        features['member'] = 'unauthorized'
        features['phrase'] = 'unknown'
        features['augmentation'] = 'original'
        features['is_authorized'] = 0
        all_features.append(features)
        print(f"[OK] {audio_file.name}")

    # Rebuild the table so it holds authorized and unauthorized rows.
    features_df = pd.DataFrame(all_features)
    print(f"\nTotal samples: {len(features_df)} (authorized + unauthorized)")

## 8. Save audio_features.csv

In [None]:
# Write the whole feature table (features plus metadata columns) to CSV
# without the DataFrame index.
output_path = base_dir / 'Dataset' / 'audio_features.csv'
features_df.to_csv(output_path, index=False)

saved_shape = features_df.shape
print(f"Saved: {output_path}")
print(f"   Shape: {saved_shape}")
print(f"   Columns: {saved_shape[1]}")
print("audio_features.csv created")

## 9. Exploratory Data Analysis

In [None]:
print("AUDIO FEATURES EDA")
print("=" * 70)
print("\nDataset Info:")
# DataFrame.info() prints its summary itself.
features_df.info()

print("\n" + "=" * 70)
print("Authorization Distribution:")
authorization_counts = features_df['is_authorized'].value_counts()
print(authorization_counts)

print("\nMember Distribution:")
member_counts = features_df['member'].value_counts()
print(member_counts)

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# One panel per coefficient (MFCC 1-4 means), grouped by member.
for mfcc_number, ax in enumerate(axes.flat, start=1):
    mfcc_col = f'mfcc_{mfcc_number}_mean'
    if mfcc_col not in features_df.columns:
        # Leave the panel empty when the column is absent.
        continue
    features_df.boxplot(column=mfcc_col, by='member', ax=ax)
    ax.set_title(f'MFCC {mfcc_number} by Member')
    ax.set_xlabel('Member')
    ax.set_ylabel(f'MFCC {mfcc_number} Mean')

plt.suptitle('MFCC Features Distribution', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Subset of features shown in the correlation heatmap.
selected_features = [
    'mfcc_1_mean',
    'mfcc_2_mean',
    'spectral_rolloff_mean',
    'rms_energy_mean',
    'zero_crossing_rate_mean',
    'spectral_centroid_mean',
    'chroma_mean',
]
correlation = features_df[selected_features].corr()

plt.figure(figsize=(12, 8))
sns.heatmap(
    correlation,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# (column, display label) for each panel, left to right.
comparison_panels = [
    ('rms_energy_mean', 'RMS Energy'),
    ('spectral_rolloff_mean', 'Spectral Rolloff'),
    ('mfcc_1_mean', 'MFCC 1'),
]

for ax, (column, label) in zip(axes, comparison_panels):
    features_df.boxplot(column=column, by='is_authorized', ax=ax)
    ax.set_title(f'{label}: Authorized vs Unauthorized')
    ax.set_xlabel('Is Authorized (0=No, 1=Yes)')
    ax.set_ylabel(f'{label} Mean')

plt.suptitle('Authorized vs Unauthorized Comparison', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

## 10. Voiceprint Verification Model

In [None]:
print("MODEL DEVELOPMENT")
print("=" * 70)

# Model inputs are the columns whose name contains one of these tokens; the
# metadata columns (filename, member, phrase, augmentation) and the label
# match none of them.
feature_tokens = ('mfcc', 'spectral', 'rms', 'zero', 'chroma')
feature_columns = [
    col for col in features_df.columns
    if any(token in col for token in feature_tokens)
]

X = features_df[feature_columns]
y = features_df['is_authorized']  # binary target: 1 = authorized, 0 = unauthorized

print(f"Features: {X.shape} | Target: {y.shape}")
class_counts = y.value_counts()
print(f"Class distribution:\n{class_counts}")

In [None]:
# Hold out 20% of the rows, stratified on the label, with a fixed seed.
# NOTE(review): rows are split individually, so an original recording and its
# augmented variants can land on different sides of the split, which may
# inflate the test scores. Consider a group-aware split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the scaler on the training rows only, then apply it to both subsets.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Train: {X_train_scaled.shape} | Test: {X_test_scaled.shape}")

In [None]:
print("Training Models...")
print("=" * 70)

# Candidate classifiers, each with a fixed random seed.
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
}

# Fitted model, test metrics and test predictions, keyed by model name.
results = {}

for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    accuracy = accuracy_score(y_test, y_pred)
    weighted_f1 = f1_score(y_test, y_pred, average='weighted')

    results[model_name] = {
        'model': model,
        'accuracy': accuracy,
        'f1_score': weighted_f1,
        'predictions': y_pred,
    }

    print(f"{model_name}: Accuracy={accuracy:.4f}, F1={weighted_f1:.4f}")

print("\nModels trained")

In [None]:
print("MODEL EVALUATION")
print("=" * 70)

# Display names for labels 0 and 1, in that order.
class_names = ['Unauthorized', 'Authorized']

for model_name, result in results.items():
    predictions = result['predictions']

    print(f"\n{model_name}:")
    print(f"Accuracy: {result['accuracy']:.4f} | F1-Score: {result['f1_score']:.4f}")
    print("\nClassification Report:")
    # labels=[0, 1] keeps both classes in the report even if one is missing
    # from the test split; zero_division=0 fills the undefined metrics with 0.
    report = classification_report(
        y_test,
        predictions,
        labels=[0, 1],
        target_names=class_names,
        zero_division=0,
    )
    print(report)

    cm = confusion_matrix(y_test, predictions, labels=[0, 1])
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title(f'Confusion Matrix - {model_name}', fontsize=14, fontweight='bold')
    plt.ylabel('True')
    plt.xlabel('Predicted')
    plt.tight_layout()
    plt.show()
    print("=" * 70)

In [None]:
# Pick the model with the highest test accuracy; on a tie, max() keeps the
# first one in the results dict.
best_model_name = max(results, key=lambda name: results[name]['accuracy'])
best_entry = results[best_model_name]
best_model = best_entry['model']

print(f"BEST MODEL: {best_model_name}")
print(f"   Accuracy: {best_entry['accuracy']:.4f}")
print(f"   F1-Score: {best_entry['f1_score']:.4f}")

## 11. Save Model & Scaler

In [None]:
model_dir = base_dir / 'models'
model_dir.mkdir(exist_ok=True)

model_path = model_dir / 'voiceprint_model.pkl'
scaler_path = model_dir / 'audio_scaler.pkl'

# Pickle the selected model and the fitted scaler side by side, so that
# inference can apply the same scaling that was used for training.
for target_path, artifact in ((model_path, best_model), (scaler_path, scaler)):
    with open(target_path, 'wb') as f:
        pickle.dump(artifact, f)

print(f"Saved:\n   Model: {model_path}\n   Scaler: {scaler_path}")
print("Ready for system integration")

## 12. Verification Function

In [None]:
def verify_voiceprint(audio_path, model, scaler):
    """Classify one audio file as authorized or not.

    Args:
        audio_path: Path of the audio file to verify.
        model: Fitted classifier exposing ``predict``, ``predict_proba`` and
            ``classes_``.
        scaler: Fitted scaler exposing ``transform``. If it carries
            ``feature_names_in_``, those names select and order the feature
            columns; otherwise the module-level ``feature_columns`` list is
            used.

    Returns:
        dict with keys ``authorized`` (bool), ``confidence`` (float, the
        predicted class's probability) and ``message`` (str). Any exception
        raised while loading, extracting or predicting is caught and reported
        as a denial with confidence 0.0, so callers never see an exception.
    """
    try:
        y, sr = librosa.load(audio_path, sr=None)
        features = extract_audio_features(y, sr)

        # Prefer the column order the scaler was fitted with, so the function
        # does not depend on a notebook global when the scaler was fitted on a
        # DataFrame.
        columns = getattr(scaler, 'feature_names_in_', None)
        if columns is None:
            columns = feature_columns
        feature_vector = pd.DataFrame([features])[list(columns)]
        feature_vector_scaled = scaler.transform(feature_vector)

        prediction = model.predict(feature_vector_scaled)[0]
        probabilities = model.predict_proba(feature_vector_scaled)[0]

        # predict_proba columns follow model.classes_, so look up the
        # predicted label's position instead of using the label as an index.
        class_index = list(model.classes_).index(prediction)

        return {
            'authorized': bool(prediction),
            'confidence': float(probabilities[class_index]),
            'message': 'Voice verified - Access granted' if prediction else 'Voice not recognized - Access denied'
        }
    except Exception as e:
        # Deliberate fail-closed behaviour: any failure denies access.
        return {
            'authorized': False,
            'confidence': 0.0,
            'message': f'Verification failed: {str(e)}'
        }

# Confirm that verify_voiceprint has been defined.
print("Verification function ready")

## 13. Test Verification System

In [None]:
print("TESTING VOICEPRINT VERIFICATION")
print("=" * 70)
print("\nAuthorized Samples:")

# At most the first two original recordings; slicing copes with shorter lists.
test_authorized = audio_files[:2]

for audio_path in test_authorized:
    result = verify_voiceprint(audio_path, best_model, scaler)
    if result['authorized']:
        status = 'APPROVED'
    else:
        status = 'DENIED'
    print(f"{audio_path.name}: {status} ({result['confidence']:.2%})")

In [None]:
print("\nUnauthorized Samples:")

if not unauthorized_files:
    print("WARNING: No unauthorized samples to test")
else:
    # Run every unauthorized recording through the verifier.
    for audio_path in unauthorized_files:
        result = verify_voiceprint(audio_path, best_model, scaler)
        if result['authorized']:
            status = 'APPROVED'
        else:
            status = 'DENIED'
        print(f"{audio_path.name}: {status} ({result['confidence']:.2%})")

print("\nVerification testing complete")

## 14. Deliverables Summary

### Completed:
- Audio data collection and organization
- Waveform and spectrogram visualizations
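- 4 augmentations per original sample (pitch shift, time stretch, added noise, volume reduction)
- Feature extraction (MFCCs, spectral rolloff/centroid/bandwidth, RMS energy, zero-crossing rate, chroma)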
- `audio_features.csv` saved
- Voiceprint verification model trained and evaluated
- `voiceprint_model.pkl` and `audio_scaler.pkl` saved
- Verification function for system integration

### Files Created:
- `Dataset/audio_features.csv`
- `models/voiceprint_model.pkl`
- `models/audio_scaler.pkl`
- Augmented audio samples in `Dataset/audio_samples/augmented/`

### Next Steps:
1. Add unauthorized samples (if not done)
2. Integrate with face recognition and product recommendation models
3. Build command-line application
4. Create demonstration video
5. Write final report