# Final Project -- Audio Steganography & MFCC Steganalysis

## Amanda Foster

In [58]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import wave
from scipy.signal import butter, filtfilt
import pandas as pd

In [59]:
def apply_lowpass_filter(data, cutoff, sr, order=5):
    nyquist = 0.5 * sr
    normal_cutoff = cutoff / nyquist
    b, a = butter(order, normal_cutoff, btype='low', analog=False)
    filtered_data = filtfilt(b, a, data)
    
    return filtered_data

In [60]:
def insert_data(audio_file, message, output_file):
    try: 
      with wave.open(audio_file, 'rb') as file:
          frame_bytes = bytearray(file.readframes(file.getnframes()))
          sr = file.getframerate()
      
      cutoff = 10000  
      filtered_signal = apply_lowpass_filter(np.frombuffer(frame_bytes, dtype=np.int16), cutoff, sr)
      frame_bytes_filtered = filtered_signal.astype(np.int16).tobytes()
      
      message += "#####"
      message_bits = ''.join(format(ord(char), '08b') for char in message)
      message_length = len(message_bits)
      
      if message_length * 2 > len(frame_bytes_filtered):
          raise ValueError("Message is too large to be embedded in the audio file.")
      
      for i in range(message_length):
          frame_bytes[i] &= 0xFE
          frame_bytes[i] |= int(message_bits[i])
      
      with wave.open(output_file, 'wb') as file:
          file.setparams((1, 2, sr, len(frame_bytes), 'NONE', 'not compressed'))
          file.writeframes(frame_bytes)
      
    except Exception as e:
      print("An error occured while embedding the file: ", e)


def embed(audio_file, message):
  output_file = os.path.join("output_files", os.path.splitext(os.path.basename(audio_file))[0] + "_embedded")
  output_file += ".wav"
  
  insert_data(audio_file, message, output_file)

In [61]:
def extract_data(audio_file):
  try:
    with wave.open(audio_file, 'rb') as file:
        frame_bytes = bytearray(file.readframes(file.getnframes()))
    
    extracted_bits = []
    for byte in frame_bytes:
        extracted_bit = byte & 1
        extracted_bits.append(extracted_bit)
    
    extracted_message = ''.join(chr(int(''.join(map(str, extracted_bits[i:i+8])), 2)) for i in range(0, len(extracted_bits), 8))
    
    end_index = extracted_message.find('#####')
    if end_index != -1:
        extracted_message = extracted_message[:end_index]
    
    return extracted_message
  
  except Exception as e:
    print("An error occurred while extracting the message", e)
    return None
  
def extract(audio_file):
  extracted_message = extract_data(audio_file)
  return extracted_message

In [62]:
input_files_dir = "input_files"
output_files_dir = "output_files"
extracted_messages = []

for filename in os.listdir(input_files_dir):
    if filename.endswith(".wav"):
        audio_file = os.path.join(input_files_dir, filename)
        message = f'we embedded a message into {os.path.splitext(os.path.basename(audio_file))[0]}.wav'
        embed(audio_file, message)
        
        embedded_audio_file = os.path.join(output_files_dir, os.path.splitext(os.path.basename(audio_file))[0] + "_embedded.wav")
        extracted_message = extract(embedded_audio_file)
        
        extracted_messages.append(extracted_message)

print("Extracted messages:", extracted_messages)

Extracted messages: ['we embedded a message into LJ003-0252.wav', 'we embedded a message into LJ001-0118.wav', 'we embedded a message into LJ003-0300.wav', 'we embedded a message into LJ001-0016.wav', 'we embedded a message into LJ002-0258.wav', 'we embedded a message into LJ003-0132.wav', 'we embedded a message into LJ001-0125.wav', 'we embedded a message into LJ001-0059.wav', 'we embedded a message into LJ001-0141.wav', 'we embedded a message into LJ002-0103.wav', 'we embedded a message into LJ003-0212.wav', 'we embedded a message into LJ002-0223.wav', 'we embedded a message into LJ001-0066.wav', 'we embedded a message into LJ002-0046.wav', 'we embedded a message into LJ001-0110.wav', 'we embedded a message into LJ002-0294.wav', 'we embedded a message into LJ002-0146.wav', 'we embedded a message into LJ002-0200.wav', 'we embedded a message into LJ003-0149.wav', 'we embedded a message into LJ003-0214.wav', 'we embedded a message into LJ002-0169.wav', 'we embedded a message into LJ003-

In [63]:
def extract_features(audio_file):
    y, sr = librosa.load(audio_file)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=8)
    return np.mean(mfccs, axis=1)  

In [64]:
input_files_dir = 'input_files'
output_files_dir = 'output_files'

input_data = []
input_labels = []
input_filenames = []

output_data = []
output_labels = []
output_filenames = []
output_correct = []

for filename in os.listdir(input_files_dir):
    if filename.endswith('.wav'):
        audio_file = os.path.join(input_files_dir, filename)
        features = extract_features(audio_file)
        input_data.append(features)
        input_labels.append(0)
        input_filenames.append(filename)

for filename in os.listdir(output_files_dir):
    if filename.endswith('.wav'):
        audio_file = os.path.join(output_files_dir, filename)
        features = extract_features(audio_file)
        output_data.append(features)
        output_labels.append(1)
        output_filenames.append(filename)
        output_correct.append(extract(audio_file) == f'we embedded a message into {os.path.splitext(os.path.basename(audio_file))[0].split("_")[0]}.wav')

input_df = pd.DataFrame({
    'filename': input_filenames,
    'data': input_data,
    'label': input_labels,
})

output_df = pd.DataFrame({
    'filename': output_filenames,
    'data': output_data,
    'label': output_labels,
    'output_correct': output_correct
})

df = pd.concat([input_df, output_df], ignore_index=True)

In [65]:
df.head(-6)

Unnamed: 0,filename,data,label,output_correct
0,LJ003-0252.wav,"[-302.39178, 82.564545, -1.0848303, 12.105384,...",0,
1,LJ001-0118.wav,"[-253.91824, 97.19213, -16.520079, 10.717499, ...",0,
2,LJ003-0300.wav,"[-290.78378, 74.56775, 10.23219, 14.446426, -1...",0,
3,LJ001-0016.wav,"[-271.20184, 83.41064, -8.515509, 10.252653, -...",0,
4,LJ002-0258.wav,"[-285.11084, 98.32171, 1.92898, 15.974557, -9....",0,
...,...,...,...,...
1637,LJ003-0183_embedded.wav,"[-311.7116, 75.53801, 12.410514, 15.653378, -1...",1,True
1638,LJ002-0258_embedded.wav,"[-285.03586, 98.26799, 1.9814262, 15.931101, -...",1,True
1639,LJ003-0061_embedded.wav,"[-332.191, 79.15021, 14.579301, 19.426817, -9....",1,True
1640,LJ003-0264_embedded.wav,"[-269.27316, 84.935165, 2.5807147, 3.1040783, ...",1,True


In [66]:
output_files = df[df['label'] == 1]
incorrect_messages = output_files[output_files['output_correct'] == False]

print("Number of incorrect messages:", len(incorrect_messages))

Number of incorrect messages: 0


In [67]:
X = np.vstack((input_data, output_data))
y = np.hstack((input_labels, output_labels))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = SVC(kernel='rbf', gamma='scale')
clf.fit(X_train_scaled, y_train)

y_pred = clf.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print('Accuracy:', accuracy)
print('Classification Report:')
print(report)

Accuracy: 0.3242424242424242
Classification Report:
              precision    recall  f1-score   support

           0       0.19      0.09      0.12       176
           1       0.36      0.60      0.45       154

    accuracy                           0.32       330
   macro avg       0.28      0.34      0.29       330
weighted avg       0.27      0.32      0.27       330



In [68]:
# Train Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_scaled, y_train)

# Predict labels for test data
y_pred = clf.predict(X_test_scaled)

# Evaluate classifier
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print('Accuracy:', accuracy)
print('Classification Report:')
print(report)

Accuracy: 0.10606060606060606
Classification Report:
              precision    recall  f1-score   support

           0       0.11      0.09      0.10       176
           1       0.11      0.12      0.11       154

    accuracy                           0.11       330
   macro avg       0.11      0.11      0.11       330
weighted avg       0.11      0.11      0.11       330



In [69]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

model = Sequential([
    LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

loss, accuracy = model.evaluate(X_test, y_test)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

Epoch 1/10


  super().__init__(**kwargs)


[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.4905 - loss: 0.7780 - val_accuracy: 0.5152 - val_loss: 0.6916
Epoch 2/10
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.4953 - loss: 0.6975 - val_accuracy: 0.5455 - val_loss: 0.6894
Epoch 3/10
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.4916 - loss: 0.6955 - val_accuracy: 0.5379 - val_loss: 0.6917
Epoch 4/10
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5269 - loss: 0.6951 - val_accuracy: 0.4773 - val_loss: 0.6999
Epoch 5/10
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.4803 - loss: 0.6963 - val_accuracy: 0.5152 - val_loss: 0.6928
Epoch 6/10
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5207 - loss: 0.6941 - val_accuracy: 0.4697 - val_loss: 0.7026
Epoch 7/10
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━