In [2]:
!pip install pydub ffmpeg 

Collecting pydub
  Using cached pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting ffmpeg
  Downloading ffmpeg-1.4.tar.gz (5.1 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hUsing cached pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Building wheels for collected packages: ffmpeg
  Building wheel for ffmpeg (setup.py) ... [?25ldone
[?25h  Created wheel for ffmpeg: filename=ffmpeg-1.4-py3-none-any.whl size=6080 sha256=6f38fd92d14fcfb19f2c584193ca412526a906177229932638865c51eb92c67d
  Stored in directory: /home/ec2-user/.cache/pip/wheels/8e/7a/69/cd6aeb83b126a7f04cbe7c9d929028dc52a6e7d525ff56003a
Successfully built ffmpeg
Installing collected packages: pydub, ffmpeg
Successfully installed ffmpeg-1.4 pydub-0.25.1


# Audio Segments

In [50]:
s3 = boto3.client('s3')

def segment_and_upload_audio(input_bucket, input_folder_path, output_folder_path):
    # List all audio files in the specified input folder in S3
    response = s3.list_objects_v2(Bucket=input_bucket, Prefix=input_folder_path)
    audio_files = [obj['Key'] for obj in response.get('Contents', []) if obj['Key'].endswith('.wav')]

    # Process each audio file in the input folder
    for file_key in audio_files:
        # Download the audio file from S3
        response = s3.get_object(Bucket=input_bucket, Key=file_key)
        audio_bytes = response['Body'].read()
        audio = AudioSegment.from_file(BytesIO(audio_bytes))
        
        # Calculate segment duration (0.6 to 0.8 seconds)
        min_segment_duration = 0.6 * 1000  # Convert seconds to milliseconds
        max_segment_duration = 0.8 * 1000
        
        # Segment the audio file
        segments = []
        current_position = 0
        while current_position < len(audio):
            segment_duration = random.randint(min_segment_duration, max_segment_duration)
            segment = audio[current_position:current_position + segment_duration]
            segments.append(segment)
            current_position += segment_duration
        
        # Upload each segment back to S3 in the specified output folder
        for i, segment in enumerate(segments):
            segment_key = f"{output_folder_path}/{file_key.split('/')[-1].split('.')[0]}_{i}.wav"
            segment_bytes = segment.export(format='wav').read()
            s3.put_object(Bucket=input_bucket, Key=segment_key, Body=BytesIO(segment_bytes))
    
    return 'Segments created and uploaded successfully!'

In [52]:
input_bucket = 'audiofilesdata'
input_folder_path = 'REAL'  # Input folder containing audio files
output_folder_path = 'REAL1'  # Output folder for segmented audio

result = segment_and_upload_audio(input_bucket, input_folder_path, output_folder_path)
print(result)

Segments created and uploaded successfully!


In [59]:
!pip install resampy



# Audio Preprocessing

In [1]:
import os
import boto3
import librosa
import numpy as np
import pandas as pd

def features_extractor(file_name):
    audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast') 
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)
    return mfccs_scaled_features

def process_folder(bucket_name, folder_name):
    s3 = boto3.client('s3')
    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=folder_name)
    
    features_list = []
    for obj in response.get('Contents', []):
        file_key = obj['Key']
        if file_key.endswith('.wav'):  # Assuming all files are WAV format
            file_name = f"/tmp/{file_key.split('/')[-1]}"  # Download to local /tmp directory
            s3.download_file(bucket_name, file_key, file_name)
            print(file_name)
            features = features_extractor(file_name)
            print(features)
            features_list.append([file_key, folder_name] + features.tolist())
            os.remove(file_name)  # Remove the downloaded file to save disk space
    
    return features_list

def save_to_csv(features_list, output_file):
    df = pd.DataFrame(features_list, columns=['File', 'Folder'] + [f'MFCC_{i}' for i in range(1, 41)])
    df.to_csv(output_file, index=False)

bucket_name = 'audiofilesdata'
folders = ['REAL1', 'FAKE1']  # Add more folders if needed
output_file = 'output.csv'

all_features = []
for folder_name in folders:
    folder_features = process_folder(bucket_name, folder_name)
    all_features.extend(folder_features)

save_to_csv(all_features, output_file)


/tmp/biden-original_0.wav
[-273.2599      124.09079     -64.710266    -14.885468    -22.725746
  -10.659842    -13.787889      1.905541    -20.138693      7.932558
  -11.360725     -1.3217432    -8.739006    -11.895443    -15.414835
    6.4152994   -17.678738     -2.775714    -12.65204      -1.969023
  -13.490747     -7.704973    -11.517237     -1.6131587   -12.161943
   -4.8615875    -2.0736775     4.318421      2.3927362     7.453205
    4.1756883     4.312247      6.5104604     1.3550553    -0.6870998
   -0.78896284   -4.7482944    -3.906682      0.8458953     1.1024616 ]
/tmp/biden-original_1.wav
[-278.55127    134.99945    -78.33159    -10.971195    -9.259318
  -14.999792   -12.805491   -11.963385   -32.0156      16.827824
  -12.986719    -9.919474    -8.578353    -4.7466626  -20.984303
    1.5817897  -19.122896    -4.3087206  -12.581625    -0.8588978
   -7.3565736   -8.109311   -18.222345   -10.589687    -3.303582
    7.059836    18.325512    19.806507    11.652556     5.2952127


In [4]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('output.csv')
df



Unnamed: 0,File,Folder,MFCC_1,MFCC_2,MFCC_3,MFCC_4,MFCC_5,MFCC_6,MFCC_7,MFCC_8,...,MFCC_31,MFCC_32,MFCC_33,MFCC_34,MFCC_35,MFCC_36,MFCC_37,MFCC_38,MFCC_39,MFCC_40
0,REAL1/biden-original_0.wav,REAL1,-273.259888,124.090790,-64.710266,-14.885468,-22.725746,-10.659842,-13.787889,1.905541,...,4.175688,4.312247,6.510460,1.355055,-0.687100,-0.788963,-4.748294,-3.906682,0.845895,1.102462
1,REAL1/biden-original_1.wav,REAL1,-278.551270,134.999451,-78.331589,-10.971195,-9.259318,-14.999792,-12.805491,-11.963385,...,-6.366967,-7.712110,4.016825,3.958173,-4.892671,-5.609682,-6.717570,-3.326739,2.155886,-3.418968
2,REAL1/biden-original_10.wav,REAL1,-230.962265,114.526527,-66.253151,11.259330,2.377078,-12.251638,-24.864946,-1.828751,...,-2.873662,-2.188383,1.837111,3.555196,-3.002104,-4.754421,-1.277930,5.200921,2.209428,-1.621000
3,REAL1/biden-original_100.wav,REAL1,-242.187164,82.337769,-43.958176,16.359894,-15.184058,4.883745,-28.326471,-3.250439,...,-4.666645,6.659175,3.834552,-2.237391,-12.121249,-7.869506,0.149885,2.011222,-3.528703,-9.360021
4,REAL1/biden-original_101.wav,REAL1,-222.395569,90.612968,-36.558388,19.143547,-25.588493,2.810948,-26.015686,-4.149233,...,1.191267,6.407966,5.684544,1.603550,-7.318727,-4.276711,1.369621,1.729982,-4.117910,-5.149320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1993,FAKE1/Obama-to-Trump_221.wav,FAKE1,-241.122437,110.153976,-29.353933,0.703730,-15.984562,-12.750664,-22.192795,-1.391231,...,15.040483,25.599705,21.909267,14.285854,1.879465,0.231721,-2.186425,0.726572,2.670112,1.123251
1994,FAKE1/Obama-to-Trump_222.wav,FAKE1,-432.703552,123.840645,-7.071842,19.198868,13.542641,-9.338780,-12.495111,-3.293220,...,-6.023295,1.950484,2.654285,-0.176964,-3.937673,1.016691,-1.724762,1.299378,2.375386,2.038896
1995,FAKE1/Obama-to-Trump_223.wav,FAKE1,-539.172607,128.243851,-42.753956,20.202297,-17.272509,14.575281,-9.516274,17.137703,...,-2.796578,5.375177,2.949175,3.854049,0.509752,3.275028,-2.323314,-0.476324,1.084716,2.786406
1996,FAKE1/Obama-to-Trump_224.wav,FAKE1,-499.197662,129.269012,-43.626945,13.939455,-27.746536,11.717601,-8.268483,15.564046,...,0.425251,6.714995,1.583632,3.785098,-0.783990,-0.141415,-1.443752,3.196190,3.080776,1.821365


In [None]:
# Drop the individual MFCC columns if needed
df.drop(columns=df.filter(like='MFCC_').columns, inplace=True)

# Save the modified DataFrame to a new CSV file
df.to_csv('output.csv', index=False)

In [None]:
#### Extracting MFCC's For every audio file
import pandas as pd

metadata=pd.read_csv('output.csv')
metadata.head()