In [1]:
import os
import librosa
import pandas as pd


### Datasets apenas com features

In [2]:
def get_annotation_classes(folder_path):
    """Collect the annotation class names encoded in ``.txt`` file names.

    Each ``.txt`` entry directly inside ``folder_path`` is split on
    underscores. When the name has at least two pieces, the second piece
    (with any ``.txt`` text removed) is taken as the class name.

    Args:
        folder_path: Directory whose entries are listed (not recursive).

    Returns:
        set: The distinct class names found; empty when no file qualifies.
    """
    txt_paths = [
        os.path.join(folder_path, entry)
        for entry in os.listdir(folder_path)
        if entry.endswith(".txt")
    ]

    found = set()
    for path in txt_paths:
        pieces = os.path.basename(path).split("_")
        if len(pieces) <= 1:
            # No underscore in the name, so there is no class field to read.
            continue
        # Strip the .txt extension from the class field.
        found.add(pieces[1].replace('.txt', ''))

    return found

In [3]:
def extract_annotations(folder_path, classes, sample_rate=44_100):
    """Parse the annotation ``.txt`` files of a folder into one DataFrame.

    A file belongs to a class when the second underscore-separated field of
    its name (with ``.txt`` removed) equals the class name. This is the same
    rule ``get_annotation_classes`` uses to discover classes. Exact matching
    matters: a substring test would also assign ``X_firearms.txt`` to a class
    named ``fire``.

    Each line of a file is split on whitespace and read as
    ``start end annotation [tag]``. Lines with fewer than three fields
    produce no row. The tag is kept only when the line has exactly four
    fields; otherwise it is the empty string.

    Args:
        folder_path: Directory containing the annotation files (not recursive).
        classes: Iterable of class names to extract.
        sample_rate: Value stored in the ``Amostragem`` column of every row.
            Defaults to 44_100.

    Returns:
        pandas.DataFrame: One row per annotation with the columns ``Classe``,
        ``Duração``, ``Anotação``, ``Tag``, ``Início``, ``Final``,
        ``Filepath`` and ``Amostragem``. The columns are present even when
        no row is produced.

    Raises:
        ValueError: If one of the first two fields of a line with at least
            two fields cannot be converted to ``float``.
    """
    columns = ['Classe', 'Duração', 'Anotação', 'Tag',
               'Início', 'Final', 'Filepath', 'Amostragem']

    # List the directory once and index the files by the class in their name.
    files_by_class = {}
    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".txt"):
            continue
        name_parts = file_name.split("_")
        if len(name_parts) > 1:
            file_class = name_parts[1].replace('.txt', '')
            files_by_class.setdefault(file_class, []).append(
                os.path.join(folder_path, file_name)
            )

    data = []
    for annotation_class in classes:
        for file_path in files_by_class.get(annotation_class, []):
            with open(file_path, 'r') as file:
                for line in file:
                    parts = line.strip().split()
                    if len(parts) < 2:
                        continue
                    start_time = float(parts[0])
                    end_time = float(parts[1])
                    if len(parts) < 3:
                        # A time span without an annotation label is skipped.
                        continue
                    data.append({
                        'Classe': annotation_class,
                        'Duração': end_time - start_time,
                        'Anotação': parts[2],
                        'Tag': parts[3] if len(parts) == 4 else '',
                        'Início': start_time,
                        'Final': end_time,
                        "Filepath": file_path,
                        "Amostragem": sample_rate
                    })

    # Explicit columns keep the schema stable when no annotation was found.
    return pd.DataFrame(data, columns=columns)

In [4]:
# Load every annotation of the Hollywood-dev release into one table.
folder_path = '../Datasets/VSD_2014_December_official_release/Hollywood-dev/annotations'  # Change this to the path of your annotations folder
# Class names are discovered from the annotation file names in that folder.
classes = get_annotation_classes(folder_path)
# One row per annotated segment, labelled with its class.
df_hollywood_dev = extract_annotations(folder_path, classes)


In [5]:
# Show the annotation table as the cell output.
df_hollywood_dev

Unnamed: 0,Classe,Duração,Anotação,Tag,Início,Final,Filepath,Amostragem
0,gunshots,861.875000,(nothing),,0.000000,861.875000,../Datasets/VSD_2014_December_official_release...,44100
1,gunshots,17.616426,(nothing),,861.875000,879.491426,../Datasets/VSD_2014_December_official_release...,44100
2,gunshots,3.377296,gunshot,,879.491426,882.868722,../Datasets/VSD_2014_December_official_release...,44100
3,gunshots,8.143190,(nothing),,882.868722,891.011912,../Datasets/VSD_2014_December_official_release...,44100
4,gunshots,44.280787,(nothing),,891.011912,935.292699,../Datasets/VSD_2014_December_official_release...,44100
...,...,...,...,...,...,...,...,...
27177,screams,1.571071,(nothing),,5812.963123,5814.534194,../Datasets/VSD_2014_December_official_release...,44100
27178,screams,3.171058,(nothing),,5814.534194,5817.705251,../Datasets/VSD_2014_December_official_release...,44100
27179,screams,3.988981,(nothing),,5817.705251,5821.694232,../Datasets/VSD_2014_December_official_release...,44100
27180,screams,0.888087,(nothing),,5821.694232,5822.582319,../Datasets/VSD_2014_December_official_release...,44100


### Datasets com Áudio

In [6]:
def extract_audio_info(folder):
    """Walk ``folder`` recursively and describe every ``.wav``/``.mp3`` file.

    For each matching file, the duration is obtained from
    ``librosa.get_duration`` and the sampling rate from
    ``librosa.get_samplerate``.

    Args:
        folder: Root directory to scan, including all sub-directories.

    Returns:
        pandas.DataFrame: One row per audio file with the columns
        ``Duração``, ``Filepath`` and ``Amostragem``. The frame is empty,
        with those columns, when no audio file is found.
    """
    audio_suffixes = ('.wav', '.mp3')
    rows = []
    for root, _subdirs, names in os.walk(folder):
        audio_paths = (
            os.path.join(root, name)
            for name in names
            if name.endswith(audio_suffixes)
        )
        for audio_path in audio_paths:
            length = librosa.get_duration(path=audio_path)
            rate = librosa.get_samplerate(audio_path)
            rows.append([length, audio_path, rate])

    return pd.DataFrame(rows, columns=['Duração', 'Filepath', 'Amostragem'])

In [7]:
# Helper: derive the (Classe, Anotação, Tag) labels of one audio file from its path.
def _labels_from_filepath(filepath):
    """Return the ``(Classe, Anotação, Tag)`` labels implied by ``filepath``.

    The labels are decided by substrings of the path. Paths matching none of
    the known datasets get three empty strings.
    """
    if 'HEAR Dataset' in filepath:
        # 'NAO_VIOLENCIA' must be tested first: it contains 'VIOLENCIA'.
        if 'NAO_VIOLENCIA' in filepath:
            return 'Violência Física', 'Não Violência', ''
        if 'VIOLENCIA' in filepath:
            return 'Violência Física', 'Violência Física', ''
    if 'Gunshot Audio Forensic Dataset' in filepath:
        # The tag is the name of the file's parent folder without '_Samsung'.
        parent_folder = os.path.basename(os.path.dirname(filepath))
        return 'gunshots', 'gunshot_forensic', parent_folder.replace('_Samsung', '')
    return '', '', ''


# Function that arranges the dataframe with the requested columns.
def organize_dataframe(df):
    """Label whole-file audio rows and arrange them in the annotation layout.

    Every row is treated as a single segment covering the whole file:
    ``Início`` is 0 and ``Final`` equals ``Duração``. ``Classe``,
    ``Anotação`` and ``Tag`` are derived from ``Filepath``.

    The input frame is left untouched; the result is built on a copy so the
    cell can be re-run without accumulating state in ``df``.

    Args:
        df: DataFrame with at least the columns ``Duração``, ``Filepath``
            and ``Amostragem``.

    Returns:
        pandas.DataFrame: A new frame with the columns ``Classe``,
        ``Duração``, ``Anotação``, ``Tag``, ``Filepath``, ``Início``,
        ``Final`` and ``Amostragem``, in that order.
    """
    labels = [_labels_from_filepath(path) for path in df['Filepath']]

    organized = df.copy()
    organized['Classe'] = [label[0] for label in labels]
    organized['Anotação'] = [label[1] for label in labels]
    organized['Tag'] = [label[2] for label in labels]

    # Numeric columns (instead of object columns seeded with '') keep the
    # dtypes compatible with other numeric start/end columns.
    organized['Início'] = 0.0
    organized['Final'] = organized['Duração']

    return organized[['Classe', 'Duração', 'Anotação', 'Tag','Filepath', 'Início', 'Final', 'Amostragem']]


In [8]:
# Root folder that is scanned recursively for .wav/.mp3 files.
folder_path = '../Datasets'

# One row per audio file found: duration, path and sampling rate.
df_audio_info = extract_audio_info(folder_path)

In [9]:
# Label each audio file from its path and arrange the columns like the annotation table.
df_audio_info_organized = organize_dataframe(df_audio_info)

In [10]:
# Show the labelled audio-file table as the cell output.
df_audio_info_organized

Unnamed: 0,Classe,Duração,Anotação,Tag,Filepath,Início,Final,Amostragem
0,gunshots,2.094833,gunshot_forensic,BoltAction22,../Datasets\Gunshot Audio Forensic Dataset\Bol...,0,2.094833,48000
1,gunshots,1.864542,gunshot_forensic,BoltAction22,../Datasets\Gunshot Audio Forensic Dataset\Bol...,0,1.864542,48000
2,gunshots,2.071938,gunshot_forensic,BoltAction22,../Datasets\Gunshot Audio Forensic Dataset\Bol...,0,2.071938,48000
3,gunshots,2.074187,gunshot_forensic,BoltAction22,../Datasets\Gunshot Audio Forensic Dataset\Bol...,0,2.074187,48000
4,gunshots,1.963313,gunshot_forensic,BoltAction22,../Datasets\Gunshot Audio Forensic Dataset\Bol...,0,1.963313,48000
...,...,...,...,...,...,...,...,...
73596,Violência Física,10.000000,Violência Física,,../Datasets\HEAR Dataset\AUDIO\media\tiago\EST...,0,10.0,16000
73597,Violência Física,10.000000,Violência Física,,../Datasets\HEAR Dataset\AUDIO\media\tiago\EST...,0,10.0,16000
73598,Violência Física,10.000000,Violência Física,,../Datasets\HEAR Dataset\AUDIO\media\tiago\EST...,0,10.0,16000
73599,Violência Física,10.000000,Violência Física,,../Datasets\HEAR Dataset\AUDIO\media\tiago\EST...,0,10.0,16000


### Agregando Datasets

In [11]:
# Stack the annotation-based and the audio-file-based tables (columns are aligned by name).
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of both inputs are kept and therefore repeat.
df_concatenated = pd.concat([df_hollywood_dev, df_audio_info_organized], axis=0, ignore_index=True)


In [12]:
def aggregate_annotations(df):
    """Summarise ``Duração`` for each (``Classe``, ``Anotação``) pair.

    Args:
        df: DataFrame with the columns ``Classe``, ``Anotação`` and
            ``Duração``.

    Returns:
        pandas.DataFrame: One row per group with the columns ``Classe``,
        ``Anotação``, ``Quantidade Total`` (count), ``Média da Duração``
        (mean), ``Mediana da Duração`` (median) and ``Desvio Padrão``
        (standard deviation).
    """
    # Named aggregation: output column -> (source column, statistic).
    duration_stats = {
        'Quantidade Total': ('Duração', 'count'),
        'Média da Duração': ('Duração', 'mean'),
        'Mediana da Duração': ('Duração', 'median'),
        'Desvio Padrão': ('Duração', 'std'),
    }

    grouped = df.groupby(['Classe', 'Anotação'])
    return grouped.agg(**duration_stats).reset_index()

In [13]:
# Per (Classe, Anotação) duration statistics over the combined table.
aggregated_df = aggregate_annotations(df_concatenated)

In [14]:
# Show the aggregated statistics as the cell output.
aggregated_df

Unnamed: 0,Classe,Anotação,Quantidade Total,Média da Duração,Mediana da Duração,Desvio Padrão
0,Violência Física,Não Violência,36535,9.996247,10.0,0.127032
1,Violência Física,Violência Física,35076,9.998791,10.0,0.070684
2,blood,high,18,52.888889,21.0,64.153207
3,blood,low,366,78.385246,43.0,149.147942
4,blood,medium,56,95.946429,61.5,124.496721
5,blood,unnoticeable,2194,67.113491,37.0,111.517561
6,explosions,(nothing),1293,91.572931,3.06,505.153427
7,explosions,explosion,621,1.464635,1.035202,1.419866
8,explosions,multiple_actions,552,2.551813,1.416628,3.677148
9,fights,1vs1,297,160.185185,78.0,336.340655


In [15]:
# Create the output folder if needed so the export also works on a fresh checkout;
# to_csv does not create missing directories.
os.makedirs("dataset_analysis", exist_ok=True)
aggregated_df.to_csv("dataset_analysis/dataset_analysis.csv")