In [None]:
import os
import pandas as pd
# Root directory that holds the manifest and receives the generated split files.
dataset_dir = '/media/bleu/bulkdata2/deeprhythmdata'

# Read the manifest and discard any column whose name contains 'Unnamed'
# (the name pandas gives to a header-less column, typically a saved index).
full_df = (
    pd.read_csv(os.path.join(dataset_dir, 'manifest.csv'))
    .pipe(
        lambda frame: frame.drop(
            columns=[name for name in frame.columns if 'Unnamed' in name]
        )
    )
)
full_df.head()


In [None]:

# Carve the manifest into one DataFrame per value of the 'source' column.
# The order of the names on the left matches the order of the labels on the right.
df_slakh, df_fma, df_giantsteps, df_ballroom, df_arcbeam, df_yarr = (
    full_df[full_df['source'] == source_name]
    for source_name in ('slakh', 'fma', 'giantsteps', 'ballroom', 'arcbeam', 'yarr')
)
# Row count per source, shown as the cell output.
tuple(
    len(part)
    for part in (df_slakh, df_fma, df_giantsteps, df_ballroom, df_arcbeam, df_yarr)
)

In [None]:
def split_dataframe_direct(df, proportions, random_state=None):
    """
    Shuffle a DataFrame and split it into three consecutive parts.

    Parameters:
    - df: The pandas DataFrame to split.
    - proportions: A list or tuple of exactly three non-negative proportions
      whose sum is 1. The parts are returned in the same order as the
      proportions are given; the function attaches no train/val/test meaning
      to any position.
    - random_state: Optional seed (or random state object) forwarded to
      ``DataFrame.sample``. The default ``None`` keeps the previous unseeded
      behaviour; pass an int to obtain the same split on every call.

    Returns:
    - Three pandas DataFrames. The first two contain
      ``int(proportion * len(df))`` rows each (truncated towards zero) and the
      third receives all remaining rows, so every input row lands in exactly
      one part. The original index is discarded: rows are labelled by their
      position in the shuffled frame.

    Raises:
    - ValueError: if ``proportions`` does not contain exactly three items
      (raised by the unpacking).
    - AssertionError: if the proportions do not sum to 1 or any is negative.
    """
    first_ratio, second_ratio, third_ratio = proportions
    total_ratio = first_ratio + second_ratio + third_ratio
    assert abs(total_ratio - 1) < 1e-6, "Ratios must sum to 1"
    # A negative ratio can still sum to 1 but would yield overlapping or
    # out-of-range slice bounds below.
    assert min(first_ratio, second_ratio, third_ratio) >= 0, "Ratios must be non-negative"

    # Shuffle all rows; resetting the index makes positions and labels coincide.
    df_shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Slice boundaries; truncation means any rounding remainder goes to the last part.
    n_rows = len(df)
    first_split = int(first_ratio * n_rows)
    second_split = first_split + int(second_ratio * n_rows)

    # Split the DataFrame
    df_first = df_shuffled.iloc[:first_split]
    df_second = df_shuffled.iloc[first_split:second_split]
    df_third = df_shuffled.iloc[second_split:]

    return df_first, df_second, df_third

In [None]:
# Sources that take part in the split.
# NOTE(review): df_fma is created earlier but is not listed here — presumably
# intentional; confirm before relying on these splits.
dataframes = [df_slakh, df_arcbeam, df_giantsteps, df_ballroom, df_yarr]

# Split every source on its own so each one contributes 80/10/10 to the
# train/val/test sets, collecting the pieces in lists instead of growing a
# DataFrame with pd.concat on every iteration.
# NOTE: the shuffle inside split_dataframe_direct is unseeded in this call, so
# re-running this cell overwrites the CSV files with a different split.
train_parts, val_parts, test_parts = [], [], []
for df in dataframes:
    train, val, test = split_dataframe_direct(df, (0.8, 0.1, 0.1))
    train_parts.append(train)
    val_parts.append(val)
    test_parts.append(test)

# One concat per split; ignore_index avoids the duplicated per-source row
# labels that stacking the parts would otherwise produce.
full_train = pd.concat(train_parts, ignore_index=True)
full_val = pd.concat(val_parts, ignore_index=True)
full_test = pd.concat(test_parts, ignore_index=True)

# index=False keeps the row labels out of the files; otherwise they come back
# as an 'Unnamed: 0' column when the CSV is read again.
full_train.to_csv(os.path.join(dataset_dir, 'train.csv'), index=False)
full_val.to_csv(os.path.join(dataset_dir, 'val.csv'), index=False)
full_test.to_csv(os.path.join(dataset_dir, 'test.csv'), index=False)
len(full_train), len(full_val), len(full_test)