In [1]:
import pandas as pd

In [2]:
# Load data/labels.csv into `df`; the cells below group it by its 'mapped_label' column.
df = pd.read_csv('data/labels.csv')

In [3]:
# Number of rows per mapped label in `df`, most frequent label first.
(
    df
    .groupby('mapped_label')
    .size()
    .sort_values(ascending=False)
)

mapped_label
Verse           2825
Chorus          2541
Silence         2399
Other           2138
Solo            1665
End             1359
Intro           1117
Theme            736
Interlude        712
Outro            688
Instrumental     453
Transition       418
Bridge           415
Head             352
Pre-chorus       284
Coda             220
Fade-out         154
Pre-verse         54
Post-chorus       51
dtype: int64

In [4]:
# Load data/downloaded_audio_labels.csv into `df2`; its 'label' column is mapped to label groups below.
df2 = pd.read_csv('data/downloaded_audio_labels.csv')

In [6]:
from python.utils import assign_label_group


In [7]:
# Derive the grouped label for every raw 'label' value via assign_label_group,
# then persist the augmented table. Note that this overwrites the same CSV
# that `df2` was loaded from.
df2 = df2.assign(mapped_label=df2['label'].apply(assign_label_group))
df2.to_csv(path_or_buf='data/downloaded_audio_labels.csv', index=False)

In [8]:
# Number of rows per mapped label in `df2`, most frequent label first.
(
    df2
    .groupby('mapped_label')
    .size()
    .sort_values(ascending=False)
)

mapped_label
Verse           945
Silence         937
Chorus          832
Other           722
End             517
Solo            511
Intro           413
Theme           406
Outro           283
Interlude       187
Transition      167
Head            164
Bridge          138
Fade-out         93
Pre-chorus       90
Instrumental     86
Coda             62
Pre-verse        35
Post-chorus      12
dtype: int64

In [9]:
# Break the catch-all 'Other' group down by the raw label each row carries,
# most frequent raw label first.
(
    df2.loc[df2['mapped_label'].eq('Other')]
    .groupby('label')
    .size()
    .sort_values(ascending=False)
)

label
no_function    663
break           10
voice            7
spoken           7
Development      6
out              4
applause         4
third            4
guitar           3
Recap            3
variation_2      2
count-in         1
gypsy            1
hammond          1
backing          1
pick-up          1
steel            1
variation_1      1
vocals           1
organ            1
dtype: int64

In [10]:
# One-hot encode `df`'s 'mapped_label' column with pandas (one indicator
# column per label, original 'mapped_label' column replaced) and preview
# the first five rows.
one_hot_encoded_data = pd.get_dummies(data=df, columns=['mapped_label'])
one_hot_encoded_data.head(n=5)

Unnamed: 0,song_id,timestamp,label,mapped_label_Bridge,mapped_label_Chorus,mapped_label_Coda,mapped_label_End,mapped_label_Fade-out,mapped_label_Head,mapped_label_Instrumental,...,mapped_label_Other,mapped_label_Outro,mapped_label_Post-chorus,mapped_label_Pre-chorus,mapped_label_Pre-verse,mapped_label_Silence,mapped_label_Solo,mapped_label_Theme,mapped_label_Transition,mapped_label_Verse
0,2,0.0,Silence,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,2,0.464399,Intro,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,14.379864,no_function,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,2,23.986213,no_function,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,2,33.622494,Verse,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [11]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode df2's 'mapped_label' column with scikit-learn and attach the
# indicator columns to df2.

# 1. Initialize the OneHotEncoder so that transform() returns a dense numpy array.
#    drop='first' omits the indicator column of the first category, so rows
#    belonging to that category are encoded as all zeros.
#    The dense-output keyword was renamed from `sparse` to `sparse_output` in
#    scikit-learn 1.2 and the old name was removed in a later release, so try
#    the current name first and fall back to the legacy one on old installs.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')

# 2. Fit the encoder on the 'mapped_label' column of df2
encoder.fit(df2[['mapped_label']])

# 3. Transform the 'mapped_label' column
encoded_data = encoder.transform(df2[['mapped_label']])

# Wrap the numpy array in a DataFrame. Reusing df2's index guarantees that the
# column-wise concat below pairs each encoded row with its source row, even if
# df2 ever stops having a default RangeIndex (e.g. after filtering).
encoded_df2 = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(['mapped_label']),
    index=df2.index,
)

# Join the indicator columns onto df2. The original 'label' and 'mapped_label'
# columns are still present in the result at this point.
one_hot_encoded_data = pd.concat([df2, encoded_df2], axis=1)

one_hot_encoded_data.head()

Unnamed: 0,song_id,timestamp,label,mapped_label,mapped_label_Chorus,mapped_label_Coda,mapped_label_End,mapped_label_Fade-out,mapped_label_Head,mapped_label_Instrumental,...,mapped_label_Other,mapped_label_Outro,mapped_label_Post-chorus,mapped_label_Pre-chorus,mapped_label_Pre-verse,mapped_label_Silence,mapped_label_Solo,mapped_label_Theme,mapped_label_Transition,mapped_label_Verse
0,752,0.0,Silence,Silence,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,752,0.13932,Intro,Intro,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,752,21.083061,Head,Head,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,752,47.113288,Transition,Transition,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,752,67.964807,Head,Head,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Keep only song_id, timestamp and the indicator columns by removing the two
# text label columns. errors='ignore' makes the cell safe to re-run: without
# it, a second execution raises KeyError because the columns are already gone.
one_hot_encoded_data = one_hot_encoded_data.drop(
    columns=['mapped_label', 'label'],
    errors='ignore',
)

In [13]:
# Persist the encoded table. index=False matches every other to_csv call in
# this notebook and keeps the meaningless RangeIndex out of the file (it would
# otherwise come back as an 'Unnamed: 0' column when the CSV is read again).
one_hot_encoded_data.to_csv('data/one_hot_encoded_data.csv', index=False)

In [28]:
# Print column names, non-null counts and dtypes of the encoded table.
one_hot_encoded_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6600 entries, 0 to 6599
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   song_id                    6600 non-null   int64  
 1   timestamp                  6600 non-null   float64
 2   mapped_label_Chorus        6600 non-null   float64
 3   mapped_label_Coda          6600 non-null   float64
 4   mapped_label_End           6600 non-null   float64
 5   mapped_label_Fade-out      6600 non-null   float64
 6   mapped_label_Head          6600 non-null   float64
 7   mapped_label_Instrumental  6600 non-null   float64
 8   mapped_label_Interlude     6600 non-null   float64
 9   mapped_label_Intro         6600 non-null   float64
 10  mapped_label_Other         6600 non-null   float64
 11  mapped_label_Outro         6600 non-null   float64
 12  mapped_label_Post-chorus   6600 non-null   float64
 13  mapped_label_Pre-chorus    6600 non-null   float

In [14]:
# Work on a copy so the column added and renamed in the following cells does
# not modify one_hot_encoded_data.
endpoints_df = one_hot_encoded_data.copy()

In [29]:
# A row's end time is the 'timestamp' of the next row of the same song; the
# last row of each song has no successor and therefore gets NaN.
# NOTE(review): this uses the existing row order within each song_id —
# confirm the rows are already sorted by timestamp.
endpoints_df = endpoints_df.assign(
    end_timestamp=lambda frame: frame.groupby('song_id')['timestamp'].shift(-1)
)

In [30]:
# Rename 'timestamp' to 'start_timestamp' so it pairs with 'end_timestamp'.
endpoints_df = endpoints_df.rename(
    {'timestamp': 'start_timestamp'},
    axis='columns',
)

In [36]:
# Print the schema of the start/end table, then write it to
# data/start_and_end.csv without the index column.
endpoints_df.info()
endpoints_df.to_csv('data/start_and_end.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6600 entries, 0 to 6599
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   song_id                    6600 non-null   int64  
 1   start_timestamp            6600 non-null   float64
 2   mapped_label_Chorus        6600 non-null   float64
 3   mapped_label_Coda          6600 non-null   float64
 4   mapped_label_End           6600 non-null   float64
 5   mapped_label_Fade-out      6600 non-null   float64
 6   mapped_label_Head          6600 non-null   float64
 7   mapped_label_Instrumental  6600 non-null   float64
 8   mapped_label_Interlude     6600 non-null   float64
 9   mapped_label_Intro         6600 non-null   float64
 10  mapped_label_Other         6600 non-null   float64
 11  mapped_label_Outro         6600 non-null   float64
 12  mapped_label_Post-chorus   6600 non-null   float64
 13  mapped_label_Pre-chorus    6600 non-null   float

In [33]:
# Load the metadata table shipped under data/salami-data-public/metadata/.
salami_metadata = pd.read_csv('data/salami-data-public/metadata/metadata.csv')


In [34]:
# Print column names, non-null counts and dtypes of the metadata table.
salami_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1446 entries, 0 to 1445
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   SONG_ID                  1446 non-null   int64  
 1   SOURCE                   1446 non-null   object 
 2   ANNOTATOR1               1446 non-null   int64  
 3   ANNOTATOR2               1446 non-null   int64  
 4   FILE_LOCATION            1446 non-null   object 
 5   SONG_DURATION            1310 non-null   float64
 6   EMPTY                    0 non-null      float64
 7   SONG_TITLE               1446 non-null   object 
 8   ARTIST                   1446 non-null   object 
 9   FORMAT                   1446 non-null   object 
 10  ANNOTATION_TIME1         1423 non-null   float64
 11  ANNOTATION_TIME2         980 non-null    float64
 12  TEXTFILE1                1349 non-null   object 
 13  TEXTFILE2                905 non-null    object 
 14  CLASS                   

In [35]:
# Print the schema of `df`.
# NOTE(review): the recorded output lists an 'end_timestamp' column, but no
# visible cell adds that column to `df` — this output likely depends on state
# from a cell that is no longer in the notebook; confirm on a fresh kernel.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18581 entries, 0 to 18580
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   song_id        18581 non-null  int64  
 1   timestamp      18581 non-null  float64
 2   label          18581 non-null  object 
 3   mapped_label   18581 non-null  object 
 4   end_timestamp  17222 non-null  float64
dtypes: float64(2), int64(1), object(2)
memory usage: 725.9+ KB


In [37]:
# Reload the start/end table from data/start_and_end.csv into `start_and_end`.
start_and_end = pd.read_csv('data/start_and_end.csv')

In [38]:
# Print the schema of the reloaded table.
# NOTE(review): the recorded output reports 22 columns, while the frame that a
# visible cell wrote to this path had 21 — the file was presumably rewritten by
# a step that is not in this notebook; confirm on a fresh kernel.
start_and_end.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6600 entries, 0 to 6599
Data columns (total 22 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   song_id                    6600 non-null   int64  
 1   start_timestamp            6600 non-null   float64
 2   mapped_label_Chorus        6600 non-null   float64
 3   mapped_label_Coda          6600 non-null   float64
 4   mapped_label_End           6600 non-null   float64
 5   mapped_label_Fade-out      6600 non-null   float64
 6   mapped_label_Head          6600 non-null   float64
 7   mapped_label_Instrumental  6600 non-null   float64
 8   mapped_label_Interlude     6600 non-null   float64
 9   mapped_label_Intro         6600 non-null   float64
 10  mapped_label_Other         6600 non-null   float64
 11  mapped_label_Outro         6600 non-null   float64
 12  mapped_label_Post-chorus   6600 non-null   float64
 13  mapped_label_Pre-chorus    6600 non-null   float

In [45]:
# Keep only rows that have an end time; rows whose 'end_timestamp' is missing
# are discarded. The surviving rows keep their original index labels.
start_and_end = start_and_end.loc[start_and_end['end_timestamp'].notna()]

In [48]:
# Remove the 'song_duration' column if it is present.
# errors='ignore' is required for a clean Restart-and-Run-All: no visible cell
# ever adds 'song_duration' to the table written to data/start_and_end.csv, so
# an unconditional drop raises KeyError on a fresh run (and on any re-run of
# this cell after the column has already been removed).
start_and_end = start_and_end.drop(columns=['song_duration'], errors='ignore')

In [50]:
# Print the schema again after dropping incomplete rows and 'song_duration'.
start_and_end.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6083 entries, 0 to 6598
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   song_id                    6083 non-null   int64  
 1   start_timestamp            6083 non-null   float64
 2   mapped_label_Chorus        6083 non-null   float64
 3   mapped_label_Coda          6083 non-null   float64
 4   mapped_label_End           6083 non-null   float64
 5   mapped_label_Fade-out      6083 non-null   float64
 6   mapped_label_Head          6083 non-null   float64
 7   mapped_label_Instrumental  6083 non-null   float64
 8   mapped_label_Interlude     6083 non-null   float64
 9   mapped_label_Intro         6083 non-null   float64
 10  mapped_label_Other         6083 non-null   float64
 11  mapped_label_Outro         6083 non-null   float64
 12  mapped_label_Post-chorus   6083 non-null   float64
 13  mapped_label_Pre-chorus    6083 non-null   float

In [53]:
# Sanity check: how many remaining rows are flagged as Intro (1.0) vs not (0.0).
(
    start_and_end
    .groupby('mapped_label_Intro')
    .size()
)

mapped_label_Intro
0.0    5670
1.0     413
dtype: int64

In [54]:
# Write the cleaned table back without the index. This overwrites the same
# data/start_and_end.csv file that `start_and_end` was loaded from.
start_and_end.to_csv('data/start_and_end.csv', index=False)