In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import datasets
import os
import sklearn

In [2]:
import zipfile
import os

# file path to the zip file
zip_filename = 'fma_metadata.zip'

# Verify if the file exists in the current directory
if os.path.exists(zip_filename):
    try:
        # Ouvrir et extraire le fichier ZIP
        with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
            # Extraire tout dans le dossier actuel
            zip_ref.extractall(os.getcwd())
            print(f"The file '{zip_filename}' has been extracted successfully in the current directory.")
    except zipfile.BadZipFile:
        print(f"Error: The file '{zip_filename}' is corrupted or not a zip file.")
else:
    print(f"Error: The file '{zip_filename}' does not exist in the current directory.")


The file 'fma_metadata.zip' has been extracted successfully in the current directory.


In [3]:
#Datasets keys and tracks
tracks = pd.read_csv('fma_metadata/tracks.csv', header=[1])
keys = pd.read_csv('fma_metadata/keys.csv')
#Size of the datasets
print("Size of the datasets:")
print(tracks.shape)
print(keys.shape)

  tracks = pd.read_csv('fma_metadata/tracks.csv', header=[1])


Size of the datasets:
(106575, 53)
(5489, 3)


In [4]:
#columns of the datasets
print('tracks :',tracks.columns)
print(keys.columns)

tracks : Index(['Unnamed: 0', 'comments', 'date_created', 'date_released', 'engineer',
       'favorites', 'id', 'information', 'listens', 'producer', 'tags',
       'title', 'tracks', 'type', 'active_year_begin', 'active_year_end',
       'associated_labels', 'bio', 'comments.1', 'date_created.1',
       'favorites.1', 'id.1', 'latitude', 'location', 'longitude', 'members',
       'name', 'related_projects', 'tags.1', 'website', 'wikipedia_page',
       'split', 'subset', 'bit_rate', 'comments.2', 'composer',
       'date_created.2', 'date_recorded', 'duration', 'favorites.2',
       'genre_top', 'genres', 'genres_all', 'information.1', 'interest',
       'language_code', 'license', 'listens.1', 'lyricist', 'number',
       'publisher', 'tags.2', 'title.1'],
      dtype='object')
Index(['track_id', 'spotify_uri', 'key_and_mode'], dtype='object')


In [5]:
#Change 'Unnamed: 0' in 'track_id'
tracks.rename(columns={'Unnamed: 0': 'track_id'}, inplace=True)



In [6]:
#print the first 5 rows of the datasets
print(keys.head())

   track_id                           spotify_uri key_and_mode
0        10  spotify:track:66381EvBZ6e3RXzYATpGmN     F# Major
1       141  spotify:track:7f0KQDOB9khm9ZtuWjjtre      F Major
2       153  spotify:track:348mNhOGbcRxj7e35jASFm     C# Major
3       173  spotify:track:1B6BPYXUp1FrxUfpRI38MM      A Major
4       181  spotify:track:3Ddc7Lne6RjhTlRPGWpRdi      G Major


In [7]:
#print 'id' column
print(tracks['id'].head(10))

0    NaN
1    1.0
2    1.0
3    1.0
4    6.0
5    4.0
6    4.0
7    4.0
8    4.0
9    4.0
Name: id, dtype: float64


In [8]:
# Filter the DataFrame to keep only rows where 'track_id' is numeric
tracks = tracks[pd.to_numeric(tracks['track_id'], errors='coerce').notna()]

# Convert 'track_id' to int type
tracks['track_id'] = tracks['track_id'].astype(int)


In [9]:
# Merge the two DataFrames on 'track_id'
df = pd.merge(tracks, keys, left_on='track_id', right_on='track_id')
print(df.head())
#shape
print(df.shape)
#songs keys
print(df['key_and_mode'].unique())

   track_id  comments         date_created        date_released engineer  \
0        10       0.0  2008-11-26 01:45:08  2008-02-06 00:00:00      NaN   
1       141       0.0  2008-11-26 01:49:57  2009-01-16 00:00:00      NaN   
2       153       0.0  2008-11-26 01:50:50  2005-06-07 00:00:00      NaN   
3       173       0.0  2008-11-26 01:51:48  2008-06-23 00:00:00      NaN   
4       181       0.0  2008-11-26 01:52:15  2007-04-13 00:00:00      NaN   

   favorites    id                                        information  \
0        4.0   6.0                                                NaN   
1        1.0  60.0  <p>A full ensamble of strings, drums, electron...   
2        0.0  69.0  <p>Self-titled debut released on Philadelphia'...   
3        0.0  72.0  <p><em>from James' <a href="http://www.belowpd...   
4        0.0  79.0  <p>This Human Ear Music reissue compiles a “Be...   

   listens producer  ... language_code  \
0  47632.0      NaN  ...            en   
1   1304.0      NaN 

In [10]:
# Transformations
# Create a dictionary to map notes and modes to their respective semitone offsets
semitone_offsets = {
    'C': 0, 'C#': 1, 'Db': 1, 'D': 2, 'D#': 3, 'Eb': 3, 
    'E': 4, 'Fb': 4, 'E#': 5, 'F': 5, 'F#': 6, 'Gb': 6, 
    'G': 7, 'G#': 8, 'Ab': 8, 'A': 9, 'A#': 10, 'Bb': 10, 
    'B': 11, 'Cb': 11, 'B#': 0
}
# Create a dictionary to map modes to their respective values
modes = {'Major': 0, 'minor': 1}

# Transformation
def process_key(key):
    key = key.split(' ')
    note = key[0]
    mode = key[1]
    return semitone_offsets[note], modes[mode]

# Apply the transformation to the 'key_and_mode' column
df['note'],df['mode'] = None,None
for key in df['key_and_mode']:
    note, mode = process_key(key)
    df.loc[df['key_and_mode'] == key, 'note'] = note
    df.loc[df['key_and_mode'] == key, 'mode'] = mode



    

In [11]:
#Visualization of columns 'key_and_mode', 'note' and 'mode'
print(df[['track_id', 'id', 'key_and_mode', 'note', 'mode']].head(10))

   track_id     id key_and_mode note mode
0        10    6.0     F# Major    6    0
1       141   60.0      F Major    5    0
2       153   69.0     C# Major    1    0
3       173   72.0      A Major    9    0
4       181   79.0      G Major    7    0
5       184   82.0     Bb Major   10    0
6       213   86.0      D Major    2    0
7       251  101.0      A Major    9    0
8       331  108.0      A minor    9    1
9       347  112.0      A minor    9    1


In [12]:
#Filter around C 
def filtre_tonalité() : 
    note_centrale = 0
    # keep the notes in the range of -2 to 2 semitones from the central note
    notes = [(note_centrale + i) % 12 for i in range(-2, 3)]
    return notes

#test
print("1 :",filtre_tonalité())


1 : [10, 11, 0, 1, 2]


In [13]:
#Filter the DataFrame to keep only rows where 'note' is in the specified range
notes = filtre_tonalité()
print(notes)
print("df : ",df['note'].unique())

df_filtered = df[df['note'].isin(notes)]
print(df_filtered[['track_id', 'id', 'note', 'mode']].head(10))
print(df_filtered.shape)
print(df_filtered['note'].unique())


[10, 11, 0, 1, 2]
df :  [6 5 1 9 7 10 2 8 3 4 0 11]
    track_id     id note mode
2        153   69.0    1    0
5        184   82.0   10    0
6        213   86.0    2    0
18       540  142.0    1    0
21       547  142.0    1    0
23       564  148.0    0    1
25       566  148.0    0    0
26       567  148.0   11    0
35       608  169.0    2    1
36       610  171.0   11    0
(2388, 57)
[1 10 2 0 11]


In [None]:
notes = filtre_tonalité()
mode = 0
# Filter the DataFrame to keep only rows where 'note' is in the specified range and mode is Major
df_filtered = df[(df['note'].isin(notes)) & (df['mode'] == mode)]
print(df_filtered[['track_id', 'id', 'note', 'mode']].head(10))
print(df_filtered.shape)
#Notes and modes in the filtered DataFrame
print(df_filtered['note'].unique())
print(df_filtered['mode'].unique())

    track_id     id note mode
2        153   69.0    1    0
5        184   82.0   10    0
6        213   86.0    2    0
18       540  142.0    1    0
21       547  142.0    1    0
25       566  148.0    0    0
26       567  148.0   11    0
36       610  171.0   11    0
37       611  172.0    0    0
39       625  177.0    0    0
(1320, 57)
[1 10 2 0 11]
[0]


In [15]:
#genres in the dataset
print(df_filtered['genre_top'].unique())

['Rock' 'Pop' 'Folk' 'Hip-Hop' 'International' 'Blues' nan 'Experimental'
 'Old-Time / Historic' 'Electronic' 'Country' 'Jazz' 'Classical'
 'Instrumental' 'Soul-RnB']


In [16]:
#list of 'genres'(primary and secondary genres)
print(df_filtered['genres'].unique())

['[26]' '[27]' '[10]' '[17]' '[12]' '[21]' '[46]' '[3]' '[38, 184, 456]'
 '[1]' '[76]' '[25]' '[12, 85]' '[12, 89]' '[10, 12, 66]' '[12, 17]'
 '[10, 12]' '[30, 65]' '[25, 85]' '[8]' '[58]' '[15]' '[22, 38]'
 '[17, 103]' '[7, 10, 103]' '[12, 25, 85]' '[66]' '[17, 66, 76]'
 '[103, 137]' '[12, 66]' '[53, 66]' '[6, 7, 125]' '[12, 58, 66]' '[]'
 '[38, 79, 107]' '[10, 296]' '[297]' '[3, 17]' '[10, 103]' '[2, 8, 92]'
 '[38, 107, 224]' '[27, 38]' '[185]' '[15, 38, 42, 64]' '[4, 118, 179]'
 '[14, 21]' '[2, 103]' '[10, 89]' '[63, 137]' '[12, 103]' '[137]'
 '[26, 66]' '[12, 45, 111]' '[4]' '[58, 76]' '[53]' '[45]' '[8, 37]'
 '[17, 25]' '[12, 26]' '[33, 456]' '[66, 103]' '[33]' '[1, 138]'
 '[76, 170]' '[17, 63, 76]' '[15, 18, 26, 76]' '[17, 76, 362]'
 '[10, 27, 66]' '[9, 17, 27]' '[10, 66]' '[5, 15, 38, 46, 187]' '[27, 66]'
 '[10, 12, 66, 111]' '[38]' '[5, 456]' '[17, 33, 47, 53, 58, 66]' '[79]'
 '[15, 76]' '[26, 58, 66]' '[36, 38, 58]' '[66, 89]' '[85, 111]'
 '[26, 107, 359]' '[25, 71]' '[240, 29

In [17]:
#median and quantiles of the 'duration' column
print(df_filtered['duration'].median())
print(df_filtered['duration'].quantile(0.25))
print(df_filtered['duration'].quantile(0.75))

189.0
137.0
243.25


In [18]:
# Filter the DataFrame to keep only rows where 'duration' is between -5 and 5 seconds from the median
q1 = 184  
q3 = 194  

df_filtered = df_filtered[(df_filtered['duration'] >= q1) & (df_filtered['duration'] <= q3)]

print(df_filtered.head())
print(df_filtered.shape)

     track_id  comments         date_created        date_released    engineer  \
215      4929       0.0  2009-01-02 21:41:31  1999-01-03 00:00:00         NaN   
276      7713       0.0  2009-03-13 12:25:05  2005-03-03 00:00:00        OCDJ   
622     18584       0.0  2009-09-03 11:13:57                  NaN  kev reverb   
726     21083       1.0  2009-11-16 12:53:26  2009-10-29 00:00:00         NaN   
839     24736       0.0  2010-02-03 17:59:36                  NaN         NaN   

     favorites      id                                        information  \
215        3.0  1681.0  <p>Now you will be able to hear this unique an...   
276        1.0  2106.0  <p>Live on Trouble's show from March 3 2005 <b...   
622        0.0  4237.0                                                NaN   
726        4.0  4695.0  <p>From the icy depths of Canada comes “Pample...   
839        1.0  5412.0                                                NaN   

     listens           producer  ... listens.1 lyr

In [19]:
#Remove genres : nan 'Experimental' 'Instrumental' 'Clasical' and 'Jazz'
df_filtered = df_filtered[df_filtered['genre_top'].notna()]
df_filtered = df_filtered[df_filtered['genre_top'] != 'Experimental']
df_filtered = df_filtered[df_filtered['genre_top'] != 'Instrumental']
df_filtered = df_filtered[df_filtered['genre_top'] != 'Classical']
df_filtered = df_filtered[df_filtered['genre_top'] != 'Jazz']


#shape
print(df_filtered.shape)

(23, 57)


In [20]:
#print 'id_track' unique
print(df_filtered['track_id'].unique())

[  7713  18584  21083  27865  29448  29472  31427  34083  52036  52276
  53744  54298  57176  69781  72047  73768  74146  75588  85424  88861
  90829 108031 122883]


In [21]:
#genres still present
print(df_filtered['genres'].unique())

['[17, 103]' '[45]' '[297]' '[66, 89]' '[12, 66]' '[33]' '[12, 45, 111]'
 '[12]' '[76]' '[25, 85]' '[12, 25, 111]' '[25, 89]' '[94]' '[25, 111]'
 '[17]' '[130]' '[15, 236]' '[103]' '[10]' '[66, 109]' '[8]']


In [22]:
#save new dataset
df_filtered.to_csv('fma_metadata/filtered_tracks.csv', index=False)

In [23]:
#print title of each track
print(df_filtered['tracks'])

276      6.0
622      4.0
726      4.0
911      1.0
951      7.0
968     14.0
1085    13.0
1193     4.0
1922    13.0
1938     6.0
1974    11.0
2034     1.0
2179    18.0
2825     1.0
2905    14.0
2960    20.0
3003     8.0
3129    12.0
3785    10.0
3953    25.0
4043    21.0
5100    18.0
5448     5.0
Name: tracks, dtype: float64


In [24]:
#Supression of title.1 = 'Marbled Birds' and 'Futurey Gamey' (first is not present in the dataset, second is an instrumental)
df_filtered = df_filtered[df_filtered['title.1'] != 'Marbled Birds']
df_filtered = df_filtered[df_filtered['title.1'] != 'Futurey Gamey']

In [None]:
#Look for missing titles in the folder ./musiques
titles = df_filtered['title.1'].astype(str).tolist()  # Convert title.1 to a list of strings

# List all files in the specified folder
folder_path = './musiques' # Replace with your folder path
file_names = os.listdir(folder_path)

# Check if the titles are present in the file names
titles_in_files = [title for title in titles if any(title in file for file in file_names)]
titles_missing = [title for title in titles if title not in titles_in_files]

# Print the results
print(f"Found titles: {len(titles_in_files)} sur {len(titles)}")
print("missing titles:", titles_missing)

Found titles: 21 sur 21
missing titles: []
