# Preprocessing: Feature Extraction for Data Cleaning

## 1. Set-up


In [0]:
# Connect Google Colab with Drive: mount the Drive at /content/drive so that the
# paths used later in this notebook (which all point below .../drive/My Drive/)
# can be read and written.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# install the music21 library
!pip install music21

In [0]:
# import libraries
from music21 import *
import pandas as pd
import numpy as np
import random
import shutil
import glob
import os

## 2. Extracting Instruments for Data Cleaning

First, we need to clean the data, because we cannot be sure that every MIDI file we collected contains only piano.

In [0]:
# function to open a midi file from path and convert it to a music21 stream
def open_midi(midi_path):
    """Read the MIDI file at ``midi_path`` and translate it to a stream.

    Parameters
    ----------
    midi_path : str
        Path of the MIDI file to read.

    Returns
    -------
    The object produced by ``midi.translate.midiFileToStream`` for the file.

    Raises
    ------
    Nothing is caught here: any error raised while opening, reading or
    translating the file propagates to the caller.
    """
    mf = midi.MidiFile()
    mf.open(midi_path)
    try:
        mf.read()
    finally:
        # Release the file handle even when parsing fails. The callers in this
        # notebook swallow parse errors, so without this every unreadable file
        # would leave a handle open.
        mf.close()
    return midi.translate.midiFileToStream(mf)

In [0]:
# Folder holding the collected MIDI files.
midi_folder_path = "../content/drive/My Drive/brain_music/data/audios/midi"
# Keep regular files only: the classification folders used later in this
# notebook (piano_only/, others/) live inside this same directory, and
# os.listdir would otherwise count them as MIDI files and let random.choice
# pick a directory. Sorting makes the list order reproducible between runs.
midis_list = sorted(
    name
    for name in os.listdir(midi_folder_path)
    if os.path.isfile(os.path.join(midi_folder_path, name))
)
print(len(midis_list),'midi files in our database')

In [0]:
# helper used to build the path of each midi when looping over a folder
def concat_path(path, child):
    """Glue ``path`` and ``child`` together with a single "/" in between.

    No normalisation is done: a ``path`` that already ends with "/" yields a
    doubled separator in the result.
    """
    pieces = (path, child)
    return "/".join(pieces)

In [0]:
# Sanity check on one sample: pick a random entry of midis_list
# (no random seed is set in the cells shown, so the pick changes between runs).
midi_file = random.choice(midis_list)
# Build the path of the picked entry inside midi_folder_path.
midi_path = concat_path(midi_folder_path, midi_file)
# Parse it with open_midi; base_midi is reused by the test cells further down.
base_midi = open_midi(midi_path)
# Bare last expression so the notebook displays the parsed object.
base_midi

In [0]:
# function to get all the distinct instruments (part names) found in a midi
def list_instruments(midi):
  """Return the distinct ``partName`` values of the parts of a parsed MIDI.

  Parameters
  ----------
  midi
      Parsed MIDI object exposing ``.parts.stream()``; each element of that
      stream must have a ``partName`` attribute.

  Returns
  -------
  list
      Each distinct ``partName`` once, in order of first appearance. The
      values are returned as-is, so ``None`` can be part of the list.
  """
  # dict.fromkeys drops duplicates while keeping first-seen order, so the
  # result is deterministic (list(set(...)) gave an arbitrary order).
  return list(dict.fromkeys(part.partName for part in midi.parts.stream()))

In [0]:
# test - list the part names of the sample MIDI (base_midi) to check whether
# it contains only piano or other instrument(s)
instrus = list_instruments(base_midi)
print("List of instruments found in the MIDI file:{}".format(instrus))

In [0]:
# function to return the list of instruments found in a MIDI, or the string 'issue' if the file cannot be processed
def extract_instrus(midi_path):
  """List the instruments of the MIDI file at ``midi_path``.

  Parameters
  ----------
  midi_path : str
      Path of the MIDI file to inspect.

  Returns
  -------
  list or str
      The result of ``list_instruments`` on the parsed file, or the sentinel
      string ``'issue'`` when opening/parsing/listing raises an exception.
  """
  try:
    return list_instruments(open_midi(midi_path))
  except Exception:
    # Best effort on purpose: one unreadable file must not stop a batch run.
    # Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupts
    # working during long loops.
    return 'issue'

In [0]:
# drop every element equal to `val` (called with None to strip missing names)
def remove_none_from_list(the_list, val):
    """Return a new list holding the elements of ``the_list`` that differ from ``val``.

    Order is preserved and the input is left untouched. Elements are compared
    with ``!=``; nothing else (e.g. duplicates) is removed.
    """
    kept = []
    for item in the_list:
        if item != val:
            kept.append(item)
    return kept

In [0]:
# function to assign a unique label to instruments: only_piano, other_single_instru, too_many_instrus or other
def clean_instrus(instrus_list):
  """Turn a raw list of instrument names into one classification label.

  Parameters
  ----------
  instrus_list : list of (str or None), None, or the string 'issue'
      Instrument names as extracted from a MIDI; ``None`` entries are ignored.
      ``None`` or ``'issue'`` as the whole argument mean extraction failed.

  Returns
  -------
  str
      ``'only_piano'``          exactly one distinct name and it is "piano"
      ``'other_single_instru'`` exactly one distinct name, not "piano"
      ``'too_many_instrus'``    more than one distinct name
      ``'other'``               no usable name (failure, empty, only None)
  """
  # Extraction failed upstream: nothing to classify.
  if instrus_list is None or instrus_list == 'issue':
    return 'other'
  # Normalise case/whitespace BEFORE de-duplicating, so that names differing
  # only by spelling (e.g. 'Piano' and 'piano ') count as one instrument.
  names = {name.strip().lower() for name in instrus_list if name is not None}
  if not names:
    return 'other'
  if len(names) > 1:
    return 'too_many_instrus'
  return 'only_piano' if names == {'piano'} else 'other_single_instru'

## 3. Extracting main key and mode as Features

Once we know which MIDIs contain only piano, another single instrument, or more than one instrument, we can extract the main key as a feature.

Since we need to classify music by emotion, we can derive the musical mode (a comparable metric) directly from the main key, and organize the MIDI files into minor vs. major subfolders inside each instrument-class folder.

In [0]:
# test - extract the main key of the sample MIDI (base_midi) with the 'key' analysis
music_analysis = base_midi.analyze('key')
print("Expected music key: {0}".format(music_analysis))

In [0]:
# function to return the main key of a MIDI as text, or the string 'issue' if it cannot be analysed
def analyze_key(midi_path):
  """Return the main key of the MIDI file at ``midi_path`` as a string.

  Parameters
  ----------
  midi_path : str
      Path of the MIDI file to analyse.

  Returns
  -------
  str
      The ``analyze('key')`` result formatted as text, or the sentinel string
      ``'issue'`` when opening or analysing the file raises an exception.
  """
  try:
    return "{0}".format(open_midi(midi_path).analyze('key'))
  except Exception:
    # Best effort on purpose: one unreadable file must not stop a batch run.
    # Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupts
    # working during long loops.
    return 'issue'

In [0]:
# map a key description to its mode label: 'major', 'minor' or 'others'
def extract_mode(key_value):
  """Return the mode named inside ``key_value``.

  'major' is looked for first, then 'minor' (case-sensitive containment
  test); when neither is present the result is 'others'.
  """
  for label in ('major', 'minor'):
    if label in key_value:
      return label
  return 'others'

In [0]:
# Paths used to organize the directory and classify MIDIs into folders and subfolders:
# files are separated first by instrument label, then by mode.
# Every path ends with "/" because the organizing loop appends the mode name directly to some of them.
# NOTE(review): others_nan_path, which the organizing loop uses for the 'other' label, is not defined in this cell.
source_path = "../content/drive/My Drive/brain_music/data/audios/midi/"
# piano-only files; minor/ and major/ hold the files whose mode was identified
piano_path = "../content/drive/My Drive/brain_music/data/audios/midi/piano_only/"
minor_path = "../content/drive/My Drive/brain_music/data/audios/midi/piano_only/minor/"
major_path = "../content/drive/My Drive/brain_music/data/audios/midi/piano_only/major/"
# non-piano files: single/ for one other instrument, more/ for several instruments
others_path = "../content/drive/My Drive/brain_music/data/audios/midi/others/"
others_single_path = "../content/drive/My Drive/brain_music/data/audios/midi/others/single/"
others_more_path = "../content/drive/My Drive/brain_music/data/audios/midi/others/more/"

In [0]:
# extracting instruments and mode to organize MIDIs across the directory into folders and subfolders
#
# Destination of each file found directly inside others_path:
#   only_piano          -> major_path / minor_path, or piano_path when the mode is 'others'
#   other_single_instru -> others_single_path + "<mode>/"
#   too_many_instrus    -> others_more_path + "<mode>/"
#   other               -> others_nan_path + "<mode>/"
# For the last three labels the file goes to the label's root folder when the mode is 'others'.

# NOTE(review): others_nan_path was used here without being defined anywhere,
# which raised NameError for every file labelled 'other'. The folder name
# "nan/" is an assumption that follows the single/ and more/ naming pattern -
# confirm it against the Drive layout.
others_nan_path = others_path + "nan/"

# Root folder for every non-piano label.
other_roots = {
  'other_single_instru': others_single_path,
  'too_many_instrus': others_more_path,
  'other': others_nan_path,
}

for midi_file in os.listdir(others_path):
  midi_path = concat_path(others_path, midi_file)
  # others_path also holds the destination sub-folders (single/, more/, ...);
  # they are not MIDI files and shutil.copy cannot copy a directory.
  if not os.path.isfile(midi_path):
    continue
  final_instrus = clean_instrus(extract_instrus(midi_path))
  mode = extract_mode(analyze_key(midi_path))
  if final_instrus == 'only_piano':
    destination = {'major': major_path, 'minor': minor_path}.get(mode, piano_path)
  elif final_instrus in other_roots:
    root = other_roots[final_instrus]
    destination = root if mode == 'others' else '{}{}/'.format(root, mode)
  else:
    # Unknown label: nothing to copy (same as the original fall-through).
    continue
  # shutil.copy does not create missing folders, so make sure the target exists.
  os.makedirs(destination, exist_ok=True)
  shutil.copy(midi_path, destination)