# MIDI Phraser (ver. 1.0)

***

Powered by tegridy-tools: https://github.com/asigalov61/tegridy-tools

***

#### Project Los Angeles

#### Tegridy Code 2022

***

# (SETUP ENVIRONMENT)

In [None]:
#@title Install all dependencies (run only once per session)

# Clone tegridy-tools into the current working directory. The import cell
# later changes into /content/tegridy-tools/tegridy-tools to import TMIDIX,
# so this cell is expected to run with /content as the working directory.
!git clone https://github.com/asigalov61/tegridy-tools
# tqdm supplies the progress bar used by the processing loop.
!pip install tqdm

In [None]:
#@title Import all needed modules

print('Loading needed modules. Please wait...')
import os
import copy
import math
import statistics
import random

from tqdm import tqdm

# Working directories used by the later cells: the source dataset, the
# combined per-file output and the one-file-per-phrase output.
# exist_ok=True makes the call safe to re-run without a separate
# exists() check.
for work_dir in ('/content/Dataset', '/content/Out', '/content/Out-Separated'):
    os.makedirs(work_dir, exist_ok=True)

print('Loading TMIDIX module...')
# TMIDIX is imported from the cloned repository by temporarily making its
# directory the working directory.
os.chdir('/content/tegridy-tools/tegridy-tools')

try:
    import TMIDIX
finally:
    # Always return to /content, even when the import fails, so that a
    # failed run does not leave the session inside the repository.
    os.chdir('/content/')

print('Done!')
print('Enjoy! :)')

# (DOWNLOAD SOURCE MIDI DATASET)

In [None]:
#@title Download original LAKH MIDI Dataset

# Work inside the dataset directory so the archive is unpacked there.
%cd /content/Dataset/

# NOTE(review): the archive is fetched over plain HTTP and is not checked
# against any checksum before being unpacked.
!wget 'http://hog.ee.columbia.edu/craffel/lmd/lmd_full.tar.gz'
# Unpack the archive, then delete it so only the extracted files remain
# under /content/Dataset (the file-list cell walks that whole directory).
!tar -xvf 'lmd_full.tar.gz'
!rm 'lmd_full.tar.gz'

# Return to the notebook's base directory.
%cd /content/

In [None]:
#@title Download original clean_midi MIDI Dataset
# Work inside the dataset directory so the archive is unpacked there.
%cd /content/Dataset/
# NOTE(review): the archive is fetched over plain HTTP and is not checked
# against any checksum before being unpacked.
!wget http://hog.ee.columbia.edu/craffel/lmd/clean_midi.tar.gz
# Unpack the archive, then delete it so only the extracted files remain
# under /content/Dataset (the file-list cell walks that whole directory).
!tar -xvf clean_midi.tar.gz
!rm clean_midi.tar.gz
# Return to the notebook's base directory.
%cd /content/

# (FILE LIST)

In [None]:
#@title Save file list
###########

print('Loading MIDI files...')
print('This may take a while on a large dataset in particular.')

dataset_addr = "/content/Dataset"

# Full path of every file found anywhere below the dataset directory.
# No extension filtering is done here; every file is listed.
filez = [os.path.join(folder, name)
         for folder, _subfolders, names in os.walk(dataset_addr)
         for name in names]
print('=' * 70)

if not filez:
    print('Could not find any MIDI files. Please check Dataset dir...')
    print('=' * 70)

# Shuffle in place so that later processing visits the files in random order.
print('Randomizing file list...')
random.shuffle(filez)

# Persist the shuffled list so it can be reloaded by the "Load file list" cell.
TMIDIX.Tegridy_Any_Pickle_File_Writer(filez, '/content/filez')

In [None]:
#@title Load file list
# Restore the shuffled file list written by the "Save file list" cell, so the
# dataset does not have to be walked (and reshuffled) again.
# NOTE(review): presumably this unpickles the file; only load a list that this
# notebook wrote itself - confirm against TMIDIX.
filez = TMIDIX.Tegridy_Any_Pickle_File_Reader('/content/filez')

# (PROCESS)

In [None]:
#@title Extract Phrases

# For every file in `filez` this cell:
#   1. reads the MIDI as a millisecond score and remaps every note onto a
#      fixed channel per instrument family (see patch_map),
#   2. splits the lead instrument's notes into phrases wherever the lead is
#      silent for at least PHRASE_SEPARATION_THRESHOLD_TIME,
#   3. writes one MIDI with all accepted phrases to /content/Out and one MIDI
#      per phrase (re-based to start at time 0) to /content/Out-Separated.
#
# Note events are indexed as used below: [1] start time, [2] duration,
# [3] channel, [4] pitch, [5] presumably velocity, and [6] the patch that
# this cell appends to each note.

print('=' * 70)
print('TMIDIX MIDI Processor')
print('=' * 70)
print('Starting up...')
print('=' * 70)

###########

START_FILE_NUMBER = 0
LAST_SAVED_BATCH_COUNT = 0

input_files_count = START_FILE_NUMBER
files_count = LAST_SAVED_BATCH_COUNT

melody_chords_f = []
stats = [0] * 16 # Notes kept per output channel, accumulated over all files

###########

MIN_NUMBER_OF_INSTRUMENTS = 2
MAX_NUMBER_OF_INSTRUMENTS = 12

LEAD_INSTRUMENT = 3 # Violin
MIN_LEAD_AVG_PITCH = 60
MONO_OR_POLY_LEAD = True # True = accept only phrases whose lead notes never overlap
TRIM_LAST_LEAD = True # True = measure gaps start-to-start, False = end-to-start

PHRASE_SEPARATION_THRESHOLD_TIME = 2500 # In ms

MINIMUM_PHRASE_LENGTH_IN_NOTES = 12 # Lead instrument notes
MAXIMUM_PHRASE_LENGTH_IN_NOTES = 512 # Lead instrument notes

MAX_MIDI_FILE_SIZE = 200000 # In bytes; files of this size or larger are skipped

###########

# Instrument families. The position of a family in this list is the output
# channel that its notes are moved to.
patch_map = [[0, 1, 2, 3, 4, 5, 6, 7], # Piano
             [24, 25, 26, 27, 28, 29, 30], # Guitar
             [32, 33, 34, 35, 36, 37, 38, 39], # Bass
             [40, 41], # Violin
             [42, 43], # Cello
             [46], # Harp
             [56, 57, 58, 59, 60], # Trumpet
             [71, 72], # Clarinet
             [73, 74, 75], # Flute
             [-1], # Drums
             [52, 53], # Choir
             [16, 17, 18, 19, 20] # Organ
             ]

# Reverse lookup (patch number -> output channel), built once instead of
# scanning patch_map for every single note.
patch_to_channel = {patch: channel
                    for channel, family in enumerate(patch_map)
                    for patch in family}

# Patch written for each of the 16 output channels.
OUTPUT_MIDI_PATCHES = [0, 24, 32, 40, 42, 46, 56, 71, 73, 0, 53, 19, 0, 0, 0, 0]

###########

def _load_remapped_notes(midi_path):
    """Read a MIDI file and return its notes moved onto the patch_map channels.

    Each returned note has the patch active on its original channel appended
    as element [6]. Notes whose patch is in no family are dropped, channel 9
    (drums) is kept untouched. `stats` is updated for every kept note.
    """
    # The context manager closes the file handle even if parsing fails.
    with open(midi_path, 'rb') as midi_file:
        score = TMIDIX.midi2ms_score(midi_file.read())

    # Element 0 of the score is not a track; tracks start at index 1.
    events = []
    for itrack in range(1, len(score)):
        for event in score[itrack]:
            if event[0] == 'note' or event[0] == 'patch_change':
                events.append(event)

    events.sort(key=lambda x: x[1])

    patches = [0] * 16 # Currently selected patch per input channel
    notes = []

    for event in events:
        if event[0] == 'patch_change':
            patches[event[2]] = event[3]
            continue

        event.append(patches[event[3]])

        if event[3] != 9: # Except the drums
            channel = patch_to_channel.get(event[6])
            if channel is None:
                event[3] = 15 # All other instruments/patches channel
                event[5] = max(80, event[5])
            else:
                event[3] = channel

        if event[3] < 12: # We won't write chans 12-16 for now...
            notes.append(event)
            stats[event[3]] += 1

    return notes


def _is_acceptable_phrase(phrase):
    """Return True if a list of lead notes passes the length and mono checks."""
    if len(phrase) < MINIMUM_PHRASE_LENGTH_IN_NOTES:
        return False
    if len(phrase) > MAXIMUM_PHRASE_LENGTH_IN_NOTES:
        return False

    if MONO_OR_POLY_LEAD:
        # Reject the phrase if any note starts together with the previous
        # note or before the previous note has ended.
        for prev, cur in zip(phrase, phrase[1:]):
            if cur[1] == prev[1] or cur[1] < prev[1] + prev[2]:
                return False

    return True


def _split_lead_into_phrases(lead_notes):
    """Cut time-ordered lead notes into phrases and return the accepted ones."""
    phrases = []
    if not lead_notes:
        return phrases

    current = []
    prev = lead_notes[0]

    for note in lead_notes:
        if TRIM_LAST_LEAD:
            gap = note[1] - prev[1]
        else:
            gap = note[1] - (prev[1] + prev[2])

        # A long enough silence closes the current phrase; the note that
        # follows the silence opens the next one.
        if gap >= PHRASE_SEPARATION_THRESHOLD_TIME and current:
            if _is_acceptable_phrase(current):
                phrases.append(current)
            current = []

        current.append(note)
        prev = note

    # The phrase still open when the lead notes run out has no silence after
    # it to close it, so it is validated here instead of being lost.
    if current and _is_acceptable_phrase(current):
        phrases.append(current)

    return phrases


def _collect_phrase_events(events, phrases):
    """Group time-ordered events of all instruments by the phrase they fall in.

    A phrase spans from the start of its first lead note to the start of its
    last lead note (both inclusive). Events outside every phrase are dropped.
    """
    bounds = [(phrase[0][1], phrase[-1][1]) for phrase in phrases]
    buckets = [[] for _ in bounds]

    index = 0
    for event in events:
        # Skip phrases that ended before this event, then test the event
        # against the phrase that is current now (so the event that moves us
        # to the next phrase is not thrown away).
        while index < len(bounds) and event[1] > bounds[index][1]:
            index += 1
        if index == len(bounds):
            break
        if event[1] >= bounds[index][0]:
            buckets[index].append(event)

    return buckets


def _rebase_to_zero(events):
    """Return copies of the events shifted so that the first one starts at 0."""
    origin = events[0][1]
    rebased = []
    for event in events:
        shifted = copy.deepcopy(event)
        shifted[1] = event[1] - origin
        rebased.append(shifted)
    return rebased


def _write_midi(events, output_file_name):
    """Write the events to a MIDI file using the fixed output patch list."""
    return TMIDIX.Tegridy_SONG_to_MIDI_Converter(events,
                                                 output_signature = 'Phraser',
                                                 output_file_name = output_file_name,
                                                 track_name='Project Los Angeles',
                                                 list_of_MIDI_patches=list(OUTPUT_MIDI_PATCHES),
                                                 number_of_ticks_per_quarter=500)


def _process_midi_file(midi_path):
    """Extract and write the phrases of one MIDI file.

    Returns True if output files were written, False if the file was skipped
    by one of the filters.
    """
    # Filtering out giant MIDIs
    if os.path.getsize(midi_path) >= MAX_MIDI_FILE_SIZE:
        return False

    notes = _load_remapped_notes(midi_path)
    if not notes:
        return False

    # Checking number of instruments in a composition
    num_instr = len({note[3] for note in notes})
    if num_instr < MIN_NUMBER_OF_INSTRUMENTS or num_instr > MAX_NUMBER_OF_INSTRUMENTS:
        return False

    # Sorting by pitch, then by start-time (the second sort is stable, so
    # simultaneous notes stay ordered from highest to lowest pitch)
    notes.sort(key=lambda x: x[4], reverse=True)
    notes.sort(key=lambda x: x[1])

    lead_notes = [note for note in notes if note[3] == LEAD_INSTRUMENT]
    if not lead_notes:
        return False

    if statistics.mean([note[4] for note in lead_notes]) < MIN_LEAD_AVG_PITCH:
        return False

    phrases = _split_lead_into_phrases(lead_notes)
    if not phrases:
        return False

    phrase_events = _collect_phrase_events(notes, phrases)

    # Strip only the final extension: names such as "Song.1.mid" and
    # "Song.2.mid" must not collapse onto the same output name.
    name = os.path.splitext(os.path.basename(midi_path))[0]

    # All phrases of the file together, with their original timing
    all_phrase_events = [event for bucket in phrase_events for event in bucket]
    _write_midi(all_phrase_events, '/content/Out/'+name)

    # Each phrase on its own, starting at time zero
    for number, bucket in enumerate(phrase_events, start=1):
        _write_midi(_rebase_to_zero(bucket),
                    '/content/Out-Separated/'+name+'_Phrase_'+str(number))

    return True

###########

print('Processing MIDI files. Please wait...')
print('=' * 70)

for f in tqdm(filez[START_FILE_NUMBER:]):
    try:
        input_files_count += 1

        if _process_midi_file(f):
            # Processed files counter
            files_count += 1

    except KeyboardInterrupt:
        # NOTE: nothing is actually written here; the loop just stops.
        print('Saving current progress and quitting...')
        break

    except Exception as ex:
        # Best-effort batch run: report the bad file and move on.
        print('WARNING !!!')
        print('=' * 70)
        print('Bad MIDI:', f)
        print('Error detected:', ex)
        print('=' * 70)
        continue

print('=' * 70)
print('Done!')
print('=' * 70)

# Congrats! You did it! :)