# Senior Project Notebook  
In the first phase, I downloaded the matched dataset from the Lakh MIDI Dataset and removed the songs with duplicate MIDI files. After that, I ran a web scraper against the Spotify API to collect the features Spotify provides for each song, such as tempo, danceability and so on, matching the artists and songs against those available on Spotify. After removing all the duplicates and unmatched songs, 3800 songs are left for training.
The songs are in the clean_midi folder, where each subfolder name specifies the artist name and each file name specifies the song name. The scraper I used stores its data in a file called statistics.csv.


In [1]:
from music21 import converter, instrument, note, chord
import matplotlib
import pandas as pd
import numpy as np
from math import ceil
from threading import Thread, Lock

# Number of worker threads used to convert the MIDI files into input matrices.
num_threads = 8

# Semitone offset of each natural note letter above C within a single octave.
note_dict = dict(C=0, D=2, E=4, F=5, G=7, A=9, B=11)

In [2]:
# Flag whether a song belongs to a genre family
def genre_extractor(row, lookup_genre):
    """Return 1 if any entry of ``row['genres']`` contains ``lookup_genre``, else 0.

    The match is a substring test, so looking up ``'pop'`` also matches an
    entry such as ``'dance pop'``.
    """
    return int(any(lookup_genre in candidate for candidate in row['genres']))

# Helper function
def string_to_list(row):
    """Parse the textual list stored in ``row['genres']`` into a list of strings.

    The value is expected to look like ``"['pop', 'dance pop']"``: the outer
    brackets are dropped, the remainder is split on commas, and the quote
    character surrounding each item is removed.

    An empty list (``"[]"``) yields ``[]`` rather than a list holding a single
    empty string.
    """
    inner = row['genres'][1:-1]
    # "".split(",") returns [""], so blank pieces must be skipped explicitly.
    return [item.strip()[1:-1] for item in inner.split(",") if item.strip()]

In [3]:
# Function to get numerical note value from text
def get_notes(text):
    """Map a pitch name with octave (e.g. ``'C4'``, ``'E-3'``, ``'F#5'``) to a column index.

    The index is ``(octave - 1) * 12`` plus the semitone offset of the note
    letter, adjusted by one semitone per accidental (``'-'`` lowers, ``'#'``
    raises), so ``'C1'`` maps to 0 and ``'C7'`` maps to 72.

    Values outside 0..72 are folded back into that range a whole octave at a
    time. Without the lower fold a pitch such as ``'C0'`` would return a
    negative number, which wraps around when used as an array index.

    NOTE(review): the octave is read from the last character only, so a name
    such as ``'C-1'`` is treated as C-flat in octave 1 -- confirm whether
    negative octaves can occur in the input.
    """
    val = (int(text[-1]) - 1) * 12 + note_dict[text[0]]
    # Everything between the note letter and the octave digit is an accidental;
    # counting all of them handles double sharps and double flats as well.
    for accidental in text[1:-1]:
        if accidental == '-':
            val -= 1
        elif accidental == '#':
            val += 1
    while val > 72:
        val -= 12
    while val < 0:
        val += 12
    return val

### Input data construction  
Here we construct a matrix whose number of rows is the song duration (in quarter notes) * 4, so that each row represents one quarter of a quarter note.  
The columns consist of the 73 notes from C1 to C7 plus an additional duration feature.

In [4]:
def midi_to_input(artist, song, key):
    """Convert one MIDI file into a (time step x pitch) input matrix.

    The file is read from ``./clean_midi/<artist>/<song>.mid``. When the score
    can be partitioned by instrument, only the part with the most elements is
    used; otherwise all notes of the score are used. The notes are transposed
    with ``transpose`` according to ``key`` before being written to the matrix.

    Parameters:
        artist: folder name of the artist inside ``./clean_midi``.
        song: file name of the song, without the ``.mid`` extension.
        key: key of the song, forwarded to ``transpose``
            (presumably a pitch class 0-11 -- TODO confirm against the CSV).

    Returns:
        A numpy array with ``ceil(duration * 4) + 1`` rows and 74 columns,
        where ``duration`` is the stream length in quarter notes. Columns
        0-72 hold the velocity scalar of each note at the time step where it
        starts; column 73 holds the longest duration (in quarter notes) among
        the notes/chords starting at that time step.

    Side effects:
        Prints the song name.
    """
    midi = converter.parse('./clean_midi/' + artist+'/'+ song + '.mid')
    parts = instrument.partitionByInstrument(midi)
    print(song)

    if parts:
        # Several instruments: keep only the part holding the most elements.
        notes_to_parse = max(parts.parts, key=len).flat.notes
    else:
        # Single instrument
        notes_to_parse = midi.flat.notes

    # Analyse the pitch range once and reuse both ends of it.
    ambitus = notes_to_parse.analyze('ambitus')
    transposed = transpose(notes_to_parse, key, ambitus.noteStart, ambitus.noteEnd)

    duration = notes_to_parse.duration.quarterLength
    notes = np.zeros((ceil(duration*4) + 1, 74))
    for element in transposed:
        # A chord is handled as the collection of its notes; anything that is
        # neither a note nor a chord is ignored.
        if isinstance(element, note.Note):
            sounding = [element]
        elif isinstance(element, chord.Chord):
            sounding = list(element)
        else:
            continue
        # Four rows per quarter note.
        timestep = int(round(element.offset*4))
        for pitched in sounding:
            notes[timestep, get_notes(pitched.pitch.nameWithOctave)] = pitched.volume.velocityScalar
        notes[timestep, 73] = max(notes[timestep, 73], element.duration.quarterLength)
    return notes

In [5]:
def transpose(note_stream, key,spectrumStart,spectrumEnd):
    """Return ``note_stream`` transposed by an offset derived from ``key``.

    The stream is shifted down by ``key`` semitones when ``key`` is below 6
    and does not exceed the numeric value of the lowest pitch
    (``spectrumStart``); in every other case it is shifted up by
    ``12 - key`` semitones. ``spectrumEnd`` is accepted but not used.
    """
    lowest = get_notes(spectrumStart.nameWithOctave)
    can_shift_down = (key < 6) and (key <= lowest)
    semitones = -1 * key if can_shift_down else 12 - key
    return note_stream.transpose(semitones)

In [6]:
### Check whether the statistics with genres are already present in the folder

In [7]:
# Reuse the cached table when it exists; otherwise build it from the raw CSV
# and cache the result for the next run.
try:
    songs = pd.read_pickle("./statistics_with_genres.pkl")
except FileNotFoundError:
    songs = pd.read_csv('./statistics.csv')
    # The genres column is stored as text; turn it into real lists first.
    songs['genres'] = songs.apply(string_to_list, axis=1)
    genre_list = [
        'pop', 'rock', 'jazz', 'soul', 'classical',
        'electronic', 'dance', 'metal', 'disco', 'funk',
    ]

    # One 0/1 indicator column per genre family.
    for genre in genre_list:
        songs[f'is_{genre}'] = songs.apply(genre_extractor, axis=1, args=(genre,))

    songs.to_pickle("./statistics_with_genres.pkl")

In [8]:
def lstm_input_from_df(df):
    """Add an ``LSTM_input`` column to ``df`` in place.

    Each row's value is the matrix returned by ``midi_to_input`` for that
    row's ``artist_name``, ``song_name`` and ``key``. Nothing is returned.
    """
    def _row_to_matrix(row):
        return midi_to_input(row.artist_name, row.song_name, row.key)

    df['LSTM_input'] = df.apply(_row_to_matrix, axis=1)

In [9]:
# Split the song table into one contiguous chunk per worker thread; the last
# chunk also receives the rows left over by the integer division.
# NOTE(review): the per-song work looks CPU-bound, so threads may not give a
# real speed-up under the GIL -- consider processes if this is too slow.
songs_per_thread = len(songs) // num_threads
df_list = []
thread_list = []
for i in range(num_threads):
    start = i * songs_per_thread
    stop = None if i == num_threads - 1 else start + songs_per_thread
    # Copy the slice so each worker adds its column to an independent frame
    # instead of assigning onto a slice of `songs`.
    df = songs.iloc[start:stop].copy()
    df_list.append(df)
    thread_list.append(Thread(target=lstm_input_from_df, args=(df,)))

for worker in thread_list:
    worker.start()

# Wait for every worker; the results live in the frames kept in df_list.
for worker in thread_list:
    worker.join()

print("Finished extracting song input matrices")

The Sweetest Taboo
Amish Paradise
One Way Ticket (Because I Can)
Saxy Mood
Hyper-Ballad
I Just Shot John Lennon
The Sweetest Taboo
Amish Paradise
Hyper-Ballad
One Way Ticket (Because I Can)
Saxy Mood
How Do I Live
Headline News
Keep Looking
Eat It
I Just Shot John Lennon
Bachelorette
Joga
Your Love Is King
Dare to Be Stupid
Io camminero
Siempre Hay Esperanza
It's Oh So Quiet
Around the World   Harder Better Faster Stronger
Venus As A Boy
Whatta Man
The Lamb Lies Down on Broadway
Mi manchi
Mer losse d'r Dom en Kolle
Dreadlock Holiday
Canto alla luna


KeyboardInterrupt: 

Let's Talk About Sex
The Lamb Lies Down on Broadway
A chi
One More Time


Exception in thread Thread-7:
Traceback (most recent call last):
  File "/home/ymirkhang/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/ymirkhang/anaconda3/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-8-e4fbc914f5f5>", line 2, in lstm_input_from_df
    df['LSTM_input'] = df.apply(lambda row: midi_to_input(row.artist_name, row.song_name, row.key), axis = 1)
  File "/home/ymirkhang/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py", line 6004, in apply
    return op.get_result()
  File "/home/ymirkhang/anaconda3/lib/python3.6/site-packages/pandas/core/apply.py", line 142, in get_result
    return self.apply_standard()
  File "/home/ymirkhang/anaconda3/lib/python3.6/site-packages/pandas/core/apply.py", line 248, in apply_standard
    self.apply_series_generator()
  File "/home/ymirkhang/anaconda3/lib/python3.6/site-packages/pandas/core/apply.py", line 277, in a

Canto alla luna
Angeli negri (Angelitos negros)


In [None]:
# Peek at the first genre of the second song to sanity-check the parsed lists.
songs.iloc[1]['genres'][0]
