## Extracting chords from ChordPro files of The Beatles Songs.

Data was collected from http://getsome.org/guitar/olga/chordpro/b/Beatles/

In [1]:
import re
import os
import numpy as np
import matplotlib.pyplot as plt

In [2]:
uniqueChordNames = ["A","B", "C", "D", "E", "F", "G"]       


In [3]:
def readChordProFile(chordProFile):
    with open(chordProFile, 'r') as file:
        print(chordProFile)
        content = file.read()
        return content

In [4]:
def extractChord(line):
    chord_pattern = r'\[([^\]]+)\]'
    finalChords = []
    lineChords = []
    if "[" in line:
        lineChords = re.findall(chord_pattern, line)
    for chord in lineChords:
        chord = chord.replace('H', 'B')
        chord = chord.replace('min', 'm')
        chord = chord.replace('maj', '')
        chord = chord.replace('sus', '')
        chord = chord.replace('dim', '')
        chord = chord.replace('(', '')
        chord = chord.replace('+', '')
        chord = chord.replace('?', '')
        chord = chord.split("/")[0]
        chord = chord.split("+")[0]
        if len(chord) > 3:
            print('Curring long chord ', chord)
            chord = chord[0:3]
        if chord[0:1] in uniqueChordNames:
            finalChords.append(chord)
    return finalChords

In [5]:
class Song:
    def __init__(self, chordProFile):
        self.chordProFile = chordProFile
        self.content = readChordProFile(chordProFile)
        self.chorus = self.getChorus()
        self.chords = self.getChords()
        self.title = self.getTitle()
        self.key = self.getKey()
        
    def getChords(self):
        chords = []
        
        for line in self.content.split("\n"):
            if "chorus}" in line.lower():
                chorus_chords = self.getChorus()
                chords.extend(chorus_chords)
                
            chords.extend(extractChord(line))
        print('Chord of the song: ' + self.chordProFile, chords)
        return chords
        
    def getChorus(self):
        pattern = r"{soc}(.*?){eoc}"
        match = re.search(pattern, self.content, re.DOTALL)
        chords = []
        if match:
            chorus = match.group(1).strip()
            chord_pattern = r"\[([^\]]*)\]"
            chords = re.findall(chord_pattern, chorus)
            print("Chords in chorus:", chords)
        else:
            print("Chorus section not found.")
        return chords
    
    def getTitle(self):
        pattern = r"\{title:\s*(.*?)\s*\}"
        match = re.search(pattern, self.content)
        title = ''
        if match:
            title = match.group(1)
            print("The title is:", title)
        else:
            print("Title information not found, using the file name.")
            title = os.path.splitext(os.path.basename(self.chordProFile))[0]
            print("The file name is being used as title:", title)
        return title
    
    def getKey(self):
        pattern = r"{key:\s*([A-G][#b]?)\s*}"
        match = re.search(pattern, self.content)
        key = ''
        if match:
            key = match.group(1)
            print("The key is:", key)
        else:
            print("Key information not found.")
        return key

In [6]:
def createChordDictFromChordProFiles(dataFolderPath):
    fileNames = os.listdir(dataFolderPath)
    chordsDict = {}
    longestChordLength = 0
    for file in fileNames:
        print(file)
        if '.DS_Store' in file:
            continue
        chordproFile = dataFolderPath + file
        song = Song(chordproFile)
        chordsDict[song.title] = song.chords
    return chordsDict

In [7]:
pathOfChordTxtFiles = './data/chordpro/'

songDict = createChordDictFromChordProFiles(pathOfChordTxtFiles)
print(songDict)

HelloGoodbye.chopro
./data/chordpro/HelloGoodbye.chopro
Chorus section not found.
Chord of the song: ./data/chordpro/HelloGoodbye.chopro ['C', 'G', 'D7', 'Em', 'D7', 'Em', 'D7', 'G', 'D7', 'C', 'G', 'C', 'Eb', 'G', 'C', 'F', 'G', 'C', 'G', 'D7', 'Em', 'D7', 'Em', 'D7', 'G', 'D7', 'C', 'G', 'C', 'Eb', 'G', 'C', 'F', 'G', 'C', 'G', 'D7', 'Em', 'D7', 'Em', 'D7', 'G', 'D7', 'C', 'G', 'C', 'Eb', 'G', 'C', 'F', 'G', 'C', 'G', 'D7', 'Em', 'D7', 'Em', 'D7', 'G', 'D7', 'C', 'G', 'C', 'Eb', 'G', 'C', 'F', 'G', 'C', 'Eb', 'G']
The title is: Hello, Goodbye
Key information not found.
TheContinuingStoryOfBungalowBill.chopro
./data/chordpro/TheContinuingStoryOfBungalowBill.chopro
Chorus section not found.
Chorus section not found.
Chorus section not found.
Chorus section not found.
Chord of the song: ./data/chordpro/TheContinuingStoryOfBungalowBill.chopro ['C', 'G', 'C', 'Fm', 'C', 'Fm', 'G', 'A', 'E7', 'A', 'Dm', 'A', 'Dm', 'E7', 'Am', 'C', 'F', 'G', 'Am', 'C', 'F', 'Em', 'G', 'Am', 'Fm', 'Am', 'C',

### Map the string chord names to integer values

In [8]:
distinctChordNames = list(sorted({ele for val in songDict.values() for ele in val}))
print(distinctChordNames)
print(len(distinctChordNames))

['A', 'A6', 'A7', 'A7a', 'A9', 'AD', 'Ab', 'Ab7', 'Am', 'Am6', 'Am7', 'B', 'B7', 'Bb', 'Bb6', 'Bbm', 'Bdim', 'Bm', 'Bm7', 'C', 'C#7', 'C#m', 'C+', 'C7', 'C74', 'C9', 'Cm', 'Cm7', 'D', 'D#', 'D4', 'D7', 'D7a', 'D9', 'Dm', 'Dm6', 'Dm7', 'E', 'E4', 'E6', 'E6a', 'E7', 'E7-', 'E7.', 'Eb', 'Eb6', 'Eb7', 'Em', 'Em6', 'Em7', 'F', 'F#', 'F#7', 'F#m', 'F6', 'F7', 'F7V', 'Fm', 'G', 'G#', 'G#0', 'G#7', 'G#m', 'G6', 'G6#', 'G6a', 'G7', 'G9', 'Gm', 'Gm6', 'Gm7', 'Hm']
72


In [9]:
def mapChordNamesToIndexes(chords):
    indexArr = []
    for i, chord in enumerate(chords):
        indexArr.append(distinctChordNames.index(chord))
    return indexArr

In [10]:
finalChordDict = {}
for song in songDict:
        chordArray = songDict[song]
        if len(chordArray) < 10:
            print('Chords are too short, ignoring the song ' + song)
        else:   
            finalChordDict[song] = mapChordNamesToIndexes(chordArray)
        
print(finalChordDict)

Chords are too short, ignoring the song ChristmastimeIsHereAgain
Chords are too short, ignoring the song Anna
Chords are too short, ignoring the song TooMuchMonkeyBusiness
Chords are too short, ignoring the song TwistAndShout
{'Hello, Goodbye': [19, 58, 31, 47, 31, 47, 31, 58, 31, 19, 58, 19, 44, 58, 19, 50, 58, 19, 58, 31, 47, 31, 47, 31, 58, 31, 19, 58, 19, 44, 58, 19, 50, 58, 19, 58, 31, 47, 31, 47, 31, 58, 31, 19, 58, 19, 44, 58, 19, 50, 58, 19, 58, 31, 47, 31, 47, 31, 58, 31, 19, 58, 19, 44, 58, 19, 50, 58, 19, 44, 58], 'The Continuing Story of Bungalow Bill': [19, 58, 19, 57, 19, 57, 58, 0, 41, 0, 34, 0, 34, 41, 8, 19, 50, 58, 8, 19, 50, 47, 58, 8, 57, 8, 19, 50, 58, 8, 19, 50, 58, 47, 58, 8, 57, 8, 19, 50, 58, 8, 19, 50, 58, 47, 58, 8, 57], 'Yellow Submarine': [58, 28, 19, 58, 47, 8, 23, 28, 58, 28, 19, 58, 47, 8, 23, 28, 58, 28, 19, 58, 47, 8, 23, 28, 58, 28, 19, 58, 47, 8, 23, 28, 58, 28, 58, 28, 58, 28, 19, 58, 47, 8, 23, 28, 58, 28, 19, 58, 58, 28, 58, 28, 58, 28, 19, 58, 47

### Create feature arrays

In [11]:
def getFeatureAndTargetArrForEachWindow(chords, windowLength, hopSize, index):
    featureArray = chords[index: index + windowLength]
    targetArray = chords[index + windowLength: index + windowLength + 1]
    return featureArray, targetArray

In [12]:
def getFeaturesAndTargetsByWindowAndHop(chordsDictionary, windowSize, hopSize):
    features = np.empty((0, windowSize), dtype=np.ndarray)
    targets = np.empty((0, 1), dtype=np.ndarray)
    for song in chordsDictionary:
        chordArray = chordsDictionary[song]
        if len(chordArray) < windowSize + 1:
            print('Chords are shorter than number of targets so skipping the song: ' + song)
            continue
        indexProcessed = 0
        while len(chordArray) - indexProcessed > windowSize + 1:
            featureArr, targetArray = getFeatureAndTargetArrForEachWindow(chordArray, windowSize, hopSize, indexProcessed)
            #features.extend(featureArr)
            featureArr = np.array(featureArr).reshape(1, len(featureArr))
            features = np.append(features, featureArr, axis=0)
            targetArray = np.array(targetArray).reshape(1, len(targetArray))
            targets = np.append(targets, targetArray, axis=0)
            #targets.extend(targetArr)
            indexProcessed = indexProcessed + hopSize

    features = np.array(features)
    targets = np.array(targets)
    print(features.shape)
    print(targets.shape)
    print('Slicing the song chords is done!')
    return features, targets

In [13]:
def fillChordArraysUntilDesiredLength(chordArr, desiredLength):
    finalArr = np.zeros(desiredLength, dtype=int)
    chordLength = len(chordArr)
    if chordLength < desiredLength:
        print('Chords are shorter than number of targets so repeating the chords: ' + song)
        quotient, remainder = divmod(desiredLength, chordLength) 
        print(str(chordLength) + " quotient " + str(quotient) + " remainder " + str(remainder))
        for step in range(quotient):
            finalArr[step*chordLength:(step+1)*chordLength] = chordArr
        finalArr[chordLength*quotient: desiredLength] = chordArr[0:remainder]
        print(len(finalArr))
    elif chordLength > desiredLength:
        finalArr = chordArr[0: desiredLength]
    else:
        finalArr = chordArr
    print(len(finalArr))
    print(finalArr)
    return finalArr

In [14]:
def getFeaturesAndTargetsOfFlatSong(chordsDictionary, desiredFeatureLength):
    features = np.zeros((len(chordsDictionary),desiredFeatureLength), dtype=int)
    targets = np.zeros((len(chordsDictionary),1), dtype=int)
    for i, song in enumerate(chordsDictionary):
        chordArray = chordsDictionary[song] 
        finalChords = fillChordArraysUntilDesiredLength(chordArray, desiredFeatureLength+1)
        features[i:] = finalChords[0:desiredFeatureLength]
        targets[i:] = finalChords[desiredFeatureLength:desiredFeatureLength+1]
    print(features.shape)
    print(targets.shape)
    print('Slicing the song chords is done!')
    return features, targets

In [15]:
windowSize = 16
hopSize = 8

featureLength = 50


pathOfChordTxtFiles = './data/chordsAsText/'
featureArr, targetArr = getFeaturesAndTargetsByWindowAndHop(finalChordDict, windowSize, hopSize)
#featureArr, targetArr = getFeaturesAndTargetsOfFlatSong(finalChordDict, featureLength)
print(featureArr.shape)
print(targetArr.shape)

print(featureArr)
#print(targetArr)

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 1 dimension(s)

## Train SVMR model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn import svm
import sklearn
import pandas as pd

In [None]:
targets = pd.Series(targetArr.flatten())

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(featureArr, targets, test_size=0.2, random_state=42)
feat_train, feat_test, tar_train, tar_test = train_test_split(featureArr, targets, test_size=0.3, random_state=4)

In [None]:
# Create SVR regression object
# C tolerand, epsilon the distance from the line
# The error term is instead handled in the constraints, where we set the absolute error less than or equal to a specified margin, called the maximum error, ϵ (epsilon).
# additional hyperparameter, C, that we can tune. As C increases, our tolerance for points outside of ϵ also increases. As C approaches 0, the tolerance approaches 0 and the equation collapses into the simplified (although sometimes infeasible) one.
regr = svm.SVR(C=1.0, epsilon=0.001, kernel='rbf')

# Train the model using the training sets
regr.fit(feat_train, tar_train)

# Make predictions using the testing set
tar_predict = regr.predict(feat_test)

In [None]:
#computing a set of performance metrics

#mean squared error (lower the better)
print('Mean squared error: %.4f'% sklearn.metrics.mean_squared_error(tar_test, tar_predict))

#mean absolute error (lower the better)
print('Mean absolute error: %.4f'% sklearn.metrics.mean_absolute_error(tar_test, tar_predict))

#maximum error (lower the better)
print('Max error squared error: %.4f'% sklearn.metrics.max_error(tar_test, tar_predict))

#median absolute error (lower the better)
print('Median absolute error: %.4f'% sklearn.metrics.median_absolute_error(tar_test, tar_predict))

#coefficient of determination (r2 score): 1 is perfect prediction (it can get arbitrary negative)
print('Coefficient of determination (R2 score): %.4f'% sklearn.metrics.r2_score(tar_test, tar_predict))

#explained variance score: 1 is perfect prediction (it can get arbitrary worse)
print('Explained variance score: %.4f'% sklearn.metrics.explained_variance_score(tar_test, tar_predict))

In [None]:
def accuracy(y_true, y_pred):
    return 1 - sklearn.metrics.mean_squared_error(y_true, y_pred)

# Calculate the accuracy of the model
acc = accuracy(tar_test, tar_predict)

# Plot the accuracy as a bar chart
plt.bar(["Accuracy"], [acc])
plt.title("Model Accuracy")
plt.show()