# pythagoras

Welcome to _pythagoras_. For more information about the project, visit the [project website](http://web.simmons.edu/~bellanti/pythagoras) or the Github repo.

---


---
# Load libraries and settings
---

In [32]:
import pandas as pd
import numpy as np
from lxml import etree
import regex as re
# from collections import Counter
# import matplotlib.pyplot as plt
# %matplotlib inline
import sqlite3
import csv
import os

---
# XML
---

m = re.match(r'.*/(\w*?)-(.*).xml',xml_filepath)
comp = m.group(1)
title = m.group(2).replace(' ','_')
print(comp, title)

In [76]:
d = "/Users/BrandonBel/Desktop/MusicXML_scores/"
xml_d = d + "xml/"
csv_d = d + "csv/"

for path in os.listdir(xml_d):
    xml_filepath = xml_d + path
    csv_filepath = xml_filepath.replace('xml','csv')

    if path.replace('xml','csv') not in os.listdir(csv_d):
        try:
            xml_to_csv(xml_filepath,csv_filepath)
            print("Conversion successful: %s --> %s" % (path, path.replace('xml','csv')))
        except:
            print("Conversion failed: %s " % path)


Conversion failed: Mozart-KV216-3_Rondo.xml 
Conversion successful: Beethoven-01_variation.xml --> Beethoven-01_variation.csv
Conversion successful: Beethoven-1_Horn1.xml --> Beethoven-1_Horn1.csv
Conversion failed: Schubert-Sinfonie Nr 7 - Satz 1.xml 
Conversion failed: Schubert-Sinfonie Nr 7 - Satz 2.xml 


### Get information about the work

In [8]:
print('\nINSTRUMENTATION:\n')

parts = []

for e in root.xpath('//score-part'):
    part = e.xpath('./@id')[0]
    parts.append(part)
    instr = e.xpath('.//instrument-name')[0].text
    print("\t%s (%s)" % (instr,part))


INSTRUMENTATION:

	Midi_2 (P1)
	Midi_2 (P2)


In [9]:
print('\nKEY & TIME\n')

attrib_list = root.xpath("//part[@id='P1']/measure[@number='1']/attributes/*")

key_dict = {
    '-7':'Cb major (Ab minor)',
    '-6':'Gb major (Eb minor)',
    '-5':'Db major (Bb minor)',
    '-4':'Ab major (F minor)',
    '-3':'Eb major (C minor)',
    '-2':'Bb major (G minor)',
    '-1':'F major (D minor)',
    '0':'C major (A minor)',
    '1':'G major (E minor)',
    '2':'D major (B minor)',
    '3':'A major (F# minor)',
    '4':'E major (C# minor)',
    '5':'B major (G# minor)',
    '6':'F# major (D# minor)',
    '7':'C# major (A# minor)'
}

for e in attrib_list:
    if e.tag == 'key':
        accidentals = e.xpath('./fifths')[0].text
        mode = e.xpath('./mode')[0].text
        key = key_dict[accidentals]
        print("\tKey signature:\t", key)
    if e.tag == 'time':
        beats = e.xpath('./beats')[0].text
        val = e.xpath('./beat-type')[0].text
        time = "%s/%s" % (beats,val)
        print("\tTime signature:\t",time)


KEY & TIME

	Key signature:	 D major (B minor)
	Time signature:	 4/4


### Build a dataframe with notes

In [75]:
def xml_to_csv(xml_filepath, csv_filepath):
    tree = etree.parse(xml_filepath)
    root = tree.getroot()

    with open(csv_filepath,'w') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(('part','measure','pitch','step','alter','octave','duration','type','dotted','rest','grace','cue'))
        notes = []

        for e in root.xpath('//note'):

            part = e.xpath('../..//@id')[0]

            measure = e.xpath('..//@number')[0]

            dotted = True if 'dot' in [child.tag for child in e] else False

            grace = True if 'grace' in [child.tag for child in e] else False

            cue = True if 'cue' in [child.tag for child in e] else False

            rest = True if 'rest' in [child.tag for child in e] else False

            duration_list = e.xpath('.//duration/text()')
            duration = duration_list[0] if len(duration_list)>0 else ''

            typ_list = e.xpath('.//type/text()')
            typ = typ_list[0] if len(typ_list)>0 else ''

            step_list = e.xpath('.//step/text()')
            step = step_list[0] if len(step_list)>0 else ''

            alter_list = e.xpath('.//alter/text()')
            alter = alter_list[0] if len(alter_list)>0 else ''
            if alter == '0':
                accidental = ''
            elif alter == '-1':
                accidental = 'b'
            elif alter == '1':
                accidental = '#'
            elif alter == '-2':
                accidental = 'bb'
            elif alter == '2':
                accidental = '##'
            else:
                accidental = ''

            pitch = step + accidental

            octave_list = e.xpath('.//octave/text()')
            octave = octave_list[0] if len(step_list)>0 else ''

            # notes.append((part,measure,pitch,step,alter,octave,duration,typ,dotted,rest,grace,cue))
            writer.writerow((part,measure,pitch,step,alter,octave,duration,typ,dotted,rest,grace,cue))


---
# Pandas Dataframe
---

In [84]:
df = pd.DataFrame(notes,columns=['part','measure','pitch','step','alter','octave','duration','type','dotted','rest','grace','cue'])

In [10]:
pitch_dict = {'A0':'1','G##0':'1','Bbb0':'1','A#0':'2','Bb0':'2','Cbb1':'2','B0':'3','Cb1':'3','A##0':'3','C1':'4','B#0':'4','Dbb1':'4','C#1':'5','Db1':'5','B##1':'5','D1':'6','C##1':'6','Dbb1':'6','D#1':'7','Eb1':'7','Fbb1':'7','E1':'8','Fb1':'8','D##1':'8','1':'8','F1':'9','E#1':'9','Gbb1':'9','F#1':'10','Gb1':'10','E##1':'10','G1':'11','F##1':'11','Abb1':'11','G#1':'12','Ab1':'12','A1':'13','G##1':'13','Bbb1':'13','A#1':'14','Bb1':'14','Cbb1':'14','B1':'15','Cb1':'15','A##1':'15','C2':'16','B#2':'16','Dbb2':'16','C#2':'17','Db2':'17','B##2':'17','D2':'18','C##2':'18','Dbb2':'18','D#2':'19','Eb2':'19','Fbb2':'19','E2':'20','Fb2':'20','D##2':'20','2':'20','F2':'21','E#2':'21','Gbb2':'21','F#2':'22','Gb2':'22','E##2':'22','G2':'23','F##2':'23','Abb2':'23','G#2':'24','Ab2':'24','A2':'25','G##2':'25','Bbb2':'25','A#2':'26','Bb2':'26','Cbb2':'26','B2':'27','Cb2':'27','A##2':'27','C3':'28','B#3':'28','Dbb3':'28','C#3':'29','Db3':'29','B##3':'29','D3':'30','C##3':'30','Dbb3':'30','D#3':'31','Eb3':'31','Fbb3':'31','E3':'32','Fb3':'32','D##3':'32','3':'32','F3':'33','E#3':'33','Gbb3':'33','F#3':'34','Gb3':'34','E##3':'34','G3':'35','F##3':'35','Abb3':'35','G#3':'36','Ab3':'36','A3':'37','G##3':'37','Bbb3':'37','A#3':'38','Bb3':'38','Cbb3':'38','B3':'39','Cb3':'39','A##3':'39','C4':'40','B#4':'40','Dbb4':'40','C#4':'41','Db4':'41','B##4':'41','D4':'42','C##4':'42','Dbb4':'42','D#4':'43','Eb4':'43','Fbb4':'43','E4':'44','Fb4':'44','D##4':'44','4':'44','F4':'45','E#4':'45','Gbb4':'45','F#4':'46','Gb4':'46','E##4':'46','G4':'47','F##4':'47','Abb4':'47','G#4':'48','Ab4':'48','A4':'49','G##4':'49','Bbb4':'49','A#4':'50','Bb4':'50','Cbb4':'50','B4':'51','Cb4':'51','A##4':'51','C5':'52','B#5':'52','Dbb5':'52','C#5':'53','Db5':'53','B##5':'53','D5':'54','C##5':'54','Dbb5':'54','D#5':'55','Eb5':'55','Fbb5':'55','E5':'56','Fb5':'56','D##5':'56','5':'56','F5':'57','E#5':'57','Gbb5':'57','F#5':'58','Gb5':'58','E##5':'58','G5':'59','F##5':'59','Abb5':'59','G#5':'60','Ab5':'60','A5':'61','G##5':'61','Bbb5':'61','A#5':'62','Bb5':'62','Cbb5':'62','B5':'63','Cb5':'63','A##5':'63','C6':'64','B#6':'64','Dbb6':'64','C#6':'65','Db6':'65','B##6':'65','D6':'66','C##6':'66','Dbb6':'66','D#6':'67','Eb6':'67','Fbb6':'67','E6':'68','Fb6':'68','D##6':'68','6':'68','F6':'69','E#6':'69','Gbb6':'69','F#6':'70','Gb6':'70','E##6':'70','G6':'71','F##6':'71','Abb6':'71','G#6':'72','Ab6':'72','A6':'73','G##6':'73','Bbb6':'73','A#6':'74','Bb6':'74','Cbb6':'74','B6':'75','Cb6':'75','A##6':'75','C7':'76','B#7':'76','Dbb7':'76','C#7':'77','Db7':'77','B##7':'77','D7':'78','C##7':'78','Dbb7':'78','D#7':'79','Eb7':'79','Fbb7':'79','E7':'80','Fb7':'80','D##7':'80','7':'80','F7':'81','E#7':'81','Gbb7':'81','F#7':'82','Gb7':'82','E##7':'82','G7':'83','F##7':'83','Abb7':'83','G#7':'84','Ab7':'84','A7':'85','G##7':'85','Bbb7':'85','A#7':'86','Bb7':'86','Cbb8':'86','B7':'87','Cb8':'87','A##7':'87','C8':'88','B#7':'88','Dbb8':'88'}

In [85]:
df['keyboard_step'] = df['pitch'] + df['octave']
df['keyboard_step'] = df['keyboard_step'].apply(lambda x: int(pitch_dict[x]) if len(x)>0 else None)

In [86]:
df['interval'] = df['keyboard_step'].diff()

In [87]:
name = re.findall(r'.*\/(.*?)\.xml',xml_filepath)[0]
outfile_path = '../scores/csv/' + name + '.csv'

print(outfile_path)
df.to_csv(outfile_path)

../scores/csv/Mozart-Ah_vous.csv


In [48]:
m = re.match(r'(\w*?)-(.*)', name)
composer = m.group(1)
title = m.group(2)
print(composer,title)

Mozart Ah_vous


---
# Regex patterns
---
Using the string generated from the dataframe series, match any reoccuring patterns longer than a given length.

In [88]:
df

Unnamed: 0,part,measure,pitch,step,alter,octave,duration,type,dotted,rest,grace,cue,keyboard_step,interval
0,P1,1,C,C,,5,480,quarter,False,False,False,False,52.0,
1,P1,1,C,C,,5,480,quarter,False,False,False,False,52.0,0.0
2,P1,2,G,G,,5,480,quarter,False,False,False,False,59.0,7.0
3,P1,2,G,G,,5,480,quarter,False,False,False,False,59.0,0.0
4,P1,3,A,A,,5,480,quarter,False,False,False,False,61.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3776,P2,325,E,E,,3,480,quarter,False,False,False,False,32.0,4.0
3777,P2,325,G,G,,3,480,quarter,False,False,False,False,35.0,3.0
3778,P2,325,C,C,,4,480,quarter,False,False,False,False,40.0,5.0
3779,P2,325,,,,,480,quarter,False,True,False,False,,


In [89]:
df = df[df['part'] == 'P1'].fillna('')
steps = df['interval'].to_string(index=False, header=False)

# steps = re.sub(r'\n|\t| ','',steps)
steps = re.sub(r'\n','.',steps)
steps = re.sub(r' ','',steps)


print(steps)

.0.7.0.2.0.-2.0.-2.0.-1.0.-2.0.2.-4.7.0.-2.0.-1.0.-2.0.5.0.-2.0.-1.0.1.-1.-2.-2.0.7.0.2.0.-2.0.-2.0.-1.0.-2.0.2.-4.2.-2.-1.1.-1.1.-1.1.9.-2.-1.1.-1.1.-1.1.1.1.3.-1.3.-2.-1.-2.0.-2.9.-2.-2.-1.-2.-2.0.-2.9.-2.-1.-2.-2.-2.0.-1.8.-1.-2.-2.-2.-1.-2.7.-2.-8.1.....-2.-1.1.-1.1.2.-2.0.-2.-1.1.-1.1.2.-2.0.-1.-1.1.-1.1.1.-1.0.-2.-1.1.-1.1.2.-2.7.-2.-1.1.9.-4.-3.-2.0.-2.-1.1.9.-3.-4.-2.0.-1.-1.1.8.-5.-2.-1.3.-3.-2.0.-2.-1.1.-1.1.-1.1.9.-2.-1.1.-1.1.-1.1.1.1.3.-1.3.-2.-1.-2.0.-2.9.-2.-2.-1.-2.-2.0.-2.9.-2.-1.-2.-2.-2.0.-1.8.-1.-2.-2.-2.-1.-2.7.-2.-8.1.....0...0.-7.0.9.-2.-2.-5.7.-2.-1.-4.5.0.-1.-2.-2.-5.9.0.-2.-2.-5.2.5.0.-2.2.-5.-2.-2.7...-3...0.-7.0...0.-5.-1......2.2.1.-1.-2.-3...0.-7.0...0.-5.-1...9.-2.2.1.-1.-2.-7.-3.3.5.0.-1.1.....0...9.0.-2.-2.-5.0.7.0.-2.-1.-4.-2.8.-1.-1.-2.-5.-1.8.0.-2.-2.-5.2.5.0.-2.-1.-2.-2.-2.7.-5.-3...4.3.5.4.3....-5.-2.-1.-2.-2....-1.1.3.-1.-2....-3.1.0.5.4....-11.0.0.6.3...-10.0.0.5.3...-10.7.0.-2.-8......-1.1.9.-4.-5...-3.1.12.-6.-6....-2.1.8.-5.-3....-3.1.3.-3.-3.

In [90]:
steps = steps[:200]
steps

'.0.7.0.2.0.-2.0.-2.0.-1.0.-2.0.2.-4.7.0.-2.0.-1.0.-2.0.5.0.-2.0.-1.0.1.-1.-2.-2.0.7.0.2.0.-2.0.-2.0.-1.0.-2.0.2.-4.2.-2.-1.1.-1.1.-1.1.9.-2.-1.1.-1.1.-1.1.1.1.3.-1.3.-2.-1.-2.0.-2.9.-2.-2.-1.-2.-2.0.-'

In [91]:
min_length = 4
min_occur = 3

# pat = r'(.{%d,})(?:.*\1){%d,}' % (min_length, min_occur-1) # backup copy
# pat = r'(.{%d,})(?:.*\1){%d,}' % (min_length, min_occur-1) # search w/ no delimiters
pat = r'((?:[^.]*\.){%d,})(?:.*\1){%d,}' % (min_length, min_occur-1)

step_patterns = re.findall(pat,steps,overlapped=True)

for pattern in step_patterns:
    print(pattern)

.0.-2.0.
.-2.0.-2.
.0.-2.0.-1.0.-2.0.
0.-2.0.-1.0.-2.0.
.-2.0.-1.0.-2.0.
-2.0.-1.0.-2.0.
2.0.-1.0.-2.0.
.0.-1.0.-2.0.
0.-1.0.-2.0.
.-1.0.-2.0.
-1.0.-2.0.
1.0.-2.0.
.0.-2.0.
.0.-2.0.-1.0.
0.-2.0.-1.0.
.-2.0.-1.0.
-2.0.-1.0.
2.0.-1.0.
.0.-1.0.
.0.-2.0.
.0.-2.0.


---
# SQLite3 implementation
---
[SQLite3 documentation](https://docs.python.org/3.7/library/sqlite3.html)

## Create tables

In [77]:
import sqlite3
conn = sqlite3.connect('pythagoras.db')
c = conn.cursor()

In [84]:
c.execute('''CREATE TABLE IF NOT EXISTS work (
    work_id INTEGER PRIMARY KEY AUTOINCREMENT,
    work_title text,
    work_composer text
    );''')
conn.commit()

In [91]:
c.execute('''CREATE TABLE IF NOT EXISTS pattern (
    pattern_id INTEGER PRIMARY KEY AUTOINCREMENT,
    pattern_string text
    );''')
conn.commit()

In [86]:
c.execute('''CREATE TABLE IF NOT EXISTS work_pattern (
    work_id INTEGER,
    pattern_id INTEGER,
    FOREIGN KEY (work_id) REFERENCES work(work_id),
    FOREIGN KEY (pattern_id) REFERENCES pattern(pattern_id)
    );''')
conn.commit()

In [92]:
# show tables
print(c.execute('''SELECT name from sqlite_master where type="table";''').fetchall())

# show columns
print(c.execute('''PRAGMA table_info(pattern);''').fetchall())

[('work',), ('sqlite_sequence',), ('work_pattern',), ('pattern',)]
[(0, 'pattern_id', 'INTEGER', 0, None, 1), (1, 'pattern_string', 'text', 0, None, 0)]


## Insert values into tables

In [103]:
files = os.listdir("/Users/BrandonBel/Desktop/MusicXML_scores/csv/")
for file in files:
    try:
        m = re.match(r'(\w*?)-(.*).csv',file)
        composer = m.group(1)
        title = m.group(2)
#         print("\nComposer:", composer, "\nTitle: ", title)
        
        if c.execute('''SELECT COUNT(work_title) FROM work WHERE work_title=(?)''', (title,)).fetchone()[0] == 0:
            c.execute('''INSERT INTO work(work_title, work_composer) VALUES (?,?)''',(title,composer))

    except:
        print("Cannot match: %s" % file)

Cannot match: .DS_Store


In [104]:
for row in c.execute('''SELECT * FROM work;'''):
    print(row)

(1, 'Duo2b', 'Beethoven')
(2, 'SATZ1', 'Handel')
(3, 'D 459, 2 KlavierstÅcke', 'Schubert')
(4, 'Opus 22,2', 'Beethoven')
(5, 'Satz1', 'Beethoven')
(6, 'XI-Menuett', 'Mozart')
(7, 'Sonatine Mand c', 'Beethoven')
(8, 'D 002, Fantasie', 'Schubert')
(9, 'Opus 101,1', 'Beethoven')
(10, 'Opus 22,3', 'Beethoven')
(11, 'Duo2c', 'Beethoven')
(12, 'op123-2-Gloria', 'Beethoven')
(13, 'Duo2a', 'Beethoven')
(14, 'FRöS4', 'Beethoven')
(15, '04_variation', 'Beethoven')
(16, 'SATZ2', 'Handel')
(17, 'op3_4Adagio', 'Beethoven')
(18, '13_variation', 'Beethoven')
(19, 'Opus 22,1', 'Beethoven')
(20, 'op3_2Andante', 'Beethoven')
(21, 'Opus 101,3', 'Beethoven')
(22, 'Satz2', 'Beethoven')
(23, 'Andante favori Opus 35', 'Beethoven')
(24, '5_viola', 'Beethoven')
(25, '1_Exposito', 'Schubert')
(26, 'Opus 26,4', 'Beethoven')
(27, 'SonateNr15_E-Dur', 'Handel')
(28, 'SONAT_1', 'Beethoven')
(29, 'Satz3', 'Beethoven')
(30, 'Schweizer_Lied', 'Beethoven')
(31, 'Opus 101,2', 'Beethoven')
(32, 'SATZ3', 'Handel')
(33, 'FR

In [113]:
for row in c.execute('''SELECT COUNT(work_title) FROM work WHERE work_composer = "Beethoven";'''):
    print(row)

(219,)


In [108]:
for row in c.execute('''SELECT * FROM work WHERE work_title LIKE "%opus%";'''):
    print(row)

(4, 'Opus 22,2', 'Beethoven')
(9, 'Opus 101,1', 'Beethoven')
(10, 'Opus 22,3', 'Beethoven')
(19, 'Opus 22,1', 'Beethoven')
(21, 'Opus 101,3', 'Beethoven')
(23, 'Andante favori Opus 35', 'Beethoven')
(26, 'Opus 26,4', 'Beethoven')
(31, 'Opus 101,2', 'Beethoven')
(43, 'Opus 22,4', 'Beethoven')
(44, 'Opus27,Nr.1', 'Chopin')
(46, 'Opus 10-2,3', 'Beethoven')
(48, 'Opus 81a,2', 'Beethoven')
(50, 'Opus64,Nr.2', 'Chopin')
(51, 'Opus 26,1', 'Beethoven')
(55, 'Opus 81a,3', 'Beethoven')
(57, 'Opus 10-2,2', 'Beethoven')
(67, 'Opus 81a,1', 'Beethoven')
(68, 'Opus64,Nr.1', 'Chopin')
(69, 'Opus 26,3', 'Beethoven')
(70, 'Opus 26,2', 'Beethoven')
(71, 'Opus 10-2,1', 'Beethoven')
(80, 'Opus 27-1,3', 'Beethoven')
(90, 'Opus 106,4', 'Beethoven')
(91, 'Opus 27-1,2', 'Beethoven')
(109, 'Opus25,Nr.9', 'Chopin')
(114, 'Opus 27-1,1', 'Beethoven')
(115, 'Opus 10-3,4', 'Beethoven')
(121, 'Opus 10-1,2', 'Beethoven')
(125, 'Opus 106,3', 'Beethoven')
(129, 'Opus09,Nr.1', 'Chopin')
(130, 'Opus 79,1', 'Beethoven')
(1

In [114]:
conn.close()

---
# Vizualizations
---