Data from: https://github.com/nyphilarchive/PerformanceHistory/tree/master/Programs

The schema looks as follows

```
<programs>
   <program>
      <id/> // GUID
      <programID/> // NYP Local ID
      <orchestra/>
      <season/>  
      <concertInfo> // A program can have multiple concerts
         <eventType/>
         <Location/>
         <Venue/>
         <Date/>
         <Time/>
      </concertInfo>
      <worksInfo> // a program will usually have multiple works 
         <work>
             <composerName/>
             <workTitle/>
             <conductorName/>     
             <soloists> // included only if there are listed soloists
                <soloist>
                    <soloistName/>
                    <soloistInstrument/> 
                    <soloistRole/>           
                </soloist>
             </soloists>
             <interval/> // intermissions are listed among works, but with a distinct interval tag
         </work>
      </worksInfo>
   </program>
</programs>
```
            

In [1]:
import lxml
import pandas as pd
import seaborn
%matplotlib inline
import matplotlib.pyplot as plt
seaborn.set(style='dark')

In [2]:
from lxml import etree, objectify

In [3]:
import collections

In [44]:
import glob

In [4]:
def parse_programs(programs):
    
    return [parse_program(x) for x in programs]

def parse_program(program):
    
    dispatch = dict(concertInfo=parse_concertInfo,
                    worksInfo=parse_worksInfo)
    data = dict()
    
    for child in program.getchildren():
        if child.tag in dispatch:
            data[child.tag] = dispatch[child.tag](child)
        else:
            data[child.tag] = child.text
            
    return data
            
def parse_concertInfo(concertInfo):
    data = dict()
    
    for child in concertInfo.getchildren():
        data[child.tag] = child.text
    
    return data

def parse_worksInfo(worksInfo):
    
    data = list()
    
    for child in worksInfo.getchildren():
        data.append(parse_work(child))
            
    return data

def parse_work(work):
    
    dispatch = dict(soloists=parse_soloists)
    data = dict()
    
    for child in work.getchildren():
        if child.tag in dispatch:
            data[child.tag] = dispatch[child.tag](child)
        else:
            data[child.tag] = child.text
            
    return data

def parse_soloists(soloists):
    data = list()
    for child in soloists.getchildren():
        data.append(parse_soloist(child))
    return data

def parse_soloist(soloist):
    data = dict()
    
    for child in soloist.getchildren():
        data[child.tag] = child.text
    
    return data

In [5]:
def flatten(d):
    
    works = d.pop('worksInfo', [])
    concertInfo = d.pop('concertInfo', [])
    
    out = []
    for w in works:
        out.append(concertInfo.copy())
        w.pop('soloists', None)
        out[-1].update(d)
        out[-1].update(w)
        
    return out

In [46]:
def load_programs():
    # We need this to handle badly formatted &'s in strings
    parser = etree.XMLParser(recover=True)

    fd = []

    for xmlfile in sorted(glob.glob('../data/nyphil/Programs/1*.xml')):
        obj = objectify.parse(xmlfile, parser=parser)
        dix = parse_programs(obj.getroot())
        for _ in dix:
            fd.extend(flatten(_))
    df = pd.DataFrame.from_records(fd)
    df['Date'] = pd.to_datetime(df['Date'])
    del df['worksInfo']
    del df['work']
    del df['concertInfo']

    return df

---
# Analysis

In [47]:
df = load_programs()

In [51]:
df.head(5)

Unnamed: 0,Date,Location,Time,Venue,composerName,conductorName,eventType,id,interval,orchestra,program,programID,season,workTitle
0,1842-12-07 05:00:00,"Manhattan, NY",8:00PM,Apollo Rooms,"Beethoven, Ludwig van","Hill, Ureli Corelli",Subscription Season,38e072a7-8fc9-4f9a-8eac-3957905c0002,,New York Philharmonic,,3853,1842-43,"SYMPHONY NO. 5 IN C MINOR, OP.67"
1,1842-12-07 05:00:00,"Manhattan, NY",8:00PM,Apollo Rooms,"Weber, Carl Maria Von","Timm, Henry C.",Subscription Season,38e072a7-8fc9-4f9a-8eac-3957905c0002,,New York Philharmonic,,3853,1842-43,OBERON
2,1842-12-07 05:00:00,"Manhattan, NY",8:00PM,Apollo Rooms,"Hummel, Johann",,Subscription Season,38e072a7-8fc9-4f9a-8eac-3957905c0002,,New York Philharmonic,,3853,1842-43,"QUINTET, PIANO, D MINOR, OP. 74"
3,1842-12-07 05:00:00,"Manhattan, NY",8:00PM,Apollo Rooms,,,Subscription Season,38e072a7-8fc9-4f9a-8eac-3957905c0002,Intermission,New York Philharmonic,,3853,1842-43,
4,1842-12-07 05:00:00,"Manhattan, NY",8:00PM,Apollo Rooms,"Weber, Carl Maria Von","Etienne, Denis G.",Subscription Season,38e072a7-8fc9-4f9a-8eac-3957905c0002,,New York Philharmonic,,3853,1842-43,OBERON


In [52]:
works = df['workTitle'].value_counts()

In [59]:
works.head(20)

MEISTERSINGER, DIE, WWV 96                             655
TANNHAUSER, WWV 70                                     528
GOTTERDAMMERUNG [GÖTTERDÄMMERUNG]                      511
LOHENGRIN                                              415
DAMNATION DE FAUST, LA, OP. 24                         389
WALKURE, DIE, WWV 86B                                  389
SYMPHONY NO. 5 IN C MINOR, OP.67                       383
TRISTAN UND ISOLDE, WWV 90                             370
MIDSUMMER NIGHT'S DREAM, OP. 61                        364
SYMPHONY NO. 5, E MINOR, OP. 64                        318
LEONORE OVERTURE NO. 3, OP. 72B                        299
UNITED STATES                                          298
SYMPHONY NO. 4, F MINOR, OP. 36                        297
SYMPHONY NO. 3 IN E FLAT MAJOR, OP. 55 (EROICA)        295
SYMPHONY NO. 7 IN A MAJOR, OP.92                       286
PORGY AND BESS                                         284
PARSIFAL, WWV 111                                      2

In [54]:
composers = df['composerName'].value_counts()

In [58]:
composers.head(20)

Wagner,  Richard               4256
Beethoven,  Ludwig  van        3518
Tchaikovsky,  Pyotr  Ilyich    2753
Mozart,  Wolfgang  Amadeus     1931
Brahms,  Johannes              1806
Strauss,  Richard              1268
Mendelssohn,  Felix            1175
Berlioz,  Hector               1068
Bach,  Johann  Sebastian       1007
Dvorak,  Antonín                912
Schubert,  Franz                906
Liszt,  Franz                   857
Debussy,  Claude                774
Weber,  Carl  Maria Von         743
Stravinsky,  Igor               741
Schumann,  Robert               717
Strauss,  Johann, II            714
Ravel,  Maurice                 701
Gershwin,  George               681
Traditional,                    659
dtype: int64

In [61]:
conductors = df['conductorName'].value_counts()

In [62]:
conductors.head(20)

Damrosch, Walter           5209
Stransky, Josef            5117
Mehta, Zubin               2755
Hoogstraten, Willem van    1999
Kostelanetz, Andre         1939
Masur, Kurt                1741
Mitropoulos, Dimitri       1538
Mengelberg, Willem         1261
Bernstein, Leonard         1232
Toscanini, Arturo          1006
Smallens, Alexander         984
Rodzinski, Artur            874
Walter, Bruno               848
Boulez, Pierre              789
Leinsdorf, Erich            750
Stokowski, Leopold          725
Schelling, Ernest           704
Slatkin, Leonard            495
Szell, George               487
Bergmann, Carl              484
dtype: int64

In [60]:
df['Date'].describe()

count                   60578
unique                   9344
top       1956-07-12 04:00:00
freq                      847
first     1842-12-07 05:00:00
last      2004-07-30 04:00:00
Name: Date, dtype: object