# convert xml to panda
I mounted my own google drive and use the xml files from there, but they are in the same format as the xml-audio folder in this drive. 

In [2]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!ls "/content/drive/My Drive/Team 6/xml-audio"

Bdb001.A.dialogue-acts.xml  Bmr024.E.dialogue-acts.xml
Bdb001.B.dialogue-acts.xml  Bmr024.F.dialogue-acts.xml
Bdb001.C.dialogue-acts.xml  Bmr024.G.dialogue-acts.xml
Bdb001.D.dialogue-acts.xml  Bmr024.H.dialogue-acts.xml
Bdb001.E.dialogue-acts.xml  Bmr024.I.dialogue-acts.xml
Bdb001.F.dialogue-acts.xml  Bmr025.A.dialogue-acts.xml
Bed002.A.dialogue-acts.xml  Bmr025.B.dialogue-acts.xml
Bed002.B.dialogue-acts.xml  Bmr025.C.dialogue-acts.xml
Bed002.C.dialogue-acts.xml  Bmr025.D.dialogue-acts.xml
Bed002.D.dialogue-acts.xml  Bmr025.E.dialogue-acts.xml
Bed002.E.dialogue-acts.xml  Bmr025.F.dialogue-acts.xml
Bed002.F.dialogue-acts.xml  Bmr025.G.dialogue-acts.xml
Bed003.A.dialogue-acts.xml  Bmr025.H.dialogue-acts.xml
Bed003.B.dialogue-acts.xml  Bmr026.A.dialogue-acts.xml
Bed003.C.dialogue-acts.xml  Bmr026.B.dialogue-acts.xml
Bed003.D.dialogue-acts.xml  Bmr026.C.dialogue-acts.xml
Bed004.A.dialogue-acts.xml  Bmr026.D.dialogue-acts.xml
Bed004.B.dialogue-acts.xml  Bmr026.E.dialogue-acts.xml
Bed004.C.d

In [4]:
import pandas as pd
import xml.etree.ElementTree as et
import os
from pathlib import Path
import glob

In [5]:
with os.scandir('/content/drive/My Drive/Team 6/xml-audio') as entries:
    for entry in entries:
        print(entry.name)

Bmr023.D.dialogue-acts.xml
Bmr030.D.dialogue-acts.xml
Bmr030.E.dialogue-acts.xml
Bed010.E.dialogue-acts.xml
Bmr005.C.dialogue-acts.xml
Bmr019.E.dialogue-acts.xml
Bmr016.E.dialogue-acts.xml
Bro012.C.dialogue-acts.xml
Bed016.D.dialogue-acts.xml
Bmr021.E.dialogue-acts.xml
Bmr030.B.dialogue-acts.xml
Bro022.B.dialogue-acts.xml
Bmr016.C.dialogue-acts.xml
Bro018.A.dialogue-acts.xml
Bmr025.A.dialogue-acts.xml
Bro024.B.dialogue-acts.xml
Bed008.B.dialogue-acts.xml
Bsr001.E.dialogue-acts.xml
Bed009.B.dialogue-acts.xml
Bmr025.D.dialogue-acts.xml
Bro017.E.dialogue-acts.xml
Bro023.E.dialogue-acts.xml
Bro019.C.dialogue-acts.xml
Bmr024.I.dialogue-acts.xml
Bro017.D.dialogue-acts.xml
Bed012.A.dialogue-acts.xml
Bro007.A.dialogue-acts.xml
Btr002.F.dialogue-acts.xml
Bed014.A.dialogue-acts.xml
Bro010.F.dialogue-acts.xml
Bmr011.F.dialogue-acts.xml
Bmr030.C.dialogue-acts.xml
Bed012.C.dialogue-acts.xml
Bed017.E.dialogue-acts.xml
Bro011.D.dialogue-acts.xml
Bns001.D.dialogue-acts.xml
Bed009.E.dialogue-acts.xml
B

In [6]:
#to make sure colab reads the correct files
print(glob.glob("/content/drive/My Drive/xml-audio/*.xml"))

[]


In [7]:
#def getvalueofnode(node):
    #""" return node text or None """
    #return node.text if node is not None else None

def main():
    """ main """
    filenames =  glob.glob("/content/drive/My Drive/Team 6/xml-audio/Bdb001.A.dialogue-acts.xml") 
    filename = [] 
    li = []
    for filename in filenames :
         
         global left
         parsed_xml = et.parse(filename)
         #I hard-coded here but i think it would be easier to modify the panda dataframe later
         dfcols = ['Id', 'st_time', 'ed_time', 'type', 'adjacency', 'original-type', 'channel', 'participant']
         left = pd.DataFrame(columns=dfcols)

         root = parsed_xml.getroot()
         
        
         for diaAct in parsed_xml.findall('./dialogueact'):
             uId = diaAct.get('{http://nite.sourceforge.net/}id')
             sT = diaAct.get("starttime")
             eT = diaAct.get("endtime")
             tP = diaAct.get("type")
             aJ = diaAct.get("adjacency")
             oT = diaAct.get("original-type")
             ch = diaAct.get("channel")
             par = diaAct.get("participant")

             
             left = left.append(pd.Series([uId, sT, eT, tP, aJ, oT, ch, par], index=dfcols),ignore_index=True)
             li.append(left)
             
    left = pd.concat(li, axis=0, ignore_index=True)

    return left

left = main()

left.head(5)

Unnamed: 0,Id,st_time,ed_time,type,adjacency,original-type,channel,participant
0,Bdb001.A.dialogueact74,164.014,165.974,s^bk|s,5b.6a,s^bk|s,c1,mn017
1,Bdb001.A.dialogueact74,164.014,165.974,s^bk|s,5b.6a,s^bk|s,c1,mn017
2,Bdb001.A.dialogueact78,165.974,166.184,s^bk,5b+,s^bk,c1,mn017
3,Bdb001.A.dialogueact74,164.014,165.974,s^bk|s,5b.6a,s^bk|s,c1,mn017
4,Bdb001.A.dialogueact78,165.974,166.184,s^bk,5b+,s^bk,c1,mn017


## Preprocessing

In [8]:
df = left #.loc[1:500]
df = df.drop_duplicates(keep='first')

# Add the 'Interruption' (bool) variable
df['Interruption'] = df['type'].str.contains('%-', regex = False)
df['Interruption'] = df['Interruption'].astype(bool)

# View what types are counted as Interruptions 
# print(df.loc[df['Interruption'], 'type'])

27      s^bk.%-
65        s.%--
90           %-
104          %-
170       s.%--
189        s.%-
252          %-
299    s^cs.%--
495       s.%--
Name: type, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [11]:
# Add segment length variable
df.loc[:, 'st_time'] = pd.to_numeric(df.loc[:, 'st_time'])
df.loc[:, 'ed_time'] = pd.to_numeric(df.loc[:, 'ed_time'])
df.loc[:, 'length'] = df.ed_time - df.st_time

segment_length = df.length.max()

In [12]:
import numpy as np

# Dummy segments dataframe
segments_df = pd.DataFrame(np.zeros((100, 3)))
segments_df.columns = ['id', 'st_time', 'ed_time']

def getLabel(segments, interruptions):
    counts = np.empty(segments.shape[0])

    for seg_index, seg_row in segments.iterrows():
      for inter_index, inter_row in interruptions.iterrows():
        if seg_row['st_time'] < inter_row['st_time'] and seg_row['ed_time'] > inter_row['ed_time']:
          counts[seg_index] += 1
        else:
          counts[seg_index] = 0

    # label as True if there's at least one entire interruption in the segment
    labels = counts > 0 
    return labels

labels = getLabel(segments_df, df)