In [13]:
import sys
import os
import glob
import re 
import pandas as pd
import numpy as np
from lxml import etree
import matplotlib.pyplot as plt
import requests
import time
from collections import Counter

# Get a list of paths to TEI XML files
paths = glob.glob("results/*.xml")

# Output directory for saving CSV files
output_path = "./graphdata/"

# Create the output directory if it doesn't exist.
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(output_path, exist_ok=True)

# Namespace Definition
namespaces = {"tei": "http://www.tei-c.org/ns/1.0"}


def first_or_default(results, default=""):
    """Return the first item of an XPath result list, or *default* if it is empty."""
    return results[0] if results else default


def extract_people(root):
    """Build one ``[Id, Label, Sex, Trait]`` row per entry of the first tei:listPerson.

    Only element children are visited (comments are ignored), and children
    that carry no ``xml:id`` are skipped because no node id can be built for
    them. A document without any tei:listPerson yields an empty list.
    Missing ``sex``, ``persName`` text or ``trait/desc`` text become "".
    """
    rows = []
    for person in root.xpath("(.//tei:listPerson)[1]/*", namespaces=namespaces):
        xml_id = first_or_default(
            person.xpath("./@xml:id", namespaces=namespaces), None
        )
        if xml_id is None:
            continue
        sex = first_or_default(person.xpath("./@sex"))
        name = first_or_default(
            person.xpath("./tei:persName//text()", namespaces=namespaces)
        )
        trait = first_or_default(
            person.xpath("./tei:trait/tei:desc/text()", namespaces=namespaces)
        )
        # The "#" prefix makes the id match the pointer form used in sp/@who.
        rows.append(["#" + xml_id, name, sex, trait])
    return rows


def count_speakers(whos):
    """Count how often each character reference occurs in a list of ``@who`` values.

    A single ``@who`` may reference several characters separated by
    whitespace. Splitting on any whitespace run (instead of a single space)
    prevents empty-string pseudo-characters when the attribute contains
    repeated, leading or trailing blanks.
    """
    return dict(Counter(token for who in whos for token in who.split()))


def extract_scene_counts(root):
    """Return ``(scene_labels, per_scene_counts)`` for every scene div.

    ``scene_labels`` holds each scene's ``@n`` value; a scene without ``@n``
    is labelled with its 1-based position among the scene divs so that one
    unlabeled scene does not abort the run. ``per_scene_counts`` holds, in
    the same order, the speaker counts produced by ``count_speakers``.
    """
    labels = []
    counts = []
    scenes = root.xpath('//tei:div[@type="scene"]', namespaces=namespaces)
    for position, scene in enumerate(scenes, start=1):
        labels.append(scene.get("n", str(position)))
        whos = scene.xpath(".//tei:sp/@who", namespaces=namespaces)
        counts.append(count_speakers(whos))
    return labels, counts


def cooccurrence_edges(coocu_df):
    """Flatten a square co-occurrence matrix into a Source/Target/Weight table.

    Every ordered pair of distinct characters produces one row, so each
    undirected pair appears twice (A->B and B->A) and pairs that never share
    a scene are kept with weight 0.
    """
    characters = coocu_df.columns.tolist()
    rows = [
        [source, target, coocu_df.loc[source, target]]
        for source in characters
        for target in characters
        if source != target
    ]
    return pd.DataFrame(rows, columns=["Source", "Target", "Weight"])


# Iterate over each TEI XML file
for path in paths:
    # Extract file name without extension
    file_name = os.path.splitext(os.path.basename(path))[0]
    print(file_name)

    # Create a directory for each file if it doesn't exist
    play_dir = os.path.join(output_path, file_name)
    os.makedirs(play_dir, exist_ok=True)
    # Every CSV of this file is named "<file_name>_<suffix>.csv" in play_dir.
    csv_prefix = os.path.join(play_dir, file_name)

    # XML Parsing
    root = etree.parse(path).getroot()

    # Metadata Extraction
    people_lt = extract_people(root)

    # Convert metadata to DataFrame and save as CSV
    metadata = pd.DataFrame(people_lt, columns=["Id", "Label", "Sex", "Trait"])
    metadata.to_csv(csv_prefix + "_nodes.csv", sep=",", index=False)

    # Scene Extraction: scene labels and per-scene speaker counts
    name_scenes, people_total = extract_scene_counts(root)

    # Convert character occurrences to DataFrame (rows: characters,
    # columns: scenes) and save as CSV
    # NOTE(review): index=False drops the row labels (character ids) from this
    # file and the two matrices below; kept as-is so the CSV layout does not
    # change for existing consumers - confirm whether that is intended.
    characters_scenes_df = pd.DataFrame(people_total, index=name_scenes).T
    characters_scenes_df.to_csv(csv_prefix + "_characters_scenes.csv", sep=",", index=False)

    # Presence matrix: 1 where the character speaks in the scene, else 0.
    # A count exists only when the character spoke, so "not NaN" is presence.
    characters_scenes_bool_df = characters_scenes_df.notna().astype(int)
    characters_scenes_bool_df.to_csv(csv_prefix + "_characters_scenes_bool.csv", sep=",", index=False)

    # Character Co-occurrence Matrix: number of scenes shared by each pair
    characters_scenes_coocu_df = characters_scenes_bool_df.dot(characters_scenes_bool_df.T)
    characters_scenes_coocu_df.to_csv(csv_prefix + "_characters_scenes_coocu.csv", sep=",", index=False)

    # Edge Generation, then save edge data as CSV
    edges = cooccurrence_edges(characters_scenes_coocu_df)
    edges.to_csv(csv_prefix + "_edges.csv", sep=",", index=False)


bien-vengas-mal-si-vienes-solo
el-cordero-de-isaias
la-gran-cenobia
el-diablo-mudo-segunda-version
primero-soy-yo
basta-callar
los-empenos-de-un-acaso
triunfar-muriendo
el-pintor-de-su-deshonra-auto
la-desdicha-de-la-voz
el-veneno-y-la-triaca
el-mayor-monstruo-del-mundo
la-devocion-de-la-misa
la-redencion-de-cautivos
loa-para-psiquis-y-cupido
ni-amor-se-libra-de-amor
el-astrologo-fingido
el-primer-refugio-del-hombre-loa
el-dia-mayor-de-los-dias
el-verdadero-dios-pan
suenos-hay-que-verdad-son
el-jardin-de-falerina
el-primer-refugio-del-hombre-y-probatica-piscina
el-postrer-duelo-de-espana
la-vina-del-senor-1996
no-hay-mas-fortuna-que-dios
para-vencer-a-amor-querer-vencerle
el-ano-santo-de-roma
amar-y-ser-amado-y-divina-filotea
el-divino-orfeo
eco-y-narciso
la-senora-y-la-criada
el-segundo-blason-del-austria
la-exaltacion-de-la-cruz
el-monstruo-de-los-jardines
loa-a-fieras-afemina-amor
no-hay-burlas-con-el-amor
gustos-y-disgustos-son-no-mas-que-imaginacion
el-sitio-de-breda
la-hidalga-de