# MEETUPS IDENTIFICATION AFTER COREFERENCE

The coreference process has already been completed.

ANNOTATIONS:
    
    HM = historical meetup
    HT = historical trace
    N = not a meetup
    
INPUT:
    
    Coreference:
        People and places stored in meetupsFastCorefOutputPP/
    Time in extractedTimeExpressions/
    Theme in extractedMeetupTypes/


In [10]:
import os
import pandas as pd
from operator import countOf
import numpy as np
import json
from _datetime import date
from datetime import datetime
import time
from pandas import json_normalize 
import sys
from timeit import default_timer as timer
# tracing memory
import tracemalloc

# wikidata api 
from SPARQLWrapper import SPARQLWrapper, JSON
# from qwikidata.sparql import return_sparql_query_results

# For DBpedia spotlight, PPE entities
import requests
# import pycurl
from urllib.request import urlopen
from urllib.parse import quote
import gc
pd.options.mode.copy_on_write = True

In [11]:
# Query the DBpedia SPARQL endpoint to look up entities
def executeQueryDbpedia(q, f='application/json', timeout=60):
    """Send a SPARQL query to the DBpedia endpoint and return the raw response.

    Parameters
    ----------
    q : str
        SPARQL query text; sent as the ``query`` URL parameter.
    f : str
        Value of the ``Accept`` request header (defaults to JSON).
    timeout : float, tuple or None
        Passed to ``requests.get``. Without a timeout ``requests`` waits
        indefinitely, so a stalled endpoint would block the whole batch.
        Pass ``None`` to get the old wait-forever behaviour.

    Returns
    -------
    requests.Response
        The response object as returned by ``requests.get``; the HTTP status
        code is not inspected here.

    Raises
    ------
    Exception
        Whatever ``requests.get`` raises (including timeouts) is printed to
        stdout and re-raised unchanged.
    """
    epr = "http://dbpedia.org/sparql"
    try:
        return requests.get(
            epr,
            params={'query': q},
            headers={'Accept': f},
            timeout=timeout,
        )
    except Exception as e:
        print(e, file=sys.stdout)
        raise

def queryBiographyInformationByDBpediaID(id_number):
    """Fetch biography metadata from DBpedia for a ``dbo:wikiPageID``.

    The query asks for the resource (``?s``), its English label, thumbnail,
    birth and death dates, a Wikidata ``owl:sameAs`` link and the English
    abstract. Only the first result row is used.

    Parameters
    ----------
    id_number : str or int
        The wiki page ID; it is converted with ``str()`` before being
        embedded in the query text.

    Returns
    -------
    (list, list)
        ``variables`` is the list of variable names reported by the endpoint
        and ``values`` the matching values from the first row. A variable
        missing from that row yields ``np.nan``, and so does a birth/death
        date that is not in ``YYYY-MM-DD`` form. Both lists are empty when
        the query returns no rows (a "No entity" message is printed).
    """
    values = []
    variables = []
    # SELECT * WHERE {
    # VALUES ?id { "10085"^^xsd:integer }
    # OPTIONAL { ?s dbo:wikiPageID ?id . ?s rdfs:label ?label .  FILTER langMatches( lang(?label), "EN" )  } .
    #  OPTIONAL { ?s dbo:thumbnail ?image } .
    # OPTIONAL { ?s dbp:birthDate ?dob } .
    # OPTIONAL { ?s dbp:deathDate ?dod } .
    # OPTIONAL { ?s dbo:wikiPageID ?wikiId } .
    # OPTIONAL { ?s dbo:abstract ?abstract .  FILTER langMatches( lang(?abstract), "EN" ) } .
    # }
    query_text = """SELECT * WHERE { VALUES ?id { '"""+str(id_number)+"""'^^xsd:integer } OPTIONAL { ?s dbo:wikiPageID ?id . ?s rdfs:label ?label .  FILTER langMatches( lang(?label), "EN" )  } . 
    OPTIONAL { ?s dbo:thumbnail ?image } . OPTIONAL { ?s dbp:birthDate ?birthdate } . OPTIONAL { ?s dbp:deathDate ?deathdate } . 
    OPTIONAL { ?s owl:sameAs ?wikidataId . FILTER regex (?wikidataId, "http://www.wikidata.org/entity/" ) }
    OPTIONAL { ?s dbo:abstract ?abstract .  FILTER langMatches( lang(?abstract), "EN" ) } . }"""
    results = executeQueryDbpedia(query_text).json()
    bindings = results['results']['bindings']
    if len(bindings) == 0:
        print("No entity", id_number)
        return variables, values

    # Only the first result row is considered.
    first_row = bindings[0]
    variables = list(results['head']['vars'])
    for label in variables:
        if label not in first_row:
            # np.nan rather than np.NaN: the upper-case alias was removed in NumPy 2.0.
            values.append(np.nan)
            continue
        value = first_row[label]['value']
        # Dates are kept only when they are full YYYY-MM-DD strings.
        if label in ('birthdate', 'deathdate') and not evaluateDateFormat(value):
            values.append(np.nan)
        else:
            values.append(value)
    return variables, values
   
def setWikidateParams(query):
    """Run ``query`` against the Wikidata SPARQL endpoint.

    A ``SPARQLWrapper`` client is created with a project-specific user agent
    (suffixed with the running Python major.minor version), the return format
    is set to JSON, and the converted query result is returned.
    """
    python_version = (sys.version_info[0], sys.version_info[1])
    agent_string = "Polifonia-MEETUPSPilot0.1 (https://polifonia-project.eu/pilots/meetups/; alba.morales-tirado@open.ac.uk) /%s.%s" % python_version
    # TODO adjust user agent; see https://w.wiki/CX6
    client = SPARQLWrapper("https://query.wikidata.org/sparql", agent=agent_string)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()

def queryWikidataEndpoint(uri):
    """Fetch birth date (P569) and optional death date (P570) for a Wikidata entity.

    Parameters
    ----------
    uri : str
        Full entity URI, e.g. ``http://www.wikidata.org/entity/Q212692``; it is
        embedded verbatim between ``<`` and ``>`` in the query.

    Returns
    -------
    (list, list)
        ``variables`` is the list of variable names reported by the endpoint
        and ``values`` the matching values from the first result row, each
        truncated at the first ``"T"`` (so an ISO timestamp keeps only its
        date part). A variable missing from the row yields ``np.nan``. Both
        lists are empty when the query returns no rows (a "No entity" message
        is printed).
    """
    values = []
    variables = []
    # """SELECT * WHERE { <http://www.wikidata.org/entity/Q212692> wdt:P569 ?birthdate ; 
    # OPTIONAL { <http://www.wikidata.org/entity/Q212692> wdt:P570 ?deathdate } . }"""
    sparql_query = """SELECT * WHERE { <"""+uri+"""> wdt:P569 ?birthdate ; 
    OPTIONAL { <"""+uri+"""> wdt:P570 ?deathdate } . }"""
    results = setWikidateParams(sparql_query)

    bindings = results['results']['bindings']
    if len(bindings) == 0:
        print("No entity", uri)
        return variables, values

    # Only the first result row is considered.
    first_row = bindings[0]
    variables = list(results['head']['vars'])
    for label in variables:
        if label in first_row:
            # Keep the part before "T", i.e. drop the time of day from a timestamp.
            values.append(first_row[label]['value'].split("T", 1)[0])
        else:
            # np.nan rather than np.NaN: the upper-case alias was removed in NumPy 2.0.
            values.append(np.nan)
    return variables, values

def evaluateDateFormat(dateValue):
    """Return True when ``dateValue`` parses as a ``%Y-%m-%d`` date, else False.

    Only ``ValueError`` from the parser is treated as "does not match"; any
    other exception propagates to the caller.
    """
    try:
        datetime.strptime(dateValue, "%Y-%m-%d")
    except ValueError:
        return False
    return True

In [12]:
# True = both sentences have the same entities
# False = the sentences do not have the same entities
def aSameAsb(entType, aSent_indx, bSent_indx, ent_df):
    """Tell whether sentences A and B mention matching entities of ``entType``.

    Both sentences must first satisfy ``sentenceHas(entType, ...)``; if either
    does not, the result is False. The matching rule then depends on the type:

    * ``"person"``: rows without a URI are dropped; both sentences must keep
      the same number of rows and their URIs must be equal position by
      position (the comparison is order-sensitive).
    * ``"place"``: at least one URI is shared by the two sentences.
    * ``"time"``: at least one ``(normalised_value_start,
      normalised_value_end)`` pair is shared among rows that have a start
      value. Failing that, rows without a start value are compared, this
      time also on ``time_expression``.

    Parameters
    ----------
    entType : str
        ``"person"``, ``"place"`` or ``"time"``.
    aSent_indx, bSent_indx : int-like
        Sentence indexes to compare; converted with ``int()``.
    ent_df : pandas.DataFrame
        For person/place: needs ``entType``, ``sentenceIndex`` and ``URI``.
        For time: needs ``sentenceIndex``, ``normalised_value_start``,
        ``normalised_value_end`` and (for the fallback) ``time_expression``.

    Returns
    -------
    bool
        True when the sentences match under the rule above, otherwise False
        (including for an unrecognised ``entType``).
    """
    if not (sentenceHas(entType, aSent_indx, ent_df, '') and sentenceHas(entType, bSent_indx, ent_df, '')):
        return False

    if entType == "person":
        aEnts_df = ent_df.query("entType == '{}' & sentenceIndex == {}".format(entType, int(aSent_indx)))
        aEnts_df = aEnts_df[~aEnts_df['URI'].isna()]
        bEnts_df = ent_df.query("entType == '{}' & sentenceIndex == {}".format(entType, int(bSent_indx)))
        bEnts_df = bEnts_df[~bEnts_df['URI'].isna()]
        # Different row counts can never be an exact match.
        if aEnts_df.shape != bEnts_df.shape:
            return False
        # Position-by-position URI comparison.
        comparison_matrix = aEnts_df[['URI']].values == bEnts_df[['URI']].values
        return bool(comparison_matrix.all())

    if entType == "place":
        aEnts_df = ent_df.query("entType == '{}' & sentenceIndex == {}".format(entType, int(aSent_indx)))
        bEnts_df = ent_df.query("entType == '{}' & sentenceIndex == {}".format(entType, int(bSent_indx)))
        # One shared place URI is enough.
        return have_common_rows(aEnts_df[['URI']], bEnts_df[['URI']])

    if entType == "time":
        aEnts_df = ent_df.query("sentenceIndex == {}".format(int(aSent_indx)))
        bEnts_df = ent_df.query("sentenceIndex == {}".format(int(bSent_indx)))
        span_cols = ['normalised_value_start', 'normalised_value_end']
        aNorm_df = aEnts_df[~aEnts_df['normalised_value_start'].isna()][span_cols]
        bNorm_df = bEnts_df[~bEnts_df['normalised_value_start'].isna()][span_cols]
        # First try: one shared normalised (start, end) pair.
        if have_common_rows(aNorm_df, bNorm_df):
            return True
        # Fallback: compare the expressions that could not be normalised.
        # The previous guard used len(pd.isnull(...)) > 0, which counts rows
        # rather than nulls and was therefore always true; a match is only
        # possible when BOTH sentences have such rows.
        aRaw_df = aEnts_df[aEnts_df['normalised_value_start'].isna()]
        bRaw_df = bEnts_df[bEnts_df['normalised_value_start'].isna()]
        if aRaw_df.empty or bRaw_df.empty:
            return False
        raw_cols = span_cols + ['time_expression']
        return have_common_rows(aRaw_df[raw_cols], bRaw_df[raw_cols])

    # Unrecognised entity type: nothing to compare.
    return False
    
def have_common_rows(df1, df2):
    """Return True when ``df1`` and ``df2`` share at least one row.

    The frames are inner-merged on their common columns; any surviving row
    counts as a shared row.
    """
    shared = pd.merge(df1, df2, how='inner')
    return len(shared.index) != 0

# True = sentence has entities
# False = sentence does not have an entity
def sentenceHas(entityType, index, df_ent, entIRI):
    """Check whether sentence ``index`` has enough entities of ``entityType``.

    * ``"time"``: at least one row of ``df_ent`` for that sentence (the
      ``entType`` column is not consulted).
    * ``"place"``: at least one row with ``entType == "place"``.
    * ``"person"``: at least two rows with ``entType == "person"``.
    * ``"OnePerson"``: at least one row with ``entType == "person"``.

    ``entIRI`` is accepted but not used. Any other ``entityType`` falls
    through and returns None.
    """
    if entityType == "time":
        rows = df_ent[df_ent['sentenceIndex'] == int(index)]
        return len(rows) > 0

    # (requested type, value looked up in the entType column, minimum row count)
    rules = (
        ("place", "place", 1),
        ("person", "person", 2),
        ("OnePerson", "person", 1),
    )
    for requested, stored_type, minimum in rules:
        if entityType == requested:
            in_sentence = df_ent['sentenceIndex'] == int(index)
            of_type = df_ent['entType'] == stored_type
            return len(df_ent[in_sentence & of_type]) >= minimum
    return None
        
def createDfFromList(data_list):
    """Turn annotation triples into a DataFrame.

    Each item of ``data_list`` is ``[indexes, paragraphIndex, annotation]``.
    ``indexes[0]`` goes to column ``after`` (None when ``indexes`` is empty)
    and ``indexes[1]`` to column ``before`` (None when there is no second
    element). Column order is after, before, paragraphIndex, annotation.
    """
    after_values = []
    before_values = []
    paragraph_values = []
    annotation_values = []
    for item in data_list:
        indexes = item[0]
        after_values.append(indexes[0] if indexes else None)
        before_values.append(indexes[1] if len(indexes) > 1 else None)
        paragraph_values.append(item[1])
        annotation_values.append(item[2])
    return pd.DataFrame({
        'after': after_values,
        'before': before_values,
        'paragraphIndex': paragraph_values,
        'annotation': annotation_values,
    })

In [None]:
# for chunk in pd.read_csv('totalBiographiesBenchmark.csv', chunksize=50):
# for chunk in pd.read_csv('list_wikiIdSample.csv', chunksize=10):
# Annotate every biography listed in toAnnotatePart6.csv.
# For each file the time expressions, the people/place entities and the indexed
# sentences are loaded; each sentence (A) is then compared with the sentence
# whose index is one lower (B) and labelled HM, HT or N. The result is written
# to meetupsAnnotations/<file_name> with columns before, after, paragraphIndex,
# annotation. Files that already exist in meetupsAnnotations/ are skipped.
for chunk in pd.read_csv('toAnnotatePart6.csv', chunksize=10):
    df_file_name = pd.DataFrame()
    df_file_name['file_name'] = chunk['file_name']
    # for each file
    # entities_df is only referenced by the commented-out IRI lookup further down
    entities_df = pd.DataFrame()
    for file_name_item in df_file_name.itertuples():
        print("FILE: ",file_name_item.file_name) # FILE name using the DBPedia ID
        if os.path.isfile('meetupsAnnotations/'+str(file_name_item.file_name)):
            print("Already exists. Next.")
            continue
        tracemalloc.start()
        # READ Files
        # startRead = timer()
        try:
            # read time 
            time_df = pd.read_csv('extractedTimeExpressions/'+file_name_item.file_name, usecols=['time_expression','sentenceIndex','paragraphIndex','normalised_value_start','normalised_value_end'])
        except pd.errors.EmptyDataError:    
            print("EmptyDataError: Time expression. No columns to parse from file")
            # What happens when there is no TIME? We could still have people, places and themes, therefore an HT. Shall we continue?
            # For now processing continues with an empty time table.
            time_df = pd.DataFrame(columns=['time_expression','sentenceIndex','paragraphIndex','normalised_value_start','normalised_value_end'])
        except FileNotFoundError:
            print("FileNotFoundError: No time.")
            time_df = pd.DataFrame(columns=['time_expression','sentenceIndex','paragraphIndex','normalised_value_start','normalised_value_end'])
        try:
            # read people and places
            pp_df = pd.read_csv('meetupsFastCorefOutputPP/'+file_name_item.file_name, usecols=['URI','entity','offset','similarityScore','sentenceIndex','paragraphIndex','entType'])
        except pd.errors.EmptyDataError:
            print("People and places not found. Continue to next biography")
            print("EmptyDataError: PP. No columns to parse from file.")
            # Despite the message above, processing goes on with an empty table
            # (the `continue` below is commented out).
            pp_df = pd.DataFrame(columns=['URI','entity','offset','similarityScore','sentenceIndex','paragraphIndex','entType'])
            # continue
        except FileNotFoundError:
            print("FileNotFoundError: People and places not found. Continue to next biography")
            # NOTE(review): this skips the tracemalloc.stop() at the end of the loop body.
            continue
        # The process is sentence by sentence. Read indexedSentences/
        # NOTE(review): unlike the two reads above, this one is not guarded by try/except.
        sentences_df = pd.read_csv('indexedSentences/'+file_name_item.file_name, usecols=['sentenceIndex','paragraphIndex'])
        # endRead = timer()
        # print("Reading time: ", str(endRead - startRead))
#         # Obtain the IRI
#         labels,values = queryBiographyInformationByDBpediaID(file_name_item.file_name.replace(".csv",""))
#         if len(values) > 0:
#             row_dict = dict(zip(labels, values))
#             datarow = pd.Series(data=row_dict)
#             entities_df = pd.concat([entities_df,datarow.to_frame().T],ignore_index=True)
#             entities_df.rename(columns={'s': 'URI','label':'entity'}, inplace=True)
            
#             # IRI is important to identify the subject in each MEETUP
#             subject_IRI = entities_df.iloc[0]['URI']
            
        startFile = timer()
        bioAnnotationsList = []
        # every time a new paragraph starts, the "before" sentence is NA
        paragraph_indx_list = sorted(sentences_df.paragraphIndex.unique())
        print("Paragraphs to process",len(paragraph_indx_list))
        for paragraph_indx in paragraph_indx_list:
            # print()
            # print("Paragraph: ", paragraph_indx)
            # startParag = timer()
            # Restrict both entity tables to the current paragraph, so every
            # sentenceHas/aSameAsb call below only sees this paragraph's rows.
            time_prg_df = time_df[time_df['paragraphIndex'] == int(paragraph_indx)]
            pp_prg_df = pp_df[pp_df['paragraphIndex'] == int(paragraph_indx)]
            # if there are no entities in the given paragraph
            if len(pp_prg_df)==0:
                # print("No person or place entities in given paragraph. Continue to next paragraph")
                continue
            sentences_parag_df = sentences_df[sentences_df['paragraphIndex'] == int(paragraph_indx)]
            sentences_indx_list = sorted(sentences_parag_df.sentenceIndex.unique())
            # annotationsList each item is a list [[indexes],paragraph_indx,annotation]
            # indexes can be [aSent_indx] or [aSent_indx,bSent_indx]
            # annotation can take HM, HT, N values
            annotationsList = []
            for sent_indx in sentences_indx_list:
                # startSent = timer()
                # A is the current sentence, B the one with the index just below it.
                aSent_indx = sent_indx
                bSent_indx = sent_indx-1
                # print("Sentences. After: ", aSent_indx, ". Before: ",bSent_indx)
                # aHasB* flags record which missing dimension A borrows from B.
                aHasBtime = False
                aHasBplace = False
                aHasBperson = False
                joinAandB = False
                # by default N, meaning DISCARD sentence.
                annotation = "N"
                
                # A has people (sentenceHas "person" requires at least two) plus a place or a time -> HT
                if sentenceHas("person",aSent_indx, pp_prg_df,'') and (sentenceHas("place",aSent_indx, pp_prg_df,'')  or sentenceHas("time",aSent_indx, time_prg_df,'')):
                    annotation = "HT"
                    joinAandB = False
                    # print("annotation",annotation)
                # if !A.time & A.place == B.place -> A.time = B.time
                # print("if !A.time & A.place == B.place -> A.time = B.time")
                if not sentenceHas("time",aSent_indx, time_prg_df,'') and (aSameAsb("place",aSent_indx,bSent_indx,pp_prg_df)):
                    # THEN A.time = B.time
                    if sentenceHas("time",bSent_indx, time_prg_df,''):
                        aHasBtime = True
                        joinAandB = True
                        annotation = "HT"
                        # print("aHasBtime: TRUE. joinAandB: TRUE")
                # print("if !A.place & A.time == B.time -> A.place = B.place")
                # if !A.place & A.time == B.time -> A.place = B.place
                if not sentenceHas("place",aSent_indx, pp_prg_df,'') and (aHasBtime or aSameAsb("time",aSent_indx,bSent_indx,time_prg_df)):
                    # THEN A.place = B.place
                    if sentenceHas("place",bSent_indx, pp_prg_df,''):
                        aHasBplace = True
                        annotation = "HT"
                        joinAandB = True
                        # print("aHasBplace: TRUE. joinAandB: TRUE. Annotation: ",annotation)
                # print("if !A.person & A.time == B.time & A.place == B.place -> A.person = B.person")
                # if !A.person & A.time == B.time & A.place == B.place -> A.person = B.person
                # NOTE(review): unlike the time and place rules, this one does not check that B has people.
                if not sentenceHas("person",aSent_indx, pp_prg_df,'') and (aHasBtime or aSameAsb("time",aSent_indx,bSent_indx,time_prg_df)) and (aHasBplace or aSameAsb("place",aSent_indx,bSent_indx,pp_prg_df)):
                    # THEN A.person = B.person
                    aHasBperson = True
                    annotation = "HT"
                    joinAandB = True
                    # print("aHasBperson: TRUE. joinAandB: TRUE. Annotation: ",annotation)
                # print("if !A.time & !A.place & A.person == B.person -> A.time = B.time & A.place = B.place")
                # if !A.time & !A.place & A.person == B.person -> A.time = B.time & A.place = B.place
                if not sentenceHas("time",aSent_indx, time_prg_df,'') and not sentenceHas("place",aSent_indx, pp_prg_df,'') and aSameAsb("person",aSent_indx,bSent_indx,pp_prg_df):
                    # THEN A.time = B.time & A.place = B.place
                    if (sentenceHas("time",bSent_indx, time_prg_df,'')) and (sentenceHas("place",bSent_indx, pp_prg_df,'')): 
                        aHasBtime = True
                        aHasBplace = True
                        joinAandB = True
                        annotation = "HT"
                        # print("aHasBtime: TRUE. aHasBplace: TRUE. joinAandB: TRUE. Annotation: ",annotation)
                # print("if A.time & A.place & A.person -> A.meetup")
                # if A.time & A.place & A.person -> A.meetup
                # (each dimension may be A's own or one borrowed from B above)
                if (sentenceHas("time",aSent_indx, time_prg_df,'') or aHasBtime) and (sentenceHas("place",aSent_indx, pp_prg_df,'') or aHasBplace) and (sentenceHas("person",aSent_indx, pp_prg_df,'') or aHasBperson):
                    annotation = "HM"
                    # joinAandB = False
                    # print("TRUE -> A is a MEETUPS")
                    # if aHasBtime or aHasBplace or aHasBperson:
                    #     joinAandB = True
                    # gather the meetups
                    # B+A 
                    if aHasBtime or aHasBplace or aHasBperson:
                        joinAandB = True
                        # print("joinAandB: ", joinAandB)
                    else:
                        # if A.time = B.time & A.place = B.place & A.person = B.person -> B+A 
                        # print("if A.time = B.time & A.place = B.place & A.person = B.person -> B+A ")
                        if (aSameAsb("time",aSent_indx,bSent_indx,time_prg_df) or aHasBtime) and (aSameAsb("place",aSent_indx,bSent_indx,pp_prg_df) or aHasBplace) and (aSameAsb("person",aSent_indx,bSent_indx,pp_prg_df) or aHasBperson):
                            # add B + A
                            joinAandB = True
                            # print("joinAandB: TRUE ", annotation)
                            
                        # if (A.time = B.time & !B.person & !B.place) OR (A.person = B.person & !B.time & !B.place) OR (A.place = B.place & !B.person & !B.time)
                        # print("if (A.time = B.time & !B.person & !B.place) OR (A.person = B.person & !B.time & !B.place) OR (A.place = B.place & !B.person & !B.time)")
                        if (aSameAsb("time",aSent_indx,bSent_indx,time_prg_df) and not sentenceHas("person",bSent_indx, pp_prg_df,'') and not sentenceHas("place",bSent_indx, pp_prg_df,'')) or (aSameAsb("person",aSent_indx,bSent_indx,pp_prg_df) and not sentenceHas("time",bSent_indx, time_prg_df,'') and not sentenceHas("place",bSent_indx, pp_prg_df,'')) or (aSameAsb("place",aSent_indx,bSent_indx,pp_prg_df) and not sentenceHas("person",bSent_indx, pp_prg_df,'') and not sentenceHas("time",bSent_indx, time_prg_df,'')):
                            # THEN B + A
                            joinAandB = True
                            # print("TRUE", annotation)
                        
                #  A.sentences += B.sentences & A.themes += B.themes & B.discard = true
                # Last resort: a sentence still marked N that has at least one person
                # plus a place or a time becomes an HT on its own.
                if annotation == "N" and sentenceHas("OnePerson",aSent_indx, pp_prg_df,''):
                    if (sentenceHas("place",aSent_indx, pp_prg_df,'')  or sentenceHas("time",aSent_indx, time_prg_df,'')):
                        annotation = "HT"
                        joinAandB = False
                        # print("annotation",annotation)
                if joinAandB:
                    # append the indexes to build the meetup from two sentences
                    annotationsList.extend([[[aSent_indx,bSent_indx],paragraph_indx,annotation]])
                else:
                    annotationsList.extend([[[aSent_indx],paragraph_indx,annotation]])
                # print()
                # print("FINAL Parag: ",paragraph_indx,"Sentences a: ",aSent_indx," and b: ", bSent_indx)
                # print("FINAL annotation for sentence index: ", annotation, "joinAandB: ", joinAandB)
                # print("aHasBtime: ",aHasBtime,". aHasBplace: ", aHasBplace, ". aHasBperson: ",aHasBperson)
                # endSent = timer()
                # print("Time to process sentence: ", str(endSent-startSent))
            bioAnnotationsList.extend(annotationsList)
            print("annotationsList of paragraph",annotationsList)
            del annotationsList
            del sentences_indx_list
            gc.collect()
            # endParag = timer()
            # print("Time to process a paragraph: ", str(endParag - startParag))
            # break
        # break
        # print("bio annotations -> ",bioAnnotationsList)
        # createDfFromList yields after/before/paragraphIndex/annotation; reorder so "before" comes first.
        bioAnnotations_df = createDfFromList(bioAnnotationsList)
        bioAnnotations_df = bioAnnotations_df[['before','after','paragraphIndex','annotation']]
        bioAnnotations_df.to_csv('meetupsAnnotations/'+file_name_item.file_name,index=False)    
        
        del paragraph_indx_list
        gc.collect()
        endFile = timer()        
        print("Time to process file: ", str(endFile-startFile))
        # else:
        #     print("Missing biography {} IRI from dbpedia. No MEETUPS identification executed.".format(file_name_item))
        print("Memory use", tracemalloc.get_traced_memory())
        tracemalloc.stop()
        # break

FILE:  1965784.csv
Paragraphs to process 24
annotationsList of paragraph [[[0], 0, 'N'], [[1], 0, 'HT']]
annotationsList of paragraph [[[0], 3, 'N'], [[1], 3, 'N']]
annotationsList of paragraph [[[0], 6, 'N'], [[1], 6, 'N'], [[2], 6, 'N'], [[3], 6, 'N']]
annotationsList of paragraph [[[0], 7, 'HT'], [[1], 7, 'HT']]
annotationsList of paragraph [[[0], 8, 'N'], [[1], 8, 'N']]
annotationsList of paragraph [[[0], 9, 'N'], [[1], 9, 'HT']]
annotationsList of paragraph [[[0], 10, 'HT'], [[1], 10, 'N'], [[2], 10, 'N'], [[3], 10, 'N']]
Time to process file:  0.9964531459845603
Memory use (53708, 317108)
FILE:  32264675.csv
Paragraphs to process 25
annotationsList of paragraph [[[0], 0, 'HT'], [[1], 0, 'N'], [[2], 0, 'N'], [[3], 0, 'N'], [[4], 0, 'HT'], [[5], 0, 'N'], [[6], 0, 'N']]
annotationsList of paragraph [[[0], 2, 'N'], [[1], 2, 'N'], [[2], 2, 'HT'], [[3], 2, 'N'], [[4], 2, 'N']]
annotationsList of paragraph [[[0], 4, 'N'], [[1], 4, 'HT'], [[2], 4, 'HT'], [[3], 4, 'N'], [[4], 4, 'N']]
ann

# Add the text of the annotated sentence(s) to each annotation as evidence

In [14]:
# Read files in batches
# for chunk in pd.read_csv('totalTest.csv', chunksize=10):
# Second pass over the same file list: reopen each meetupsAnnotations/<file_name>,
# look up the text of the annotated sentence(s) in indexedSentences/<file_name>
# and write the file back with an extra "sentences" column.
for chunk in pd.read_csv('toAnnotatePart6.csv', chunksize=10):
    df_file_name = pd.DataFrame()
    df_file_name['file_name'] = chunk['file_name']
    # for each file
    # entities_df is created here but not used anywhere in this cell
    entities_df = pd.DataFrame()
    for file_name_item in df_file_name.itertuples():
        print("FILE: ",file_name_item.file_name) # FILE name using the DBPedia ID
        tracemalloc.start()
        # READ Files
        # startRead = timer()
        try:
            # Read Annotation file
            meetupsAnnotations_df = pd.read_csv('meetupsAnnotations/'+str(file_name_item.file_name))
        
            if len(meetupsAnnotations_df) == 0:
                print("No annotations")
                # NOTE(review): this skips the memory report and tracemalloc.stop() below.
                continue

            # The process is sentence by sentence. Read indexedSentences/
            sentences_df = pd.read_csv('indexedSentences/'+file_name_item.file_name, usecols=['sentenceIndex','paragraphIndex','sentences'])
            
            # One entry per annotation row, in the same order as meetupsAnnotations_df.
            text_list = []
            for meetup in meetupsAnnotations_df.itertuples():
                if meetup.annotation == 'N':
                    # Discarded sentences carry no evidence text.
                    text = ''
                else:
                    if pd.isnull(meetup.before):
                        # Single-sentence annotation: only the "after" sentence is used.
                        sent_df = sentences_df.loc[(sentences_df.paragraphIndex == int(meetup.paragraphIndex))&(sentences_df.sentenceIndex == int(meetup.after))].copy()
                        text = str(sent_df.iloc[0]['sentences'])
                    else:
                        # Two-sentence annotation: both rows are selected and joined in the
                        # order they appear in sentences_df.
                        # presumably "before" precedes "after" in the file; verify the ordering
                        sent_df = sentences_df[(sentences_df['paragraphIndex'] == meetup.paragraphIndex) & (sentences_df['sentenceIndex'].isin([int(meetup.before), int(meetup.after)]))].copy()
                        text = str(sent_df.iloc[0]['sentences']) + " " + str(sent_df.iloc[1]['sentences'])
                text_list.append(text)
            
            meetupsAnnotations_df['sentences'] = text_list
            # Overwrites the annotation file in place.
            meetupsAnnotations_df.to_csv('meetupsAnnotations/'+file_name_item.file_name,index=False)    
        except Exception as e:
            # Any failure (missing file, missing sentence row, ...) is printed and the
            # loop moves on; the annotation file is then left as it was.
            # print("Identifier: ",str(meetup.after)," ",str(meetup.paragraphIndex))
            print(e, file=sys.stdout)
                
        print("Memory use", tracemalloc.get_traced_memory())
        tracemalloc.stop()

FILE:  1965784.csv
Memory use (33020, 308951)
FILE:  32264675.csv
Memory use (35178, 300195)
FILE:  2934315.csv
Memory use (70830, 316713)
FILE:  28312254.csv
Memory use (25611, 297483)
FILE:  8040240.csv
Memory use (31236, 299878)
FILE:  22459257.csv
Memory use (28255, 296419)
FILE:  3362986.csv
Memory use (34120, 305685)
FILE:  19474675.csv
Memory use (29231, 301854)
FILE:  51596657.csv
Memory use (36999, 310626)
FILE:  484364.csv
Memory use (62636, 326829)
FILE:  4241227.csv
Memory use (43455, 311304)
FILE:  46385848.csv
Memory use (28328, 297378)
FILE:  52279001.csv
Memory use (34932, 304508)
FILE:  8064528.csv
Memory use (25247, 298729)
FILE:  5143989.csv
Memory use (39861, 306644)
FILE:  15182641.csv
Memory use (69903, 319639)
FILE:  6975970.csv
Memory use (79195, 347751)
FILE:  1949333.csv
Memory use (64298, 326679)
FILE:  2470630.csv
Memory use (58752, 317039)
FILE:  18713679.csv
Memory use (24503, 298673)
FILE:  23394028.csv
Memory use (36127, 310438)
FILE:  1862380.csv
Memory