In [3]:
import pandas as pd
from stanza.server import CoreNLPClient

In [101]:

client = CoreNLPClient(
    annotators=['tokenize','ssplit', 'pos', 'lemma', 'ner', 'parse', 'depparse','coref'],
    memory='16G', 
    properties={'coref.algorithm': 'neural'},
    endpoint='http://localhost:8000',
    be_quiet=True,
    timeout=1200000)

client.start()
import time; time.sleep(10)


2025-02-24 22:58:09 INFO: Writing properties to tmp file: corenlp_server-e8506313a3914c7f.props
2025-02-24 22:58:09 INFO: Starting server with command: java -Xmx16G -cp /Users/aravpatel/stanza_corenlp/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 8000 -timeout 1200000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-e8506313a3914c7f.props -annotators tokenize,ssplit,pos,lemma,ner,parse,depparse,coref -preload -outputFormat serialized


In [24]:
import pandas as pd
df = pd.read_csv('articles_ner.csv')

In [None]:
text1 = df['articles'][0]


In [51]:

#perform coreference resolution
ann = client.annotate(text1)
chain = ann.corefChain
print(chain)




[chainID: 161
mention {
  mentionID: 161
  mentionType: "PROPER"
  number: "SINGULAR"
  gender: "NEUTRAL"
  animacy: "INANIMATE"
  beginIndex: 12
  endIndex: 13
  headIndex: 12
  sentenceIndex: 22
  position: 1
}
mention {
  mentionID: 81
  mentionType: "PROPER"
  number: "SINGULAR"
  gender: "NEUTRAL"
  animacy: "INANIMATE"
  beginIndex: 11
  endIndex: 12
  headIndex: 11
  sentenceIndex: 10
  position: 1
}
mention {
  mentionID: 65
  mentionType: "PROPER"
  number: "SINGULAR"
  gender: "NEUTRAL"
  animacy: "INANIMATE"
  beginIndex: 16
  endIndex: 17
  headIndex: 16
  sentenceIndex: 8
  position: 1
}
mention {
  mentionID: 82
  mentionType: "PROPER"
  number: "SINGULAR"
  gender: "NEUTRAL"
  animacy: "INANIMATE"
  beginIndex: 19
  endIndex: 20
  headIndex: 19
  sentenceIndex: 10
  position: 2
}
representative: 2
, chainID: 102
mention {
  mentionID: 100
  mentionType: "NOMINAL"
  number: "PLURAL"
  gender: "UNKNOWN"
  animacy: "ANIMATE"
  beginIndex: 1
  endIndex: 4
  headIndex: 3
  se

In [56]:
def print_mentions(chain, ann):
    # Extract tokens for reference
    tokens_by_sentence = [
        [token.word for token in sentence.token] for sentence in ann.sentence
    ]

    # Function to extract token text from CoreNLP output
    def extract_mention_text(mention):
        sent_idx = mention.sentenceIndex
        start_idx = mention.beginIndex
        end_idx = mention.endIndex
        return " ".join(tokens_by_sentence[sent_idx][start_idx:end_idx])

    # Extract and format coreference chains
    coref_chains = {}

    for chain in chain:
        rep_mention = chain.mention[0]  # First mention as representative
        rep_text = extract_mention_text(rep_mention)

        mentions = []

        for mention in chain.mention:
            mention_text = extract_mention_text(mention)
            if mention_text != rep_text:  # Avoid duplicate mapping
                mentions.append(mention_text)

        if mentions:
            coref_chains[rep_text] = mentions

    # Print Coreference Chains
    for rep, mentions in coref_chains.items():
        print(f"Representative: '{rep}'")
        print(f"Mentions: {mentions}")
        print("-" * 40)


In [57]:
print_mentions(chain, ann)

Representative: 'The AAP leaders'
Mentions: ['they']
----------------------------------------
Representative: 'the crowd'
Mentions: ['us']
----------------------------------------
Representative: 'Prime Minister Narendra Modi'
Mentions: ['my', 'my', 'he', 'his', 'I', 'he', 'he', 'I', 'I']
----------------------------------------
Representative: 'Ms. Sitharaman'
Mentions: ['Finance Minister Nirmala Sitharaman']
----------------------------------------
Representative: 'their'
Mentions: ['people']
----------------------------------------
Representative: 'The MLAs'
Mentions: ['the eight AAP MLAs joining the BJP days ahead of the election']
----------------------------------------
Representative: 'the BJP'
Mentions: ['he', 'BJP', 'the Indira Gandhi government', 'He', 'The BJP government', 'the government', 'the government', 'BJP', 'BJP', 'the Nehru government', 'BJP', 'BJP', 'BJP', 'the BJP government']
----------------------------------------
Representative: 'your'
Mentions: ['you']
------

In [None]:

def extract_coref_chains(ann):
    
    # Extract tokens by sentence for reference
    tokens_by_sentence = [
        [token.word for token in sentence.token] for sentence in ann.sentence
    ]

    coref_chains = []

    for chain in ann.corefChain:
        chain_data = {"chainID": chain.chainID, "mentions": []}

        # Identify representative mention
        rep_mention = chain.mention[0]  # Usually, the first mention is the best reference
        rep_text = " ".join(tokens_by_sentence[rep_mention.sentenceIndex][rep_mention.beginIndex:rep_mention.endIndex])
        chain_data["representative"] = rep_text

        for mention in chain.mention:
            mention_text = " ".join(tokens_by_sentence[mention.sentenceIndex][mention.beginIndex:mention.endIndex])
            mention_data = {
                "text": mention_text,
                "sentence": mention.sentenceIndex,
                "start": mention.beginIndex,
                "end": mention.endIndex

            }
            chain_data["mentions"].append(mention_data)

        coref_chains.append(chain_data)
    
    return coref_chains



In [62]:
ann = client.annotate(text1)
coref_chains = extract_coref_chains(ann)
print(coref_chains)

[{'chainID': 161, 'mentions': [{'text': 'Delhi', 'sentence': 22, 'start': 12, 'end': 13}, {'text': 'Delhi', 'sentence': 10, 'start': 11, 'end': 12}, {'text': 'Delhi', 'sentence': 8, 'start': 16, 'end': 17}, {'text': 'Delhi', 'sentence': 10, 'start': 19, 'end': 20}], 'representative': 'Delhi'}, {'chainID': 102, 'mentions': [{'text': 'The AAP leaders', 'sentence': 13, 'start': 1, 'end': 4}, {'text': 'they', 'sentence': 13, 'start': 9, 'end': 10}], 'representative': 'The AAP leaders'}, {'chainID': 166, 'mentions': [{'text': 'the AAP ’s', 'sentence': 12, 'start': 19, 'end': 22}, {'text': 'the AAP ’s', 'sentence': 22, 'start': 29, 'end': 32}, {'text': 'the AAP ’s', 'sentence': 12, 'start': 12, 'end': 15}], 'representative': 'the AAP ’s'}, {'chainID': 73, 'mentions': [{'text': 'the crowd', 'sentence': 8, 'start': 7, 'end': 9}, {'text': 'us', 'sentence': 8, 'start': 27, 'end': 28}], 'representative': 'the crowd'}, {'chainID': 105, 'mentions': [{'text': 'Prime Minister Narendra Modi', 'sentenc

In [100]:
client.stop()

In [67]:
df.head()

Unnamed: 0,links,articles,titles,pos_tags,ner_tags
0,https://www.thehindu.com/elections/delhi-assem...,Lauding the Union Budget presented by Finance ...,"\nModi attacks AAP, lauds ‘people’s Budget’ in...","[('Lauding', 'VERB'), ('the', 'DET'), ('Union'...","[('the Union Budget', 'ORG'), ('Finance', 'ORG..."
1,https://www.thehindu.com/elections/delhi-assem...,The EVMs and VVPAT machines used in the Delhi ...,\nEVMs kept in strongrooms with three-tier sec...,"[('The', 'DET'), ('EVMs', 'PROPN'), ('and', 'C...","[('VVPAT', 'ORG'), ('the Delhi Assembly', 'ORG..."
2,https://www.thehindu.com/elections/delhi-assem...,A day before polling for the 70-member Delhi A...,"\nAtishi, relatives of Bidhuri booked for poll...","[('A', 'DET'), ('day', 'NOUN'), ('before', 'AD...","[('70', 'CARDINAL'), ('Delhi Assembly', 'PERSO..."
3,https://www.thehindu.com/elections/delhi-assem...,Sitting next to his stall on a packed afternoo...,"\nBeset with dip in profits, inadequate infras...","[('Sitting', 'VERB'), ('next', 'ADV'), ('to', ...","[('afternoon', 'TIME'), ('Ghazipur', 'PERSON')..."
4,https://www.thehindu.com/elections/delhi-assem...,As she drives her e-rickshaw through the stree...,"\nInflation, civic amenities, representation, ...","[('As', 'SCONJ'), ('she', 'PRON'), ('drives', ...","[('south Delhi’s Khanpur', 'GPE'), ('Savitri',..."


In [78]:
df_first_5 = df.iloc[:5]


In [79]:
df_first_5['coref_chains'] = df_first_5['articles'].apply(lambda x: extract_coref_chains(client.annotate(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_first_5['coref_chains'] = df_first_5['articles'].apply(lambda x: extract_coref_chains(client.annotate(x)))


In [102]:
df['coref_chains'] = None
for idx, row in df.iterrows():
    print(f"Processing row {idx}")
    df.at[idx, 'coref_chains'] = extract_coref_chains(client.annotate(row['articles']))
    

Processing row 0
Processing row 1
Processing row 2
Processing row 3
Processing row 4
Processing row 5
Processing row 6
Processing row 7
Processing row 8
Processing row 9
Processing row 10
Processing row 11
Processing row 12
Processing row 13
Processing row 14
Processing row 15
Processing row 16
Processing row 17
Processing row 18
Processing row 19
Processing row 20
Processing row 21
Processing row 22
Processing row 23
Processing row 24
Processing row 25
Processing row 26
Processing row 27
Processing row 28
Processing row 29
Processing row 30
Processing row 31
Processing row 32
Processing row 33
Processing row 34
Processing row 35
Processing row 36
Processing row 37
Processing row 38
Processing row 39
Processing row 40
Processing row 41
Processing row 42
Processing row 43
Processing row 44
Processing row 45
Processing row 46
Processing row 47
Processing row 48
Processing row 49
Processing row 50
Processing row 51
Processing row 52
Processing row 53
Processing row 54
Processing row 55
Pr

In [106]:
#save the dataframe
df.to_csv('articles_coref.csv', index=False)