# Ritz, Julia (2010). Using tf-idf-related Measures for Determining the Anaphoricity of Noun Phrases.

```perl
#1st pass
l:=4; #initialize term length l
D:=0; #initialize file counter D

for each Document d_i in the corpus
#count document
D++;
p:=1; #initialize character position p
    while p + l in d_i
        #sequentially cut into terms t of length l
        t:=substring(d_i, p, l);
        #*insert string normalization (optional)*
        #initialize count array where necessary
        C(t, d_i):=0 unless defined;
        #save number of previous mentions
        #(i.e. annotate t with C(t, d_i))
        A(t, d_i, p):=C(t, d_i);
        #count current mention
        C(t, d_i)++;
        #count documents containing t
        #(only on first mention of t)
        E(t)++ if (C(t, d_i) = 1);
        p++;
    end; #end while
end; #end for each;


#2nd pass
for each Document d_i in the corpus
    for each noun phrase NP_{s,e} in d_i
        sum:=0; #initialize sum
        #from NP's starting position...
        p:=s;
        #...to start of last term
        while p <= e - l + 1
            t:=substring(d_i, p, l);
            #*insert string normalization (optional)*
            #get annotation of t at p,
            #calculate tf-idf from it
            #and add it to the current sum
            sum+=(get(t, d_i, p)/p)*log(D/E(t));
            #calculate sum of other measures
            ...
            #advance to the next term (step implied by the loop, not spelled out in the listing)
            p++;
        end; #end while

        #average by the number of terms in NP_{s,e}
        a:=sum/(e - s - l + 2);
        #annotate sum and means to NP_{s,e}
        S(d_i, s, e):=sum;
        M(d_i, s, e):=a;
    end; #end for each
end; #end for each
```

In [43]:
from collections import defaultdict


def first_pass(corpus, term_len=4):
    """Collect character n-gram statistics for every document in the corpus.

    Each document is cut into all overlapping terms of ``term_len``
    characters (``l`` in the pseudocode).  For every term occurrence the
    number of earlier occurrences of the same term in the same document is
    recorded, together with per-document term counts and document
    frequencies.

    Args:
        corpus: iterable of sliceable documents (e.g. strings); a document's
            id is its index in the iteration order.
        term_len: length of each term in characters; must be >= 1.

    Returns:
        A tuple ``(previous_mentions, term_count, docs_containing_term_count)``:

        * ``previous_mentions``: dict mapping ``(term, doc_id, char_pos)`` to
          the number of times ``term`` occurred in that document before the
          0-based position ``char_pos`` (``A`` in the pseudocode).
        * ``term_count``: ``defaultdict`` mapping ``term -> doc_id -> count``
          (``C`` in the pseudocode).
        * ``docs_containing_term_count``: ``defaultdict`` mapping ``term`` to
          the number of documents containing it (``E`` in the pseudocode).

    Raises:
        ValueError: if ``term_len`` is smaller than 1.
    """
    if term_len < 1:
        raise ValueError("term_len must be >= 1, got {!r}".format(term_len))

    previous_mentions = {}
    term_count = defaultdict(lambda: defaultdict(int))
    docs_containing_term_count = defaultdict(int)

    for doc_id, doc in enumerate(corpus):
        # A document of length n contains n - term_len + 1 terms, the last one
        # starting at n - term_len.  The range is empty for documents shorter
        # than term_len.
        for char_pos in range(len(doc) - term_len + 1):
            # sequentially cut into terms t of length l
            term = doc[char_pos:char_pos + term_len]

            #*insert string normalization (optional)*

            # save number of previous mentions, then count the current one
            mentions_so_far = term_count[term][doc_id]
            previous_mentions[(term, doc_id, char_pos)] = mentions_so_far
            term_count[term][doc_id] = mentions_so_far + 1

            if mentions_so_far == 0:
                # first mention of t in this document: count the document
                docs_containing_term_count[term] += 1

    return previous_mentions, term_count, docs_containing_term_count

In [24]:
from math import log

def second_pass(corpus, previous_mentions, docs_containing_term_count, term_len=4):
    """Score every noun phrase with the summed and averaged tf-idf of its terms.

    For each term of length ``term_len`` inside a noun phrase, the number of
    previous mentions recorded by the first pass is divided by the term's
    1-based position in the document and weighted by
    ``log(D / E(t))``, where ``D`` is the corpus size and ``E(t)`` the number
    of documents containing the term.

    Args:
        corpus: sequence of documents.  Each document must support slicing
            and provide ``noun_phrases()``, yielding objects with
            ``startpos()`` and ``endpos()``.
            NOTE(review): positions are assumed to be 0-based character
            offsets with an inclusive end, matching the keys produced by the
            first pass -- confirm against the noun phrase provider.
        previous_mentions: mapping ``(term, doc_id, char_pos)`` -> number of
            earlier occurrences of ``term`` in that document.
        docs_containing_term_count: mapping ``term`` -> number of documents
            containing it.
        term_len: length of each term in characters; should be the value
            used to build ``previous_mentions``.

    Returns:
        A tuple ``(np_sums, np_means)`` of dicts keyed by
        ``(doc_id, start_pos, end_pos)``.  A noun phrase shorter than
        ``term_len`` contains no term and gets a sum and mean of 0.

    Raises:
        KeyError: if a term position inside a noun phrase is missing from
            ``previous_mentions`` (or from a plain-dict
            ``docs_containing_term_count``).
    """
    np_sums = {}
    np_means = {}
    # D in the pseudocode; float so the idf ratio is not truncated by
    # Python 2 integer division.
    num_docs = float(len(corpus))

    for doc_id, doc in enumerate(corpus):
        # NP starting at position s and ending at position e
        for noun_phrase in doc.noun_phrases():
            start_pos = noun_phrase.startpos()
            end_pos = noun_phrase.endpos()
            np_sum = 0.0

            # from the NP's starting position to the start of its last term
            last_term_start = end_pos - term_len + 1
            for char_pos in range(start_pos, last_term_start + 1):
                term = doc[char_pos:char_pos + term_len]

                #*insert string normalization (optional)*

                # get annotation of t at p, calculate tf-idf from it and add
                # it to the current sum.  The pseudocode's p is 1-based, so
                # the 0-based char_pos is shifted by one; this also avoids a
                # division by zero at the very first character.
                anno_t_p = previous_mentions[(term, doc_id, char_pos)]
                tf = anno_t_p / float(char_pos + 1)
                idf = log(num_docs / docs_containing_term_count[term])
                np_sum += tf * idf

                # TODO: calculate sum of other measures

            # average by the number of terms in the NP; guard against NPs
            # that are too short to contain a single term
            num_terms = end_pos - start_pos - term_len + 2
            average = np_sum / num_terms if num_terms > 0 else 0.0

            # annotate sum and mean to the NP
            np_sums[(doc_id, start_pos, end_pos)] = np_sum
            np_means[(doc_id, start_pos, end_pos)] = average
    return np_sums, np_means

In [33]:
# Run the first pass over a six-sentence German toy corpus and keep the
# previous-mention annotations, per-document term counts and document
# frequencies for inspection.
prev, term_count, doc_count = first_pass(corpus=[
    'das haus ist rot.',
    'die roten haeuser.',
    'die haeuser sind rot',
    'der turm ist gruen.',
    'zwei haeuser sind auch rot.',
    'die toten haeuser.',
])

In [35]:
# for term in term_count:
#     print term, term_count[term]

# for term in doc_count:
#     print "term '{}' is in {} document(s)".format(term, doc_count[term])

In [23]:
# Experiment: only two levels are auto-created here -- the inner default
# factory is int, so previous_mentions['a']['b'] evaluates to the int 0 and
# the third subscript assignment fails with the TypeError shown below.
previous_mentions = defaultdict(lambda : defaultdict(int))
previous_mentions['a']['b']['c'] = 23

TypeError: 'int' object does not support item assignment