## Libraries

In [9]:
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk, re
import pandas as pd
import random


## Defining the Functions

In [19]:
# Text preprocessing: lowercase, removing numbers, removing punctuation except '.'
def clean_txt(txt):
    result = txt.lower()
    result = ''.join([r for r in result if not r.isdigit()])
    result = result.translate(str.maketrans("","", '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~')) 
    return(result)

## Anaphora Finder 

In [20]:
# Search in the corpus for anaphora cases, i.e. repetition of n-tokens at the beginning of adjacent sentences
def find_anaphora (txt, n):
    sent_tokenized = sent_tokenize(txt)
    dataset = []
    tmp = []
    repetition = False
    copied = False
    for i in range(len(sent_tokenized) - 1):
        prev_sent = clean_txt(sent_tokenized[i])
        next_sent = clean_txt(sent_tokenized[i + 1])
        prev_phrase = get_sentence_start(prev_sent, n)
        next_phrase = get_sentence_start(next_sent, n)
        if prev_phrase == next_phrase and prev_phrase != '' and next_phrase != '' :
            if repetition == False:
                tmp.append(prev_sent)
                tmp.append(next_sent)
                repetition = True
            else:
                tmp.append(next_sent)
        else:
            if len(tmp) >= 3:
                tmp_str = ''
                strong_punc = False
                for w in tmp:
                    tmp_str += w
                words = word_tokenize(tmp_str)
                if words[-1] != '.':
                    strong_punc = True
                dataset.append(tuple((tmp_str, len(tmp), round(len(words)/len(tmp)), strong_punc)))
                copied = True
            tmp = []
            repetition = False
    if not copied and len(tmp) >= 3:
        tmp_str = ''
        strong_punc = False
        for w in tmp:
            tmp_str += w
        words = word_tokenize(tmp_str)
        if words[-1] != '.':
            strong_punc = True
        dataset.append(tuple((tmp_str, len(tmp), round(len(words)/len(tmp)), strong_punc)))
    return dataset

# Returns n-first tokens of a sentence, used inside findAnaphora function for comparing the sentence start
def get_sentence_start(sentence, n):
    #sentence = clean_txt(sentence)
    res = []
    tokens = word_tokenize(sentence)
    if len(tokens) >= n:
        for i in range(n):
            if i == n-1:
                res.append(tokens[i])
            else:
                res.append(tokens[i] + ' ')
    if len(res) >= 1:
        str_txt = ''.join(res)
    else:
        str_txt = ''
    return str_txt

## Data Acquisition

In [21]:
import pandas as pd
f=pd.read_csv("corpus.csv")
keep_col = ['transcripts']
new_f = f[keep_col]
data = new_f.to_string() #render string representation
#new_f.to_csv("newFile.csv", index=False) #inorder to make new file with only transcripts

In [22]:
new_f.head()

Unnamed: 0,transcripts
0,Fellow Citizens of the Senate and the House of...
1,"When it was first perceived, in early times, t..."
2,"FRIENDS AND FELLOW-CITIZENS, Called upon to un..."
3,Unwilling to depart from examples of the most ...
4,I should be destitute of feeling if I was not ...


In [23]:
len(word_tokenize(data)) #Corpus contains around 4.2 million words

4210680

## Search Anaphora

In [24]:
ana_result = find_anaphora(data, 3)

In [27]:
print(ana_result)

[('is there one among our citizens who would not prefer perpetual peace with texas to occasional wars which so often occur between bordering independent nationsis there one who would not prefer free intercourse with her to high duties on all our products and manufactures which enter her ports or cross her frontiersis there one who would not prefer an unrestricted communication with her citizens to the frontier obstructions which must occur if she remains out of the union', 3, 26, True), ('if it be the question is decided.if it be not expressed the next inquiry must be whether it is properly an incident to an expressed power and necessary to its execution.if it be it may be exercised by congress.if it be not congress can not exercise it.', 4, 12, False), ('it will be found by applying the restriction thus understood to the bill under consideration that it contains appropriations for more than twenty objects of internal improvement called in the bill harbors at places which have never be

In [26]:
len(ana_result)

335

Anaphora (Level: Sentence and punctuation) 
Example for the randomly chosen speech

In [217]:

nlp = spacy.load("en")
sentences = list()

tokens = nlp(data['speech'][test])

for sent in tokens.sents:
    sentences.append(sent.string.strip())

In [218]:

first_sent = sentences[0] 
first_sent = first_sent.lower()
first_sent = re.sub(r'[^\w\s]','',first_sent)
last_sent = first_sent.split()
anaphora = list()
for sent in sentences[1:]:
    sent = sent.lower()
    sent = re.sub(r'[^\w\s]','',sent)
    words = sent.split()
    anaphora_record = list()
    for i, j in zip(words, last_sent):
        if i == j :
            anaphora_record.append(i)
        else:
            break
    if len(anaphora_record) > 0:
        anaphora.append(anaphora_record)
    last_sent = words;

In [219]:
for sent in sentences:
    part = re.split('[,:;]', sent)
    first_part = part[0] 
    first_part = first_part.lower()
    first_part = re.sub(r'[^\w\s]','',first_part)
    words = first_part.split()
    words = list(filter(('and').__ne__, words))
    words = list(filter(('').__ne__, words))
    last_part = words
    for i in part[1:]:
        i  = i.lower()
        i = re.sub(r'[^\w\s]','',i)
        words = i.split()
        words = list(filter(('and').__ne__, words))
        words = list(filter(('').__ne__, words))
        anaphora_record = list()
        for i, j in zip(words, last_part):
            if i == j :
                anaphora_record.append(i)
            else:
                break
        if len(anaphora_record) > 0:
            anaphora.append(anaphora_record)
        last_part = words;
        
        
            
        
    
    

In [220]:
print(anaphora)

[]


In [221]:
print("Number of anaphora candidates: ", len(anaphora))

Number of anaphora candidates:  0


Epistrophe (Level: Sentence and punctuation) 
Example for the randomly chosen speech

In [222]:

first_sent = sentences[0] 
first_sent = first_sent.lower()
first_sent = re.sub(r'[^\w\s]','',first_sent)
last_sent = first_sent.split()
epistrophe = list()
for sent in sentences[1:]:
    sent = sent.lower()
    sent = re.sub(r'[^\w\s]','',sent)
    words = sent.split()
    epistrophe_record = list()
    for i, j in zip(words[::-1], last_sent[::-1]):
        if i == j :
            epistrophe_record.append(i)
        else:
            break
    if len(epistrophe_record) > 0:
        epistrophe.append(epistrophe_record[::-1])
    last_sent = words;

In [223]:
for sent in sentences:
    part = re.split('[,:;]', sent)
    first_part = part[0] 
    first_part = first_part.lower()
    first_part = re.sub(r'[^\w\s]','',first_part)
    words = first_part.split()
    words = list(filter(('and').__ne__, words))
    words = list(filter(('').__ne__, words))
    last_part = words
    for i in part[1:]:
        i  = i.lower()
        i = re.sub(r'[^\w\s]','',i)
        words = i.split()
        words = list(filter(('and').__ne__, words))
        words = list(filter(('').__ne__, words))
        epistrophe_record = list()
        for i, j in zip(words[::-1], last_part[::-1]):
            if i == j :
                epistrophe_record.append(i)
            else:
                break
        if len(epistrophe_record) > 0:
            epistrophe.append(epistrophe_record)
        last_part = words;

In [224]:
print(epistrophe)

[]


In [225]:
print("Number of epistrophe candidates: ", len(epistrophe))

Number of epistrophe candidates:  0


Epalanepsis (Level: Sentence and punctuation) 
Example for the randomly chosen speech

In [226]:

first_sent = sentences[0] 
first_sent = first_sent.lower()
first_sent = re.sub(r'[^\w\s]','',first_sent)
last_sent = first_sent.split()
epanalepsis = list()
for sent in sentences[1:]:
    sent = sent.lower()
    sent = re.sub(r'[^\w\s]','',sent)
    words = sent.split()
    epanalepsis_record = list()
    for i, j in zip(words[::-1], last_sent):
        if i == j :
            epanalepsis_record.append(i) 
        else:
            break
    if len(epanalepsis_record) > 0:
        epanalepsis.append(epanalepsis_record[::-1])
    last_sent = words;

In [227]:
for sent in sentences:
    part = re.split('[,:;]', sent)
    first_part = part[0] 
    first_part = first_part.lower()
    first_part = re.sub(r'[^\w\s]','',first_part)
    words = first_part.split()
    words = list(filter(('and').__ne__, words))
    words = list(filter(('').__ne__, words))
    last_part = words
    for i in part[1:]:
        i  = i.lower()
        i = re.sub(r'[^\w\s]','',i)
        words = i.split()
        words = list(filter(('and').__ne__, words))
        words = list(filter(('').__ne__, words))
        epanalepsis_record = list()
        for i, j in zip(words[::-1], last_part):
            if i == j :
                epanalepsis_record.append(i)
            else:
                break
        if len(epanalepsis_record) > 0:
            epanalepsis.append(epanalepsis_record)
        last_part = words;

In [228]:
print(epanalepsis)

[]


In [215]:
print("Number of epanalepsis candidates: ", len(epanalepsis))

Number of epanalepsis candidates:  0
