## Libraries

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk, re
import pandas as pd
import random


## Defining the Functions

In [3]:
# Text preprocessing: lowercase, removing numbers, removing punctuation except '.'
def clean_txt(txt):
    result = txt.lower()
    result = ''.join([r for r in result if not r.isdigit()])
    #result = result.translate(str.maketrans("","", '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~')) 
    return(result)

## Anaphora Finder 

In [4]:
# Search in the corpus for anaphora cases, i.e. repetition of n-tokens at the beginning of adjacent sentences
def find_anaphora (txt, n):
    sent_tokenized = sent_tokenize(txt)
    dataset = []
    tmp = []
    repetition = False
    copied = False
    for i in range(len(sent_tokenized) - 1):
        prev_sent = clean_txt(sent_tokenized[i])
        next_sent = clean_txt(sent_tokenized[i + 1])
        prev_phrase = get_sentence_start(prev_sent, n)
        next_phrase = get_sentence_start(next_sent, n)
        if prev_phrase == next_phrase and prev_phrase != '' and next_phrase != '' :
            if repetition == False:
                tmp.append(prev_sent)
                tmp.append(next_sent)
                repetition = True
            else:
                tmp.append(next_sent)
        else:
            if len(tmp) >= 3:
                tmp_str = ''
                strong_punc = False
                for w in tmp:
                    tmp_str += w
                words = word_tokenize(tmp_str)
                if words[-1] != '.':
                    strong_punc = True
                dataset.append(tuple((tmp_str, len(tmp), round(len(words)/len(tmp)), strong_punc)))
                copied = True
            tmp = []
            repetition = False
    if not copied and len(tmp) >= 3:
        tmp_str = ''
        strong_punc = False
        for w in tmp:
            tmp_str += w
        words = word_tokenize(tmp_str)
        if words[-1] != '.':
            strong_punc = True
        dataset.append(tuple((tmp_str, len(tmp), round(len(words)/len(tmp)), strong_punc)))
    return dataset

# Returns n-first tokens of a sentence, used inside findAnaphora function for comparing the sentence start
def get_sentence_start(sentence, n):
    tmp = []
    tokens = word_tokenize(sentence)
    if len(tokens) >= n:
        for i in range(n):
            if i == n-1:
                tmp.append(tokens[i])
            else:
                tmp.append(tokens[i] + ' ')
    if len(tmp) >= 1:
        str_txt = ''.join(tmp)
    else:
        str_txt = ''
    return str_txt

## Epistrophe Finder

In [5]:
# Search in the corpus for epistrophe cases, i.e. repetition of n-tokens at the end of adjacent sentences
def find_Epistrophe (txt, n):
    sent_tokenized = re.split("[.,?!;:]", txt)
    dataset = []
    tmp = []
    repetition = False
    copied = False
    identical = False
    for i in range(len(sent_tokenized) - 1):
        diff = 0.0        
        prev_sent = clean_txt(sent_tokenized[i]).strip()
        next_sent = clean_txt(sent_tokenized[i + 1]).strip()
        prev_phrase = get_sentence_end(prev_sent, n)
        next_phrase = get_sentence_end(next_sent, n)
        if prev_phrase == next_phrase and prev_phrase != '' and next_phrase != '' :
            if repetition == False:
                tmp.append(prev_sent + '.')
                tmp.append(next_sent + '.')                
                repetition = True
            else:
                tmp.append(next_sent + '.')
        else:
            if len(tmp) >= 3:
                lengths = []
                tmp_str = ''
                dif = 0 
                for r in range(len(tmp) - 1):
                    if tmp[r] == tmp[r + 1]:
                        identical = True                    
                for w in tmp:
                    words = word_tokenize(w)
                    lengths.append(len(words))
                    tmp_str += w   
                for l in range(len(lengths) - 1):
                    dif += abs(lengths[l] - lengths[l+1])
                dataset.append(tuple((tmp_str, dif, len(tmp), identical)))
                copied = True
            tmp = []
            repetition = False
            identical = False
    if not copied and len(tmp) >= 3:
        lengths = []
        tmp_str = ''
        dif = 0
        for r in range(len(tmp) - 1):
                    if tmp[r] == tmp[r + 1]:
                        identical = True
        for w in tmp:
            words = word_tokenize(w)
            lengths.append(len(words))
            tmp_str += w
        for l in range(len(lengths) - 1):
            dif += abs(lengths[l] - lengths[l+1])
        dataset.append(tuple((tmp_str, dif, len(tmp), identical)))
    return dataset


# Returns n-last tokens of a sentence, used inside findEpistrophe function for comparing the sentence end
def get_sentence_end(sentence, n):
    #sentence = clean_txt(sentence)
    tmp = []
    clean_s = re.sub("\.", "", sentence)
    tokens = word_tokenize(clean_s)
    if len(tokens) >= n:
        for j in range(n, 0, -1):
            if j == 1:
                tmp.append(tokens[-j])
            else:
                tmp.append(tokens[-j] + ' ')
    if len(tmp) >= 1:
        str_txt = ''.join(tmp)
    else:
        str_txt = ''
    return str_txt

## Data Acquisition

In [19]:
f = pd.read_csv("corpus.csv")
keep_col = ['transcripts']
new_f = f[keep_col]
data = new_f.to_string() #render string representation
#new_f.to_csv("newFile.csv", index=False) #inorder to make new file with only transcripts

In [7]:
new_f.head()

Unnamed: 0,transcripts
0,Fellow Citizens of the Senate and the House of...
1,"When it was first perceived, in early times, t..."
2,"FRIENDS AND FELLOW-CITIZENS, Called upon to un..."
3,Unwilling to depart from examples of the most ...
4,I should be destitute of feeling if I was not ...


In [8]:
len(word_tokenize(data)) #Corpus contains around 4.2 million words

4210680

## Search Anaphora

In [9]:
ana_result = find_anaphora(data, 3)

In [10]:
print(ana_result)

[('is there one among our citizens who would not prefer perpetual peace with texas to occasional wars, which so often occur between bordering independent nations?is there one who would not prefer free intercourse with her to high duties on all our products and manufactures which enter her ports or cross her frontiers?is there one who would not prefer an unrestricted communication with her citizens to the frontier obstructions which must occur if she remains out of the union?', 3, 28, True), ('if it be, the question is decided.if it be not expressed, the next inquiry must be whether it is properly an incident to an expressed power and necessary to its execution.if it be, it may be exercised by congress.if it be not, congress can not exercise it.', 4, 13, False), ('it will be found by applying the restriction thus understood to the bill under consideration that it contains appropriations for more than twenty objects of internal improvement, called in the bill harbors, at places which hav

In [11]:
len(ana_result)

372

## Search Epistrophe

In [12]:
epi_result = find_Epistrophe(data, 2)

In [13]:
print(epi_result)

[('the conventions of  -  -.and of  -  -.will expire by their own limitation on  -  -.', 6, 3, False), ('the next session of the chambers commenced on  -  -.and continued until  -  -.a new bill was introduced on  -  -.', 8, 3, False), ('it is created by law.is amendable by law.and is repealable by law.', 2, 3, False), ('and commerce with the republic of new granada.among the conditions of which was a stipulation on the part of new granada guaranteeing to the united states the right of way or transit across that part of the isthmus which lies in the territory of new granada.in consideration of which the united states guaranteed in respect of the same territory the rights of sovereignty and property of new granada.', 47, 3, False), ('between her britannic majesty and the republic of honduras.constituted and declared a free territory under the sovereignty of the said republic of honduras.” stipulated that “the two contracting parties do hereby mutually engage to recognize and respect in a

In [14]:
len(epi_result)

133

## Construct DataFrame 

In [51]:
#Dataframe for Anaphora
#Choosen Features: Number of successive sentences, Average Length of Sentences, if a sentence has strong punctuation
df_a = pd.DataFrame(ana_result, columns=["Text", "No of Successive Sent", "AvgLength of sent", "StrongPunctuation"]) 
labels_a = [0,0,0,0,0,0,1,1,0,1,0,1,1,1,1,1,0,0,1,0,
            0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,1,1,
            1,1,1,1,0,0,1,0,1,1,1,1,0,0,0,1,1,0,1,1,
            0,1,1,1,0,1,1,0,0,1,1,0,0,1,0,0,1,1,0,0,
            0,0,1,0,0,1,1,1,1,1,1,0,0,1,0,1,1,0,1,0,
            1,1,1,1,1,0,0,0,1,0,0,1,1,0,0,1,1,0,1,1,
            1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,0,0,0,0,0,
            1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,
            0,1,1,1,1,1,1,1,0,1,0,1,1,1,0,1,1,1,1,1,
            0,1,0,1,0,1,0,1,0,1,1,1,1,0,1,1,0,1,0,1,
            0,1,1,0,0,1,1,1,1,0,1,1,0,1,1,0,1,0,1,0,
            0,1,1,0,0,0,0,0,0,1,1,0,0,1,1,0,1,1,1,1,
            0,0,1,0,1,1,0,1,0,1,1,0,1,0,1,1,1,1,0,1,
            1,1,0,0,0,1,0,1,1,1,1,0,0,0,1,1,0,1,1,1,
            1,1,1,1,1,1,1,0,0,0,1,1,0,1,1,0,0,1,1,0,
            0,1,0,0,1,0,1,0,0,1,1,1,0,0,1,0,1,0,1,1,
            0,1,0,1,1,1,1,1,0,1,1,1,0,0,0,1,0,0,0,1,
            1,1,1,1,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,1,
            1,0,1,0,1,0,1,1,0,1,0,1]

df_a.insert(4, "isAnaphora", labels_a, True)
df_a

Unnamed: 0,Text,No of Successive Sent,AvgLength of sent,StrongPunctuation,isAnaphora
0,is there one among our citizens who would not ...,3,28,True,0
1,"if it be, the question is decided.if it be not...",4,13,False,0
2,it will be found by applying the restriction t...,4,52,False,0
3,it was the interest of the bank that the reven...,3,44,False,0
4,is it the fact that in all the unsettled regio...,3,51,True,0
...,...,...,...,...,...
367,this is the time to re ignite the american ima...,3,18,False,1
368,you can't take big loads because you have peop...,3,19,False,0
369,we could do all of these different things so e...,3,8,False,1
370,it's going up in new mexico.it's going up in a...,3,8,False,0


In [50]:

df_e = pd.DataFrame(epi_result, columns=["Text", "Sent Length Difference", "No of Successive Sent", "isIdentical"])
labels_e = [0,0,1,0,0,0,1,0,0,1,
            1,0,0,0,0,0,1,0,1,0,
            1,0,1,1,1,1,0,0,0,0,
            0,0,1,1,1,1,0,0,0,0,
            0,0,0,1,1,1,0,1,0,1,
            1,1,1,0,1,1,1,1,1,1,
            1,0,1,1,1,1,1,1,1,0,
            0,1,0,1,0,0,1,0,1,0,
            1,1,1,1,0,0,1,1,1,1,
            0,0,0,1,1,1,1,1,1,1,
            1,1,1,1,1,0,0,1,1,0,
            0,0,0,0,1,1,1,1,1,0,
            1,1,1,0,0,0,1,0,0,1,
            0,0,0]
df_e.insert(4, "isEpiphora", labels_e, True)
df_e

Unnamed: 0,Text,Sent Length Difference,No of Successive Sent,isIdentical,isEpiphora
0,the conventions of - -.and of - -.will exp...,6,3,False,0
1,the next session of the chambers commenced on ...,8,3,False,0
2,it is created by law.is amendable by law.and i...,2,3,False,1
3,and commerce with the republic of new granada....,47,3,False,0
4,between her britannic majesty and the republic...,28,3,False,0
...,...,...,...,...,...
128,“don't do it.don't do it.don't do it.,1,3,True,0
129,i want those companies—and they're starting—i ...,8,3,False,1
130,“how is your drug problem.” “we don't have muc...,7,3,False,0
131,thank you.thank you.thank you.thank you.thank ...,0,5,True,0


Epalanepsis (Level: Sentence and punctuation) 
Example for the randomly chosen speech

In [17]:

first_sent = sentences[0] 
first_sent = first_sent.lower()
first_sent = re.sub(r'[^\w\s]','',first_sent)
last_sent = first_sent.split()
epanalepsis = list()
for sent in sentences[1:]:
    sent = sent.lower()
    sent = re.sub(r'[^\w\s]','',sent)
    words = sent.split()
    epanalepsis_record = list()
    for i, j in zip(words[::-1], last_sent):
        if i == j :
            epanalepsis_record.append(i) 
        else:
            break
    if len(epanalepsis_record) > 0:
        epanalepsis.append(epanalepsis_record[::-1])
    last_sent = words;

NameError: name 'sentences' is not defined

In [None]:
for sent in sentences:
    part = re.split('[,:;]', sent)
    first_part = part[0] 
    first_part = first_part.lower()
    first_part = re.sub(r'[^\w\s]','',first_part)
    words = first_part.split()
    words = list(filter(('and').__ne__, words))
    words = list(filter(('').__ne__, words))
    last_part = words
    for i in part[1:]:
        i  = i.lower()
        i = re.sub(r'[^\w\s]','',i)
        words = i.split()
        words = list(filter(('and').__ne__, words))
        words = list(filter(('').__ne__, words))
        epanalepsis_record = list()
        for i, j in zip(words[::-1], last_part):
            if i == j :
                epanalepsis_record.append(i)
            else:
                break
        if len(epanalepsis_record) > 0:
            epanalepsis.append(epanalepsis_record)
        last_part = words;

In [None]:
print(epanalepsis)

In [None]:
print("Number of epanalepsis candidates: ", len(epanalepsis))