## Mining Text from Hamlet using Regular Expressions 
###  Agustina Maccio 

In [1]:
import pandas as pd
import re

# Read the whole Act 1 text into memory. The `with` block closes the file
# handle as soon as the read finishes, even if reading raises an error.
with open('hamlet_act1.txt', 'r') as act1_file:
    text = act1_file.read()

print(text)

ACT 1
=====

Scene 1
[Enter Barnardo and Francisco, two sentinels.]


BARNARDO  Who's there?

FRANCISCO
Nay, answer me. Stand and unfold yourself.

BARNARDO  Long live the King!

FRANCISCO  Barnardo?

BARNARDO  He.

FRANCISCO
You come most carefully upon your hour.

BARNARDO
'Tis now struck twelve. Get thee to bed, Francisco.

FRANCISCO
For this relief much thanks. 'Tis bitter cold,
And I am sick at heart.

BARNARDO  Have you had quiet guard?

FRANCISCO  Not a mouse stirring.

BARNARDO  Well, good night.
If you do meet Horatio and Marcellus,
The rivals of my watch, bid them make haste.

[Enter Horatio and Marcellus.]


FRANCISCO
I think I hear them.--Stand ho! Who is there?

HORATIO  Friends to this ground.

MARCELLUS  And liegemen to the Dane.

FRANCISCO  Give you good night.

MARCELLUS
O farewell, honest soldier. Who hath relieved
you?

FRANCISCO
Barnardo hath my place. Give you good night.
[Francisco exits.]

MARCELLUS  Holla, Barnardo.

BARNARDO  Say, what, is Horatio there?

HORAT

## Hamlet - Act 1: Mining Sentences

In [2]:
# Mine every sentence in Act I with one regular expression. A match is:
#   [\s]         one whitespace character (space or newline) before the sentence
#   [']?         an optional leading apostrophe, as in "'Tis"
#   [A-Z][^A-Z]  an uppercase letter followed by a non-uppercase character
#   [^=\.!?]+    one or more characters other than '=', '.', '!' or '?'
#   [\.!?]       the closing '.', '!' or '?'
sentence_pattern = re.compile(r"([\s][']?[A-Z][^A-Z][^=\.!?]+[\.!?])")
result = sentence_pattern.findall(text)

# number of sentences found by the mining process
len(result)

554

In [3]:
# Display the list of mined sentences. Each entry still starts with the
# whitespace character (space or newline) captured by the sentence pattern.
result


[' Barnardo and Francisco, two sentinels.',
 " Who's there?",
 '\nNay, answer me.',
 ' Stand and unfold yourself.',
 ' Long live the King!',
 ' Barnardo?',
 '\nYou come most carefully upon your hour.',
 "\n'Tis now struck twelve.",
 ' Get thee to bed, Francisco.',
 '\nFor this relief much thanks.',
 " 'Tis bitter cold,\nAnd I am sick at heart.",
 ' Have you had quiet guard?',
 ' Not a mouse stirring.',
 ' Well, good night.',
 '\nIf you do meet Horatio and Marcellus,\nThe rivals of my watch, bid them make haste.',
 ' Horatio and Marcellus.',
 '\nI think I hear them.',
 ' Who is there?',
 ' Friends to this ground.',
 ' And liegemen to the Dane.',
 ' Give you good night.',
 '\nO farewell, honest soldier.',
 ' Who hath relieved\nyou?',
 '\nBarnardo hath my place.',
 ' Give you good night.',
 ' Holla, Barnardo.',
 ' Say, what, is Horatio there?',
 ' A piece of him.',
 '\nWelcome, Horatio.',
 ' Marcellus.',
 '\nWhat, has this thing appeared again tonight?',
 ' I have seen nothing.',
 "\nHora

## Identify all the bigrams: (noun -> verb) and (pronoun -> verb) 

In [4]:
import nltk
from nltk.util import ngrams
from nltk import RegexpParser
import ast

# Print NLTK's reference listing of the Penn Treebank part-of-speech tags
# (each tag with its description and example words).
nltk.help.upenn_tagset()


$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [5]:
# Chunk grammar: a token whose tag starts with "N" (the noun tags) or a
# personal pronoun (PRP), immediately followed by a token whose tag starts
# with "VB" (the verb tags).
patterns = """
    extract: {<N.*|PRP><VB.*>}
    """

# Both helpers are loop-invariant, so they are built once instead of once per
# sentence. The chunker is created from `patterns` rather than a duplicated
# inline copy of the grammar.
PChunker = RegexpParser(patterns)
# matches tokens containing at least one ASCII letter or digit
nonPunct = re.compile(".*[A-Za-z0-9].*")

# bigram text -> number of occurrences across all mined sentences
bigrams = {}
# most recent matching chunk, kept so it can be inspected after the loop
last_subtree = 0
for sentence in result:
    # word tokenization, lower-cased before filtering and tagging
    nltk_tokens = [token.lower() for token in nltk.word_tokenize(sentence)]

    # drop punctuation-only tokens, then tag what is left
    filtered = [w for w in nltk_tokens if nonPunct.match(w)]
    tags = nltk.pos_tag(filtered)

    # tree whose "extract" subtrees are the word pairs selected by the grammar
    chunks = PChunker.parse(tags)
    # compare the label with ==; `in "extract"` was a substring test that would
    # also accept labels such as "e" or ""
    for subtree in chunks.subtrees(filter=lambda t: t.label() == "extract"):
        last_subtree = subtree
        # each leaf is a (word, tag) pair; join the words with single spaces
        exp = " ".join(str(leaf[0]) for leaf in subtree.leaves())
        # the tokenizer splits "'tis" into "'t" + "is"; count it as "it is"
        if exp == "'t is":
            exp = "it is"
        bigrams[exp] = bigrams.get(exp, 0) + 1

bigrams

{'you come': 1,
 'it is': 27,
 'you had': 1,
 'you do': 3,
 'watch bid': 1,
 'them make': 1,
 'thing appeared': 1,
 'i have': 7,
 'horatio says': 1,
 'him touching': 1,
 'apparition come': 1,
 'we have': 4,
 'nights seen': 1,
 'us hear': 1,
 'pole had': 1,
 'it burns': 1,
 'break thee': 1,
 'it comes': 3,
 'thou art': 1,
 'it horatio': 3,
 'it harrows': 1,
 'denmark did': 1,
 'it stalks': 1,
 'he had': 1,
 'norway combated': 1,
 'he smote': 1,
 'he gone': 1,
 'task does': 1,
 'doth make': 1,
 'whisper goes': 1,
 'us was': 1,
 'you know': 4,
 'thereto pricked': 1,
 'pride dared': 1,
 'world esteemed': 1,
 'him did': 1,
 'heraldry did': 1,
 'he stood': 1,
 'competent was': 1,
 'fortinbras had': 1,
 'he been': 1,
 'article designed': 1,
 'it doth': 2,
 'terms compulsatory': 1,
 'father lost': 3,
 'i think': 4,
 'it be': 4,
 'it sort': 1,
 'figure comes': 1,
 'julius fell': 1,
 'graves stood': 1,
 'dead did': 1,
 'empire stands': 1,
 'harbingers preceding': 1,
 'omen coming': 1,
 'it blast

In [6]:
# Report how many distinct noun/pronoun + verb bigrams were collected.
unique_bigram_total = len(bigrams)
print("Number of unique noun|pronoun + verb bigrams mined: ", unique_bigram_total)

Number of unique noun|pronoun + verb bigrams mined:  350


## Top 10 bigrams

In [7]:
# Rank the (bigram, count) pairs from most to least frequent. sorted() already
# returns a list, and its sort is stable, so tied counts keep the dictionary's
# insertion order.
sorted_bigrams = sorted(bigrams.items(), key=lambda pair: pair[1], reverse=True)
top_ten = sorted_bigrams[:10]
top_ten

[('it is', 27),
 ('i have', 7),
 ('i do', 6),
 ('i am', 6),
 ('you are', 6),
 ('you have', 5),
 ('we have', 4),
 ('you know', 4),
 ('i think', 4),
 ('it be', 4)]

In [8]:
# Write all bigrams and their counts to a text file, in the order of
# sorted_bigrams. The `with` block flushes and closes the file even if one of
# the writes raises, which the manual open()/close() pair did not guarantee.
with open("act1_hamlet_bigrams.txt", "w") as act1_bigrams:
    # file title followed by a blank line
    act1_bigrams.write("Act 1 - Hamlet Bigrams (noun|pronoun -> verb) \n\n")

    # one "(bigram, count)" tuple per line
    for entry in sorted_bigrams:
        act1_bigrams.write(str(entry) + "\n")