In [11]:
# Bruno Ugolini

# Safe dict reading

define a function `safe_dict(d, k)` that takes in a python dict `d` and a key `k` and makes it safe to read even with keys that aren't in the dictionary. If you try to read from the dictionary with a bad key, it should return 0 instead.

```
d = {1 : 2, 3 : 4}
safe_dict(d, 1) -> 2
safe_dict(d, 'cat') -> 0
```

In [13]:
def safe_dict(d, k):
    """
    For a given dictionary d, the value of
    key k is output while checking for wrong
    key inputs.
    """
    
    try:
        print("The value of key {0} is {1}".format(k, d[k]))
    except KeyError:
        print("There is no key {0} in the dictionary".format(k))
        
    return

d = {1:2, 3:4}
safe_dict(d, 1)
safe_dict(d, 3)
safe_dict(d, 'cat')

The value of key 1 is 2
The value of key 3 is 4
There is no key cat in the dictionary


# File Reading: Hamlet Exercises

Open `hamlet.txt` in the `data` folder

### 1. Mentionned Hamlet

How many times is hamlet mentioned in the book?

Use python and line iteration to count it up

In [15]:
import re, string

# open, read and close the Hamlet text.
pth = './data/hamlet.txt'
with open(pth, 'r', encoding='UTF-8') as f:
    lines = f.readlines()

# set the regex for unicode word recognition
wrds = re.compile(r'\w+')

# create the empty dictionary
all_wrds = {}

# sort line by line through the text
for line in lines:

    # strip the words of punctuation and underscores
    line = line.strip(string.punctuation)
    line = line.replace('_', '')

    # loop through each word in the line
    for k in wrds.findall(line):

        # convert to upper case to normalize convention
        k = k.upper()
        
        # add word to dict and/or count
        try:
            all_wrds[k] += 1
        except KeyError:
            all_wrds[k] = 1

wrd = 'HAMLET'
print(f"The name {wrd} appears {all_wrds[wrd]} times.")

The name HAMLET appears 474 times.


### 2. File Reading as a .py program

Make a python file that defines a function that counts the number of times hamlet is mentionned using the code in the previous exercise.

Then import it in your notebook and call it here.

In [17]:
filenm = 'hamlet.py'

fcn_des_txt = """
This function reads the hamlet text and
returns the word count for wrd
"""
code_txt = """
import re, string

# open, read and close the Hamlet text.
pth = './data/hamlet.txt'
with open(pth, 'r', encoding='UTF-8') as f:
    lines = f.readlines()

# set the regex for unicode word recognition
wrds = re.compile(r'\w+')

# create the empty dictionary
all_wrds = {}

# sort line by line through the text
for line in lines:

    # strip the words of punctuation and underscores
    line = line.strip(string.punctuation)
    line = line.replace('_', '')

    # loop through each word in the line
    for k in wrds.findall(line):

        # convert to upper case to normalize convention
        k = k.upper()
        
        # add word to dict and/or count
        try:
            all_wrds[k] += 1
        except KeyError:
            all_wrds[k] = 1

print(f"The name {wrd} appears {all_wrds[wrd.upper()]} times.")
"""

with open(filenm, 'w') as f:
    f.write("def hamlet_counts(wrd):\n")
    f.write('    """\n')
    f.write(fcn_des_txt)
    f.write('    """\n')
    lines = code_txt.split('\n')
    for line in lines:
        f.write('    '+line+'\n')
    f.close()

from hamlet import hamlet_counts

hamlet_counts('Hamlet')


The name Hamlet appears 474 times.


### 3. Unique words in hamlet

Write a program that counts the unique words in hamlet.

In [1]:
import re, string

# open, read and close the Hamlet text.
pth = './data/hamlet.txt'
with open(pth, 'r', encoding='UTF-8') as f:
    lines = f.readlines()

# set the regex for unicode word recognition
wrds = re.compile(r'\s+')

# create the empty dictionary
all_wrds = {}

# sort line by line through the text
for line in lines:

    # strip the words of punctuation and underscores
    line = line.strip(string.punctuation)
    line = line.replace('_', '')

    # loop through each word in the line
    for k in wrds.split(line):

        # convert to upper case to normalize convention
        #k = k.upper()
        
        # add word to dict and/or count
        try:
            all_wrds[k] += 1
        except KeyError:
            all_wrds[k] = 1

print(f"There are {len(all_wrds)} unique words in the text of Hamlet.")

There are 7664 unique words in the text of Hamlet.


# File Reading 2: A Python library.

In the `data` folder, you will find a folder called `csrgraph` which is a python library.

### 1. File count

Count the `py` files in the library using the `os` package

In [19]:
import os, re

# get the directory listing from os.
ls_out = os.listdir('./data/csrgraph')

# establish the regular expression 
# for our search string
regex = re.compile('.py')

# convert the multi-line output to a
# single string.
in_line_txt = '\n'.join(ls_out)

print(f"There are {len(regex.findall(in_line_txt))} python files.")

There are 8 python files.


### 2. For the following packages, count the number of files that import them:

- pandas 

- numpy

- numba

In [20]:
# define packages to search for
packs = ['pandas', 'numpy', 'numba']

# create a dictionary with zero counts
# for each package.
pack_cnt = {}
for pack in packs:
    pack_cnt[pack] = 0

import re

pth_txt = './data/csrgraph/'

for file in ls_out:
    
    # open each file in succession.
    f = open(pth_txt+file, 'r', encoding='UTF-8')
    
    # read the contents
    lines = f.readlines()
    
    # loop through the packs
    for pack in packs:
        
        # establish a regex for the package
        # and the word "import"
        regex = re.compile(pack)
        regex2 = re.compile('import')
        for line in lines:
            # only if both words appear on 
            # a common line do we count it
            if bool(regex.findall(line)) & bool(regex2.findall(line)):
                pack_cnt[pack] += 1
                # stop sorting for this package
                # in this file
                break
                
    f.close()

# display the results
for k in pack_cnt:
    print('The python package {0} is called {1} times in these 8 files.'.format(k, pack_cnt[k]))


The python package pandas is called 4 times in these 8 files.
The python package numpy is called 6 times in these 8 files.
The python package numba is called 6 times in these 8 files.


# First NLP Program: IDF

Given a list of words, the the inverse document frequency (IDF) is a basic statistic of the amount of information of each word in the text.

The IDF formulat is:

$$IDF(w) = ln(\dfrac{N}{1 + n(w)})$$

Where:

- $w$ is the token (unique word),
- $n(w)$ is the number of documents that $w$ occurs in,
- $N$ is the total number of documents

Write a function, `idf(docs)` that takes in a list of lists of words and returns a dictionary  `word -> idf score`

Example:

```
IDF([['interview', 'questions'], ['interview', 'answers']]) -> {'questions': 0.0, 
                                                                'interview': -0.4, 
                                                                'answers': 0.0}


```

In [21]:
def idf(docs):
    """
    This function calculates the Inverse
    Document Frequency (IDF) for the 
    document passed to it.
    """
    import math

    # create an empty dictionary    
    dct = {}
    
    # loop through each document in
    # the documents passed
    for doc in docs:
        
        # loop through the unique
        # elements in each document
        for elem in set(doc):
            
            # populate the dictionary
            try:
                dct[elem] += 1
            except KeyError:
                dct[elem] = 1
    
    # calculate the IDF
    for k in dct:
        dct[k] = math.log(len(docs)/(1+dct[k]))
        print(f"The IDF for {k} is: {dct[k]:.4f}.")
    
    return dct

_ = idf([['interview', 'questions'], ['interview', 'answers']])

The IDF for questions is: 0.0000.
The IDF for interview is: -0.4055.
The IDF for answers is: 0.0000.


In [22]:
_ = idf([['interview', 'questions', 'interview'], ['interview', 'answers'], ['questions', 'answers', 'worksheet']])

The IDF for questions is: 0.0000.
The IDF for interview is: 0.0000.
The IDF for answers is: 0.0000.
The IDF for worksheet is: 0.4055.


# Stretch Goal: TF-IDF on Hamlet

The TF-IDF score is a commonly used statistic for the importance of words. Its $\frac{TF}{IDF}$ where TF is the "term frequency" (eg. how often the words happens in the document).

Calculate the TF-IDF dictionary on the Hamlet book.

What's the TF-IDF of "Hamlet"?

What's the word with the highest TF-IDF in the book?

In [23]:
import re, string, math
import pandas as pd

# open, read and close the Hamlet text.
pth = './data/hamlet.txt'
with open(pth, 'r', encoding='UTF-8') as f:
    lines = f.readlines()

# create a dataframe to store results
df = pd.DataFrame(columns=['Word', 'nw', 'TF'])

# create the regular expression to parse with
wrds = re.compile(r'\w+')

# initialize document counter (line counter)
N = 0

for line in lines:
    
    # (because some lines are blank and we should
    # ignore them)
    if not line.strip():
        continue
    N += 1
    
    # strip the words of punctuation and underscores
    line = line.strip(string.punctuation)
    line = line.replace('_', '').upper()
    
    # loop through the unique set of words
    keys = set(wrds.findall(line))
    for k in keys:

        # set a regex for this word
        regex = re.compile(k)
        cnts = len(regex.findall(line))

        if k not in df['Word'].values:
            # if not present, add the word
            s = pd.Series({'Word':k, 'nw':1, 'TF':cnts})
            df = df.append(s, ignore_index=True)
        else:
            # if present increment the word counts
            df.loc[df['Word'] == k, 'nw'] += 1
            df.loc[df['Word'] == k, 'TF'] += cnts

# add a column for the TF/IDF calculation
df['TF_IDF'] = df.apply(lambda row: row.TF / math.log(N/(1+row.nw)), axis=1)

# view the results sorted by TF/IDF ratio
df.sort_values('TF_IDF', ascending=False)

Unnamed: 0,Word,nw,TF,TF_IDF
7,I,593,1744,799.770090
10,A,508,1804,772.571820
4,THE,949,1315,768.536627
81,AND,932,1040,601.468692
51,TO,683,785,384.889184
...,...,...,...,...
2384,HYRCANIAN,1,1,0.126994
2386,RESEMBLE,1,1,0.126994
2387,COUCHED,1,1,0.126994
2388,OMINOUS,1,1,0.126994


In [24]:
# print out the word with the highest TF/IDF ratio
df.loc[df['TF_IDF'].idxmax()]
# 
#           The word is with the highest
#           TF/IDF ratio is "I"
#

Word           I
nw           593
TF          1744
TF_IDF    799.77
Name: 7, dtype: object

# Stretch Goal: Speaker count

Use a regular expression and looping over the `hamlet.txt` file to build a dictionary `character_name -> # times speaking`.

Who speaks the most often? Who speaks the least often?

In [25]:
with open('./data/hamlet.txt', 'r', encoding='UTF-8') as f:
    lines = f.readlines()

import re, string
import pandas as pd

# create empty dictionary
speakers = {}

# create parser for :
# 1) start of line
# 2) word format
# 3) all CAPS
# 4) followed by a period
# 5) followed by new line
wrds = re.compile('^\w[A-Z]+\.\n')

for line in lines:
    
    # if no matches to parser, continue
    if wrds.match(line) != None:
        # eliminate period and new line
        name = wrds.match(line).string.replace('.','')
        name = name.replace('\n','')
        # enter name into dict and/or increment count
        try:
            speakers[name] += 1
        except KeyError:
            speakers[name] = 1

{k: v for k, v in sorted(speakers.items(), key=lambda item: item[1])}

{'BOTH': 1,
 'ALL': 1,
 'VOLTEMAND': 1,
 'PROLOGUE': 1,
 'LUCIANUS': 1,
 'SERVANT': 1,
 'DANES': 2,
 'MESSENGER': 2,
 'PRIEST': 2,
 'GENTLEMAN': 3,
 'LORD': 3,
 'FORTINBRAS': 6,
 'CAPTAIN': 7,
 'FRANCISCO': 8,
 'REYNALDO': 13,
 'GHOST': 14,
 'BARNARDO': 18,
 'OSRIC': 25,
 'GUILDENSTERN': 29,
 'MARCELLUS': 31,
 'ROSENCRANTZ': 45,
 'OPHELIA': 58,
 'LAERTES': 62,
 'QUEEN': 69,
 'POLONIUS': 86,
 'KING': 102,
 'HORATIO': 107,
 'HAMLET': 358}

In [26]:
print(f"The character who speaks the most is {max(speakers, key=speakers.get)}.")
# delete general or non-character entries
del speakers['BOTH']
del speakers['ALL']
del speakers['PROLOGUE']
# collect those who speak only once
for k, v in speakers.items():
    if v == 1:
        print(f"{k} only speaks once in Hamlet.")


The character who speaks the most is HAMLET.
VOLTEMAND only speaks once in Hamlet.
LUCIANUS only speaks once in Hamlet.
SERVANT only speaks once in Hamlet.
