In [1]:
import re
import glob

In [2]:
# Read in cliches into a list
with open("cliche_edited.txt") as cliche_file:
    cliches_list = [line.rstrip() for line in cliche_file]

First we tried to see if we could get any results on one author to check if the cliches existed in the data

In [4]:
# Try running on one author
txt_files = glob.glob("novels_clean/Lewis Carroll/*.txt")

cliche_count = 0

# Read each file
for file in txt_files:
    f = open(file, "r")
    
    # Loop through each line in the text
    for line in f:
        # Check for each cliche
        for cliche in cliches_list:
            if cliche in line:
                cliche_count += 1
    
    f.close()

print("Number of cliches: ")
print(cliche_count)

Number of cliches: 
627


Once that was successful, we tried the same approach on all authors

In [34]:
# Try running on all authors
# Collect list of all author names
folders = glob.glob("novels_clean/*")

# For each author, check # of cliches
for author in folders:
    path = author + "/*.txt"
    txt_files = glob.glob(path)

    cliche_count = 0

    # Read each file
    for file in txt_files:
        f = open(file, "r")

        # Loop through each line in the text
        for line in f:
            # Check for each cliche
            for cliche in cliches_list:
                if cliche in line:
                    cliche_count += 1

        f.close()

    # Print number of cliches found in each authors total files
    print("Number of cliches for " + author + ": ")
    print(cliche_count)

Number of cliches for novels_clean/H.B. Marriott Watson: 
273
Number of cliches for novels_clean/William Harrison Ainsworth: 
2013
Number of cliches for novels_clean/Clemence Housman: 
134
Number of cliches for novels_clean/James Payn: 
762
Number of cliches for novels_clean/Georgiana Fullerton: 
260
Number of cliches for novels_clean/Emma Leslie: 
250
Number of cliches for novels_clean/Frederick Marryat: 
5476
Number of cliches for novels_clean/Leonard Merrick: 
593
Number of cliches for novels_clean/Emily Bronte: 
118
Number of cliches for novels_clean/William Makepeace Thackeray: 
2585
Number of cliches for novels_clean/Florence Marryat: 
293
Number of cliches for novels_clean/Israel Zangwill: 
1932
Number of cliches for novels_clean/Hall Caine: 
2499
Number of cliches for novels_clean/Morley Roberts: 
759
Number of cliches for novels_clean/Ann Radcliffe: 
239
Number of cliches for novels_clean/H.G. Wells: 
4962
Number of cliches for novels_clean/E. Phillips Oppenheim: 
3159
Number 

This gave results, but looking back at our list of expressions, it seems there may be some false positives.  As such, we're going to exclude one-word expressions and add in word boundaries.  Additionaly, we were counting the number of cliches present in the file—we've updated the code to count duplicate occurances of specific cliches.

In [20]:
# Remove single word cliches
new_cliches_list = []
for cliche in cliches_list:
    if " " in cliche.strip():
        new_cliches_list.append(cliche)

In [17]:
# Try new approach running on one author
txt_files = glob.glob("novels_clean/Jane Austen/*.txt")

cliche_count = 0

# Read each file
for file in txt_files:
    f = open(file, "r")
    
    # Loop through each line in the text
    for line in f:
        
        # Check for each cliche
        for cliche in new_cliches_list:
            cliche_count += line.count(cliche)
    
    f.close()

print("Number of cliches: ")
print(cliche_count)

Number of cliches: 
8646


Playing with the results, these seem much more meaningful.  Let's try this for all the authors.

In [23]:
# Dictionary to hold author: cliche count
author_results = {}

In [24]:
# Try running on all authors
# Collect list of all author names
folders = glob.glob("novels_clean/*")

# For each author, check # of cliches
for author in folders:
    path = author + "/*.txt"
    txt_files = glob.glob(path)

    cliche_count = 0

    # Read each file
    for file in txt_files:
        f = open(file, "r")

        # Loop through each line in the text
        for line in f:
            # Check for each cliche
            for cliche in new_cliches_list:
                cliche_count += line.count(cliche)

        f.close()

    # Print number of cliches found in each authors total files
    author = author[13:]
    print("Number of cliches for " + author + ": ")
    print(cliche_count)
    
    # Add to dictionary
    author_results[author] = cliche_count

Number of cliches for H.B. Marriott Watson: 
991
Number of cliches for William Harrison Ainsworth: 
9763
Number of cliches for Clemence Housman: 
322
Number of cliches for James Payn: 
2369
Number of cliches for Georgiana Fullerton: 
1860
Number of cliches for Emma Leslie: 
758
Number of cliches for Frederick Marryat: 
26371
Number of cliches for Leonard Merrick: 
2110
Number of cliches for Emily Bronte: 
548
Number of cliches for William Makepeace Thackeray: 
14037
Number of cliches for Florence Marryat: 
1044
Number of cliches for Israel Zangwill: 
7227
Number of cliches for Hall Caine: 
13542
Number of cliches for Morley Roberts: 
2264
Number of cliches for Ann Radcliffe: 
1846
Number of cliches for H.G. Wells: 
20170
Number of cliches for E. Phillips Oppenheim: 
11341
Number of cliches for R.D. Blackmore: 
18888
Number of cliches for Thomas Hughes: 
4889
Number of cliches for Thomas Love Peacock: 
935
Number of cliches for Edgar Jepson: 
950
Number of cliches for Ouida: 
9448
Numbe

In [25]:
print(author_results)

{'H.B. Marriott Watson': 991, 'William Harrison Ainsworth': 9763, 'Clemence Housman': 322, 'James Payn': 2369, 'Georgiana Fullerton': 1860, 'Emma Leslie': 758, 'Frederick Marryat': 26371, 'Leonard Merrick': 2110, 'Emily Bronte': 548, 'William Makepeace Thackeray': 14037, 'Florence Marryat': 1044, 'Israel Zangwill': 7227, 'Hall Caine': 13542, 'Morley Roberts': 2264, 'Ann Radcliffe': 1846, 'H.G. Wells': 20170, 'E. Phillips Oppenheim': 11341, 'R.D. Blackmore': 18888, 'Thomas Hughes': 4889, 'Thomas Love Peacock': 935, 'Edgar Jepson': 950, 'Ouida': 9448, 'Maurice Hewlett': 6204, 'Thomas Hardy': 12908, 'Walter Besant': 9368, 'G.A. Henty': 61749, 'Bernard Capes': 7192, 'Charles Dickens': 37035, 'Frederick Rolfe': 610, 'Hugh Conway': 403, 'Maria Edgeworth': 11079, 'Elizabeth Gaskell': 13872, 'William Godwin': 2159, 'Walter Pater': 1152, 'Grace Aguilar': 3679, 'Charles Kingsley': 6568, 'Charlotte Bronte': 3395, 'George Grossmith': 279, 'Anthony Trollope': 56956, 'Flora Annie Steel': 10062, 'Ame