In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import nltk
import re
import os
from tqdm.notebook import tqdm



In [2]:
# Capture the notebook's starting directory only once, so that re-running this
# cell does not rebind `cwd` to ".../data" and then attempt ".../data/data".
if "cwd" not in globals():
    cwd = os.getcwd()

# The cells below open review files by bare name and list this directory.
os.chdir(os.path.join(cwd, "data"))

In [3]:
# A paragraph whose text starts with one of these is site boilerplate.
_BOILERPLATE_PREFIXES = ("Photos:", "Discuss")

# A paragraph whose text contains one of these is site boilerplate.
_BOILERPLATE_FRAGMENTS = (
    "www.",
    "To reserve tickets, please call",
    "Reservations:",
    "Reviews on this site are subject to",
    "Click to listen to",
    "This review also appears on",
    "{ijseo_redirect id=1}",
    "Buy this book on Amazon.com",
)

# Character names removed from the review text, keyed by the play abbreviation.
# "iago" replaces the original "lago", which could never match lowercased text;
# "hecate?" keeps both spellings now that names are matched as whole words.
_CHARACTER_NAMES_BY_PLAY = {
    "tn": r'orsino|sebastian|antonio|valentine|andrew|cesario|curio|toby|malvolio|fabian|feste|olivia|viola|maria',
    "aw": r'lafew|parolles|rinaldo|lavatch|helena|diana|mariana|violenta',
    "asyl": r'amiens|jaques|charles|oliver|orlando|adam|dennis|touchstone|martext|corin|silvius|hymen|rosalind|celia|phebe|audrey',
    "ham": r'claudius|hamlet|polonius|horatio|laertes|voltemand|cornelius|rosencrantz|guildenstern|osric|marcellus|barnado|francisco|reynaldo|fortinbras|gertrude|ophelia',
    "lll": r'ferdinand|berowne|longaville|dumaine|boyet|marcade|adriano|armado|nathaniel|holofernes|costard|forester|rosaline|maria|katherine|jaquenetta|blackamoors',
    "mac": r'duncan|malcolm|donalbain|macbeth|banquo|macduff|lennox|rosse|menteth|angus|cathness|fleance|siward|seyton|hecate?',
    "mnd": r'theseus|egeus|lysander|demetrius|philostrate|quince|starveling|hippolyta|hermia|helena|oberon|titania|puck|peaseblossom|cobweb|mustardseed',
    "rj": r'escalus|paris|romeo|juliet|montague|capulet|mercutio|benvolio|tybalt|petruchio|lawrence|john|balthasar|abram|sampson|gregory|peter|anthony|potpan',
    "oth": r'brabantio|gratiano|lodovico|othello|cassio|iago|roderigo|montano|desdemona|emilia|bianca',
    "jc": r'julius|caesar|octavius|mark|antony|lepidus|cicero|publius|popilius|brutus|cassius|casca|trebonius|caius|ligarius|decius|metellus|cimber|cinna|flavius|murellus|artemidorus|cnidos|soothsayer|lucilius|titinius|messala|volumnius|varrus|clitus|claudio|strato|lucius|dardanius|pindarus|calphurnia|cobbler|portia',
}

# Compiled once at definition time. The \b anchors restrict removal to whole
# words (a bare substring match turns "remarkable" into "reable"); the optional
# suffix also removes plural/possessive forms such as "hamlets" or "hamlet's".
_FILTERED_WORDS_RE = re.compile(
    r"\b(?:"
    + "|".join([*_CHARACTER_NAMES_BY_PLAY.values(),
                "shakespeare|shakespeares|william|williams"])
    + r")(?:'?s)?\b"
)


def _is_boilerplate(text):
    """Return True when a stripped paragraph matches any boilerplate marker."""
    return (text.startswith(_BOILERPLATE_PREFIXES)
            or any(fragment in text for fragment in _BOILERPLATE_FRAGMENTS))


def _flatten(parts):
    """Join text fragments with spaces and drop newlines, backslashes and slashes."""
    return (" ".join(parts)
            .replace("\n", "")
            .replace("\\", "")
            .replace("/", "")
            .replace("  ", " "))


def parser(html_file):
    """Parse one saved review page into a token table and a one-row metadata table.

    Parameters
    ----------
    html_file : str
        Path of the review HTML file to read.

    Returns
    -------
    content_df_token : pandas.DataFrame
        One row per token, indexed by (review_id, sent_id, token_id), where
        review_id is the page title. Columns are ``term_str`` (the token) and
        ``pos_tuple`` (the ``(token, tag)`` pair returned by ``nltk.pos_tag``).
    lib : pandas.DataFrame
        A single row with the columns "Original Work", "Review Title",
        "Review Author", "Content" and "Overall Rating".

    Raises
    ------
    ValueError
        If the last row of the ``ratings-stars`` table has no ``title=`` text.
    IndexError
        If the page has no <title>, no author <meta> tag, or no fallback link
        for the original work.
    """
    # NOTE(review): the file is decoded with the platform default encoding;
    # confirm the saved pages match it.
    with open(html_file) as html:
        soup = BeautifulSoup(html, 'html.parser')

    # Prefer the "theatre-details" list; otherwise fall back to the second link.
    details = soup.find_all("ul", id="theatre-details")
    if details:
        original_work = details[0].get_text().split("\n")[1]
    else:
        original_work = soup.find_all("a")[1].get_text()

    title = soup.find_all("title")[0].get_text()
    author = soup.find_all("meta", attrs={"name": "author"})[0]["content"]

    # The rating is read from the three characters that follow `title=` in the
    # last row of the ratings table, with punctuation stripped afterwards.
    last_row = str(soup.find("table", id="ratings-stars")).split("<tr>")[-1]
    rating_start = last_row.index("title=") + len("title=")
    overall_rating = re.sub(r'[^\w\s]', '', last_row[rating_start:rating_start + 3])

    # Filter in a single pass: removing from the list once per matching marker
    # raised ValueError whenever one paragraph matched two markers.
    paragraphs = [p.get_text().strip() for p in soup.find_all("p")]
    content = _flatten(text for text in paragraphs if not _is_boilerplate(text))

    # Pages without usable <p> text: try the alternative containers in order
    # and use the first one that is present on the page.
    if content == "":
        for tag_name, css_class in (("span", "reviewstext"), ("div", "jrListingFulltext")):
            fallback = [node.get_text().strip()
                        for node in soup.find_all(tag_name, {"class": css_class})]
            if fallback:
                content = _flatten(fallback)
                break

    content = re.sub(r',|"|”|“|\(|\)|’|‘|\#|\$|£|&', '', content)
    content = re.sub(r'\-|\―|\/|\+|…|\—|–', ' ', content)
    content = content.lower()

    content = _FILTERED_WORDS_RE.sub('', content)

    # ------ sentences: split on terminal punctuation, drop blank pieces
    content_df_sent = pd.DataFrame({"review_id": [title],
                                    'content': content})['content']\
        .str.split(r'[.!?:;]+', expand=True).stack().to_frame('sent_str')
    content_df_sent['sent_str'] = content_df_sent['sent_str'].str.strip()
    content_df_sent = content_df_sent[~content_df_sent['sent_str'].str.match(r'^\s*$')]
    content_df_sent.index.names = ['review_id', 'sent_id']

    # ------ tokens: split on whitespace, apostrophes, commas and hyphens
    content_df_token = content_df_sent['sent_str'].str.split(r"[\s',-]", expand=True).stack().to_frame('term_str')
    content_df_token['term_str'] = content_df_token['term_str'].str.strip()
    content_df_token = content_df_token[~content_df_token['term_str'].str.match(r'^\s*$')]
    content_df_token.index.names = ['review_id', 'sent_id', 'token_id']

    # The first index level still holds the positional row label (0); replace
    # it with the review title.
    content_df_token = content_df_token.reset_index()
    content_df_token['review_id'] = title
    content_df_token = content_df_token.set_index(['review_id', 'sent_id', 'token_id'])

    content_df_token['pos_tuple'] = nltk.pos_tag(content_df_token['term_str'].tolist())

    # ------ LIB
    lib = pd.DataFrame({"Original Work": [original_work],
                        "Review Title": [title],
                        "Review Author": [author],
                        "Content": [content],
                        "Overall Rating": [overall_rating]})

    return content_df_token, lib

In [4]:
# Parse a single review first to eyeball the token table before the full run.
sample_result = parser('review1.html')
d, l = sample_result  # d: token table, l: one-row metadata table
d.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,term_str,pos_tuple
review_id,sent_id,token_id,Unnamed: 3_level_1,Unnamed: 4_level_1
Twelfth Night is a New Perspective at the National,0,0,golden,"(golden, JJ)"
Twelfth Night is a New Perspective at the National,0,1,brown,"(brown, NN)"
Twelfth Night is a New Perspective at the National,0,2,leaves,"(leaves, VBZ)"
Twelfth Night is a New Perspective at the National,0,3,decorate,"(decorate, VBP)"
Twelfth Night is a New Perspective at the National,0,4,the,"(the, DT)"
Twelfth Night is a New Perspective at the National,0,5,edges,"(edges, NNS)"
Twelfth Night is a New Perspective at the National,0,6,of,"(of, IN)"
Twelfth Night is a New Perspective at the National,0,7,the,"(the, DT)"
Twelfth Night is a New Perspective at the National,0,8,stage,"(stage, NN)"
Twelfth Night is a New Perspective at the National,0,9,in,"(in, IN)"


In [None]:
# Discover the review files by name instead of inferring their count from the
# directory size, so stray entries (hidden files, checkpoints) cannot shift
# which reviews are processed. Sort numerically to keep review1, review2, ...
review_files = sorted(
    (name for name in os.listdir() if re.fullmatch(r"review\d+\.html", name)),
    key=lambda name: int(re.search(r"\d+", name).group()),
)
if not review_files:
    raise FileNotFoundError("no review<N>.html files found in " + os.getcwd())

# Collect the per-review frames and concatenate once at the end; calling
# pd.concat inside the loop re-copies every accumulated row on each iteration.
doc_frames = []
lib_frames = []

for filename in tqdm(review_files):
    doc, lib = parser(filename)
    doc_frames.append(doc)
    lib_frames.append(lib)

DOC = pd.concat(doc_frames)
LIB = pd.concat(lib_frames)

# Keep only the tag from each (token, tag) pair.
DOC["pos"] = [tag for _, tag in DOC["pos_tuple"]]

DOC.head(20)

  0%|          | 0/292 [00:00<?, ?it/s]

In [None]:
# Works labelled "Tragedy"; every other original work is labelled "Comedy".
tragedy_titles = ['Hamlet', 'Macbeth', 'Romeo & Juliet', 'Othello', 'Julius Caesar']

genre_labels = np.where(LIB["Original Work"].isin(tragedy_titles), "Tragedy", "Comedy")

# Ratings above 3 count as positive, everything else as negative.
rating_labels = np.where(LIB["Overall Rating"].astype(int) > 3, "Positive", "Negative")

LIB = (
    LIB
    .assign(**{"Genre": genre_labels, "Rating Category": rating_labels})
    .set_index(["Original Work", "Review Title"])
)
LIB.head(20)

In [None]:
# Term-by-tag count matrix; the column with the largest count is the term's
# most frequent part-of-speech tag.
pos_counts = DOC[['term_str', 'pos']].value_counts().unstack(fill_value=0)

# One row per distinct term: count, length, relative frequency and
# self-information (-log2 p), plus the most frequent tag.
VOCAB = (
    DOC['term_str']
    .value_counts()
    .to_frame('n')
    .sort_index()
    .rename_axis('term_str')
    .assign(
        n_chars=lambda vocab: vocab.index.str.len(),
        p=lambda vocab: vocab['n'] / vocab['n'].sum(),
        i=lambda vocab: -np.log2(vocab['p']),
        max_pos=pos_counts.idxmax(axis=1),
    )
)
VOCAB.head(20)

In [None]:
# Write the three tables next to the notebook's starting directory. Create the
# folder first: to_csv does not create missing parent directories.
output_dir = os.path.join(cwd, "output")
os.makedirs(output_dir, exist_ok=True)

for table, csv_name in ((DOC, "DOC.csv"), (LIB, "LIB.csv"), (VOCAB, "VOCAB.csv")):
    table.to_csv(os.path.join(output_dir, csv_name))