In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
import re
import os
from tqdm.notebook import tqdm



In [2]:
# Remember the notebook's starting directory exactly once. Re-running this cell
# after the chdir below would otherwise record ".../data" as `cwd` and then try
# to enter ".../data/data" (and later build output paths under the wrong root).
if "cwd" not in globals():
    cwd = os.getcwd()

# The review HTML files are opened by bare filename, so work from data/.
os.chdir(cwd+"/data/")

In [3]:
# Paragraphs starting with one of these prefixes are site boilerplate.
_BOILERPLATE_PREFIXES = ("Photos:", "Discuss")

# Paragraphs containing any of these markers are site boilerplate.
_BOILERPLATE_MARKERS = (
    "www.",
    "To reserve tickets, please call",
    "Reservations:",
    "Reviews on this site are subject to",
    "Click to listen to",
    "This review also appears on",
    "{ijseo_redirect id=1}",
    "Buy this book on Amazon.com",
)

# Regex alternations of character names, one entry per play.
_CHARACTER_NAME_PATTERNS = (
    # Twelfth Night
    r'orsino|sebastian|antonio|valentine|andrew|cesario|curio|toby|malvolio|fabian|feste|olivia|viola|maria',
    # All's Well That Ends Well
    r'lafew|parolles|rinaldo|lavatch|helena|diana|mariana|violenta',
    # As You Like It
    r'amiens|jaques|charles|oliver|orlando|adam|dennis|touchstone|martext|corin|silvius|hymen|rosalind|celia|phebe|audrey',
    # Hamlet
    r'claudius|hamlet|polonius|horatio|laertes|voltemand|cornelius|rosencrantz|guildenstern|osric|marcellus|barnado|francisco|reynaldo|fortinbras|gertrude|ophelia',
    # Love's Labour's Lost
    r'ferdinand|berowne|longaville|dumaine|boyet|marcade|adriano|armado|nathaniel|holofernes|costard|forester|rosaline|maria|katherine|jaquenetta|blackamoors',
    # Macbeth
    r'duncan|malcolm|donalbain|macbeth|banquo|macduff|lennox|rosse|menteth|angus|cathness|fleance|siward|seyton|hecat',
    # A Midsummer Night's Dream
    r'theseus|egeus|lysander|demetrius|philostrate|quince|starveling|hippolyta|hermia|helena|oberon|titania|puck|peaseblossom|cobweb|mustardseed',
    # Romeo and Juliet
    r'escalus|paris|romeo|juliet|montague|capulet|mercutio|benvolio|tybalt|petruchio|lawrence|john|balthasar|abram|sampson|gregory|peter|anthony|potpan',
    # Othello
    r'brabantio|gratiano|lodovico|othello|cassio|lago|roderigo|montano|desdemona|emilia|bianca|iago',
    # Julius Caesar
    r'julius|caesar|octavius|mark|antony|lepidus|cicero|publius|popilius|brutus|cassius|casca|trebonius|caius|ligarius|decius|metellus|cimber|cinna|flavius|murellus|artemidorus|cnidos|soothsayer|lucilius|titinius|messala|volumnius|varrus|clitus|claudio|strato|lucius|dardanius|pindarus|calphurnia|cobbler|portia',
)

# Full pattern of terms excluded from the token table.
_FILTERED_WORDS = "|".join(
    ['|'.join(_CHARACTER_NAME_PATTERNS), "shakespeare|shakespeares|william|williams|midsummer"]
)


def _is_boilerplate(paragraph):
    """Return True when a paragraph matches any known boilerplate marker."""
    return paragraph.startswith(_BOILERPLATE_PREFIXES) or any(
        marker in paragraph for marker in _BOILERPLATE_MARKERS
    )


def _flatten(parts):
    """Join text parts with spaces and drop newlines, backslashes and slashes."""
    return " ".join(parts).replace("\n", "").replace("\\", "").replace("/", "").replace("  ", " ")


def parser(html_file):
    """Parse one review HTML file into a token table and a metadata row.

    Parameters
    ----------
    html_file : str
        Path of the review page, passed straight to ``open``.

    Returns
    -------
    content_df_token : pandas.DataFrame
        One row per retained token, indexed by (review_id, sent_id, token_id),
        with columns ``term_str`` and ``pos_tuple`` (the (token, tag) pair
        returned by ``nltk.pos_tag``). ``review_id`` is the page title.
    lib : pandas.DataFrame
        Single-row frame with columns "Original Work", "Review Title",
        "Review Author", "Content" (the lower-cased, punctuation-stripped text)
        and "Overall Rating" (a string).

    Raises
    ------
    IndexError
        If the page has no <title> or author <meta> tag, or the fallback
        lookup for the original work finds fewer than two <a> tags.
    ValueError
        If the last row of the ratings table contains no ``title=`` attribute.
    """
    with open(html_file) as html:
        soup = BeautifulSoup(html, 'html.parser')

    # The work under review comes from the theatre-details list when present,
    # otherwise from the second link on the page.
    details = soup.find_all("ul", id="theatre-details")
    if details:
        original_work = details[0].get_text().split("\n")[1]
    else:
        original_work = soup.find_all("a")[1].get_text()

    title = soup.find_all("title")[0].get_text()
    author = soup.find_all("meta", attrs={"name": "author"})[0]["content"]

    # The rating is read from the three characters following "title=" in the
    # last row of the ratings table, with punctuation stripped afterwards.
    last_rating_row = str(soup.find("table", id="ratings-stars")).split("<tr>")[-1]
    rating_start = last_rating_row.index("title=") + len("title=")
    overall_rating = re.sub(r'[^\w\s]', '', last_rating_row[rating_start:rating_start + 3])

    # Filter boilerplate in a single pass. Calling list.remove() once per
    # matching marker raised ValueError (or removed an identical legitimate
    # paragraph) whenever a paragraph matched two markers, e.g. "Photos: www...".
    # Paragraphs are already stripped, so none can start with a newline.
    paragraphs = [p.get_text().strip() for p in soup.find_all("p")]
    content = _flatten([text for text in paragraphs if not _is_boilerplate(text)])

    if content == "":
        # No usable <p> text: fall back to the alternative review containers,
        # taking the first one that exists on the page.
        for tag, css_class in (("span", "reviewstext"), ("div", "jrListingFulltext")):
            fallback = [node.get_text().strip() for node in soup.find_all(tag, {"class": css_class})]
            if fallback:
                content = _flatten(fallback)
                break

    # Delete quote-like punctuation, turn dash-like punctuation into spaces.
    content = re.sub(r',|"|”|“|\(|\)|’|‘|\#|\$|£|&', '', content)
    content = re.sub(r'\-|\―|\/|\+|…|\—|–', ' ', content)
    content = content.lower()

    # ------ Sentences: split on runs of sentence-ending punctuation.
    content_df_sent = (
        pd.DataFrame({"review_id": [title], 'content': content})['content']
        .str.split(r'[.!?:;]+', expand=True)
        .stack()
        .to_frame('sent_str')
    )
    content_df_sent['sent_str'] = content_df_sent['sent_str'].str.strip()
    content_df_sent = content_df_sent[~content_df_sent['sent_str'].str.match(r'^\s*$')]
    content_df_sent.index.names = ['review_id', 'sent_id']

    # ------ Tokens: split sentences on whitespace, commas and hyphens.
    content_df_token = (
        content_df_sent['sent_str']
        .str.split(r"[\s,-]", expand=True)
        .stack()
        .to_frame('term_str')
    )
    content_df_token['term_str'] = content_df_token['term_str'].str.strip()
    content_df_token = content_df_token[~content_df_token['term_str'].str.match(r'^\s*$')]
    # NOTE(review): str.contains is a substring match, so any token that merely
    # contains a listed name is dropped too (e.g. "remarkable" contains "mark",
    # "curious" contains "curio"). Confirm whether whole-token matching was intended.
    content_df_token = content_df_token[
        ~content_df_token['term_str'].str.contains(_FILTERED_WORDS, case=False, regex=True)
    ]
    content_df_token.index.names = ['review_id', 'sent_id', 'token_id']

    # The first index level so far is the positional row label; replace it with the title.
    content_df_token = content_df_token.reset_index()
    content_df_token["review_id"] = title
    content_df_token = content_df_token.set_index(['review_id', 'sent_id', 'token_id'])

    # All retained tokens of the review are tagged as one sequence.
    content_df_token['pos_tuple'] = nltk.pos_tag(content_df_token.term_str)

    # ------ LIB
    lib = pd.DataFrame({"Original Work": [original_work],
                        "Review Title": [title],
                        "Review Author": [author],
                        "Content": [content],
                        "Overall Rating": [overall_rating]})

    return content_df_token, lib

In [4]:
# List the review files explicitly instead of deriving a numeric range from the
# directory size: counting every entry makes the range depend on unrelated
# files in the folder and can silently skip the last review.
review_files = sorted(
    (name for name in os.listdir() if re.fullmatch(r"review\d+\.html", name)),
    key=lambda name: int(re.search(r"\d+", name).group()),  # numeric, not lexicographic, order
)

# Collect the per-review frames and concatenate once; concatenating inside the
# loop re-copies everything accumulated so far on every iteration.
corpus_parts = []
lib_parts = []

for filename in tqdm(review_files):
    corpus, lib = parser(filename)
    corpus_parts.append(corpus)
    lib_parts.append(lib)

CORPUS = pd.concat(corpus_parts) if corpus_parts else pd.DataFrame()
LIB = pd.concat(lib_parts) if lib_parts else pd.DataFrame()

# Keep only the tag from each (token, tag) pair.
CORPUS["pos"] = [tagged[1] for tagged in CORPUS["pos_tuple"]]

CORPUS.sample(10)

  0%|          | 0/292 [00:00<?, ?it/s]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,term_str,pos_tuple,pos
review_id,sent_id,token_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Never Anything Can Be Amiss,29,9,the,"(the, DT)",DT
Romeo & Juliet: A Requiem,0,66,these,"(these, DT)",DT
Richard Burton's Hamlet (1964),21,2,discovery,"(discovery, NN)",NN
Delaware Midsummer at Rodney Square,11,2,is,"(is, VBZ)",VBZ
Long-Awaited Pomp is a Proud Display of the Bard's Circumstance,5,54,of,"(of, IN)",IN
The Scottish King and the CIA,21,0,collin,"(collin, NN)",NN
RSC's Caesar Strikes Too Close To Home,41,19,era,"(era, NN)",NN
80s Throwback is a Dream,25,12,and,"(and, CC)",CC
Carolino's Midsummer Fantasies Require Some Shaping,10,12,to,"(to, TO)",TO
Always Loving the One You Can't Have,26,25,ponder,"(ponder, VB)",VB


In [6]:
# Works in this list are labelled "Tragedy"; every other work is labelled "Comedy".
tragedies = ['Hamlet', 'Macbeth', 'Romeo & Juliet', 'Othello', 'Julius Caesar']
is_tragedy = LIB["Original Work"].isin(tragedies)

# Ratings strictly above 3 count as positive; 3 and below count as negative.
is_positive = LIB["Overall Rating"].astype(int) > 3

LIB = LIB.assign(**{
    "Genre": np.where(is_tragedy, "Tragedy", "Comedy"),
    "Rating Category": np.where(is_positive, "Positive", "Negative"),
}).set_index(["Original Work", "Review Title"])
LIB.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Review Author,Content,Overall Rating,Genre,Rating Category
Original Work,Review Title,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Julius Caesar,CSF’s Julius Caesar Traditional But Still Relevant,Ginny Quaney,shakespeares julius caesar has been in the new...,4,Tragedy,Positive
As You Like It,"Drums, Cymbals, and Sight Gags Abound",Cynthia Greenwood,shakespeares as you like it is a loquacious pa...,4,Comedy,Positive
Hamlet,Classical Theatre Company's Hamlet Soars Under Careful Direction,Cynthia Greenwood,although the classical theatre company officia...,4,Tragedy,Positive
Julius Caesar,Julius Caesar Takes a Stab at the Shakespeare,Georgina Petronella,julius caesar is the first shakespeare play i ...,4,Tragedy,Positive
Hamlet,Students Tackle Hamlet with Heart,Georgina Petronella,hamlet is the tragedy of a young man; a studen...,3,Tragedy,Negative
All's Well That Ends Well,"A Nobleman, A Commoner and A Royal Baby: Art Imitates Life",Melissa Crismon,youre on a budget but want to enjoy the finer ...,5,Comedy,Positive
Othello,The Ills We Do: Race and Gender in ASP's Othello,Deirdre Yee,othello has always fascinated audiences becau...,3,Tragedy,Negative
Hamlet,Take Wing and Soar Productions' High-Spirited Hamlet,Elizabeth Bachner,theres always the danger that a bare bones low...,4,Tragedy,Positive
Midsummer Night's Dream,Off Center Midsummer Takes the Stage,Michael Meigs,despite their name chaotic theatre this produc...,3,Comedy,Negative
All's Well That Ends Well,As You Like It a Matter of Taste,Tanya Gough,kenneth branagh's as you like it set in meiji ...,2,Comedy,Negative


In [11]:
# Vocabulary table: one row per distinct term, sorted alphabetically.
VOCAB = CORPUS.term_str.value_counts().to_frame('n').sort_index()
VOCAB.index.name = 'term_str'
VOCAB['n_chars'] = VOCAB.index.str.len()
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()   # relative frequency of the term
VOCAB['i'] = -np.log2(VOCAB.p)         # negative log2 of the relative frequency

# Term x POS-tag count matrix, computed once (NaN where a term never gets a tag).
pos_counts = CORPUS[['term_str', 'pos']].value_counts().unstack()
VOCAB['max_pos'] = pos_counts.fillna(0).idxmax(axis=1)   # most frequent tag per term
VOCAB['n_pos'] = pos_counts.count(axis=1)                # number of distinct tags per term

# The set-valued `cat_pos` column is no longer built: it was dropped right
# after creation via drop('cat_pos', 1), whose positional `axis` argument
# pandas 2.0 no longer accepts.

# Stop-word flag: 1 when the term is in NLTK's English stop-word list, else 0.
sw = pd.DataFrame({'dummy': 1},
                  index=pd.Index(nltk.corpus.stopwords.words('english'), name='term_str'))
VOCAB['stop'] = VOCAB.index.isin(sw.index).astype('int')

# Stems from three stemmers. Only the index label is needed, so iterate the
# index directly instead of applying a function to every row.
stemmer1 = PorterStemmer()
VOCAB['stem_porter'] = [stemmer1.stem(term) for term in VOCAB.index]

stemmer2 = SnowballStemmer("english")
VOCAB['stem_snowball'] = [stemmer2.stem(term) for term in VOCAB.index]

stemmer3 = LancasterStemmer()
VOCAB['stem_lancaster'] = [stemmer3.stem(term) for term in VOCAB.index]

VOCAB.sort_values('p', ascending=False).head(10)

  VOCAB = VOCAB.drop('cat_pos', 1)


Unnamed: 0_level_0,n,n_chars,p,i,max_pos,n_pos,stop,stem_porter,stem_snowball,stem_lancaster
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
the,19213,3,0.06827,3.872596,DT,1,1,the,the,the
and,10397,3,0.036944,4.758511,CC,1,1,and,and,and
of,8942,2,0.031774,4.976009,IN,1,1,of,of,of
a,7691,1,0.027329,5.193435,DT,1,1,a,a,a
to,6429,2,0.022844,5.452012,TO,1,1,to,to,to
in,5407,2,0.019213,5.701778,IN,1,1,in,in,in
is,5271,2,0.01873,5.73853,VBZ,1,1,is,is,is
as,3562,2,0.012657,6.303919,IN,3,1,as,as,as
with,3319,4,0.011794,6.405858,IN,1,1,with,with,with
that,2479,4,0.008809,6.826848,IN,4,1,that,that,that


In [9]:
# Write the three tables under <cwd>/output/. The directory is created first
# because DataFrame.to_csv does not create missing parent directories.
output_dir = cwd+"/output/"
os.makedirs(output_dir, exist_ok=True)

for table_name, table in (("CORPUS", CORPUS), ("LIB", LIB), ("VOCAB", VOCAB)):
    table.to_csv(output_dir + table_name + ".csv")