# Webscraping and analyzing reviews on comics for readcomiconline.to

In [107]:
# GENERAL
import re
import json
import numpy as np
import pandas as pd

# WEBSCRAPING
import requests
import bs4 as bs

# SELENIUM
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait

# NLP
import spacy
from textblob import TextBlob
from collections import Counter
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# GRAPHICS
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# LOAD NLP MODEL
# nlp = spacy.load('en_core_web_lg')

# 0. Webscrape results, export to dataframe

## 0.0 Functions

In [50]:
def get_comment_soup(driver, url, timeout=30):
    """
    Fetches the Disqus comments page embedded in a comic page and parses it.

    Loads `url` in the Selenium `driver`, reads the "src" attribute of the
    iframe matched by XPath "//iframe[2]" (where the Disqus comments are
    embedded), downloads that address with `requests` and parses the
    response with BeautifulSoup's 'html.parser'.

    driver:  Selenium WebDriver. Its current window is ALWAYS closed before
             this function returns or raises, so it cannot be reused.
    url:     address of the comic issue page.
    timeout: seconds to wait for the Disqus HTTP response (default 30), so a
             stalled connection cannot hang the scrape forever.

    Returns bs4 soup file for Disqus comments page.
    Raises whatever Selenium / requests raise when the page, the iframe or
    the Disqus page cannot be loaded.
    """
    try:
        # implicitly_wait sets how long find_element_* keeps polling for an
        # element that is not in the DOM yet; it does not pause by itself.
        driver.implicitly_wait(5)
        driver.get(url)

        iframe = driver.find_element_by_xpath("//iframe[2]") # Disqus comments are in XPATH "//iframe[2]"
        iframe_url = iframe.get_attribute('src') # "src" is link to Disqus comments
    finally:
        # Close the webpage even when loading or the iframe lookup fails,
        # otherwise every failed scrape leaves a browser window open.
        driver.close()

    # Instantiate BS, create soup for Disqus url
    response = requests.get(iframe_url, timeout=timeout)
    soup = bs.BeautifulSoup(response.text, 'html.parser')

    return soup

In [51]:
# GET COMMENTS FROM SOUP FILE
def soup_to_json(soup):
    """
    Extracts the posts of a Disqus comments page from its bs4 soup file.

    The page stores its thread data as JSON inside
    <script id="disqus-threadData">. The tag's content is read directly
    instead of slicing the serialised tag by character offsets, so the result
    does not depend on attribute order or on the exact closing-tag length.

    Returns json_data, a list of dicts (the 'posts' list under 'response').
    Raises ValueError if the script tag is missing or has no text content
    (json.JSONDecodeError, a ValueError subclass, if the content is not JSON)
    and KeyError if the JSON lacks 'response' / 'posts'.
    """
    tag = soup.find("script", {"id" : "disqus-threadData"})
    if tag is None:
        raise ValueError('no <script id="disqus-threadData"> element found')

    payload = tag.string # text between the opening and closing script tags
    if payload is None:
        raise ValueError('<script id="disqus-threadData"> has no text content')

    thread_data = json.loads(payload) # string to json
    json_data = thread_data['response']['posts']

    return json_data

In [65]:
def get_messages(df, url, json_data):
    """
    Appends every post in json_data (plus metadata) to df.

    df:        DataFrame holding at least the columns
               ['Webpage','Message', 'Author', 'Likes', 'Dislikes', 'NumReports'].
               Extra columns (e.g. sentiment scores) are allowed; they are
               left empty (NaN) for the newly added rows.
    url:       address of the scraped issue. Everything after the first 33
               characters ("https://readcomiconline.to/Comic/") and before any
               "?" query string is stored as the 'Webpage' value.
    json_data: list of Disqus post dicts as returned by soup_to_json
               (50 max per page according to the original author's note; this
               function itself does not enforce a limit).

    Returns the updated df (a new DataFrame; the input is returned unchanged
    when json_data is empty).
    """
    # set webpage (comic title/issue); partition() is a no-op without a "?"
    webpage = url[33:].partition('?')[0]

    tag_pattern = re.compile(r'<[^>]+>') # compiled once, not once per message

    rows = []
    for post in json_data:
        message = tag_pattern.sub('', post['message']) # remove all html tags

        author_info = post['author']
        try:
            author = author_info['username'] # if the author has a username
        except KeyError:
            author = author_info['name'] # if the author is a guest

        rows.append({
            'Webpage': webpage,
            'Message': message,
            'Author': author,
            'Likes': post['likes'],
            'Dislikes': post['dislikes'],
            'NumReports': post['numReports'],
        })

    if not rows:
        return df

    # One concat for the whole page: DataFrame.append was removed in pandas 2.0
    # and copied the entire frame for every single row.
    new_rows = pd.DataFrame(
        rows,
        columns=['Webpage', 'Message', 'Author', 'Likes', 'Dislikes', 'NumReports'],
    )
    return pd.concat([df, new_rows], ignore_index=True)

## 0.1 Webscrape comments for one issue

In [66]:
def scrape_issue(url, df=None,
                 driver_path='/Users/abgrss/Documents/Projects/Brain Station/00 Capstone project/operadriver_mac64/operadriver',
                 csv_path='comic_comments/comic_comments.csv'):
    """
    Scrapes issue for a single url (e.g. "https://readcomiconline.to/Comic/The-Wild-Storm/Issue-4")

    url:         address of the issue page.
    df:          optional DataFrame with some issues already scraped; a new
                 empty one is created when omitted.
    driver_path: location of the operadriver executable.
    csv_path:    file the DataFrame is saved to after a successful scrape.

    Returns a df with comments for single scraped issue appended. If the
    issue is already present in df['Webpage'], or scraping fails, df is
    returned unchanged (a failure is reported on stdout, not raised).
    """
    if df is None:
        df = pd.DataFrame(columns=['Webpage','Message', 'Author', 'Likes', 'Dislikes', 'NumReports'])
        print("new df created")

    else:
        print("df loaded")

    # Same key get_messages stores in 'Webpage': path after the site prefix,
    # without any "?..." query string.
    webpage = url[33:].partition('?')[0]

    if webpage in df['Webpage'].values: # The webpage has already been scraped
        return df # nothing to do, so no browser is launched at all

    driver = None
    # RUN ALL FUNCTIONS
    try:
        print(f"Now scraping: {url[33:]}")
        driver = webdriver.Opera(executable_path=driver_path)
        soup = get_comment_soup(driver, url)
        json_data = soup_to_json(soup)
        df = get_messages(df, url, json_data)
        df.to_csv(csv_path) # SAVE AFTER EVERY SUCCESSFUL ISSUE SCRAPE
    # IF WEBSCRAPING FAILS
    except Exception as err: # not a bare except: Ctrl-C must still interrupt
        print(f"SCRAPING FAILED for {url}")
        print(f"  reason: {type(err).__name__}: {err}")
    finally:
        # End the driver session even when scraping failed part-way, so no
        # browser / operadriver process is left behind.
        if driver is not None:
            driver.quit()

    return df

### 0.1.1 Single Issue: "The Wild Storm", Issue 4

In [14]:
# SAMPLE SINGLE-ISSUE SCRAPING
# No df is passed, so scrape_issue starts from a new, empty DataFrame.
url = "https://readcomiconline.to/Comic/The-Wild-Storm/Issue-4"
df_wild_issue = scrape_issue(url)
# Last expression of the cell: displays the scraped comments
df_wild_issue

new df created
Now scraping: The-Wild-Storm/Issue-4


Unnamed: 0,Webpage,Message,Author,Likes,Dislikes,NumReports
0,The-Wild-Storm/Issue-4,This is great writing. I don't like all the ch...,matejsojka,6,0,0
1,The-Wild-Storm/Issue-4,,Guest,5,0,0
2,The-Wild-Storm/Issue-4,shame it's not yet confirmed to be part of the...,kareematta,0,0,0
3,The-Wild-Storm/Issue-4,,Guest,3,0,0
4,The-Wild-Storm/Issue-4,"Some of the best art in comics right now, I ge...",disqus_2fQDYhg09f,2,0,0
5,The-Wild-Storm/Issue-4,"Oh, the plot thickens. But the game is not afo...",MotherOfCreation,1,0,0
6,The-Wild-Storm/Issue-4,cant wait for next issue. hoping for more acti...,NANANANANANAN_batman,1,0,0
7,The-Wild-Storm/Issue-4,Just realized Kenesha (Savant) is now black. S...,vadimfv,3,4,0
8,The-Wild-Storm/Issue-4,It's more interesting this way. Things get to...,jasonhughnon,5,2,0
9,The-Wild-Storm/Issue-4,I agree with what stan lee said regarding this...,LEGENDOFLEGAIA,1,0,0


## 0.2 Webscrape comic series

In [20]:
def scrape_series(series_title, series_length, start_issue=1, df=None, prev_fails=0,
                  driver_path='/Users/abgrss/Documents/Projects/Brain Station/00 Capstone project/operadriver_mac64/operadriver',
                  csv_path='comic_comments/comic_comments.csv'):
    """
    Scrapes the comments of issues start_issue..series_length of one series.

    series_title:  root page directory WITH trailing slash
                   (e.g. https://readcomiconline.to/Comic/The-Wild-Storm/);
                   issue pages are addressed as series_title + "Issue-<n>".
    series_length: number of the last issue to scrape (inclusive).
    start_issue:   number of the first issue to scrape (default 1).
    df:            optional DataFrame with some issues already scraped; issues
                   whose name is already in df['Webpage'] are skipped.
    prev_fails:    number of failures in the previous pass. Used by the retry
                   recursion only; leave at 0 when calling directly.
    driver_path:   location of the operadriver executable.
    csv_path:      file the DataFrame is saved to after every scraped issue.

    If some issues fail, the whole range is retried (already scraped issues
    are skipped) until a pass produces the same number of failures as the
    pass before it.

    Returns a df with comments for scraped issues
    """
    fails = 0
    # INITIATE DF
    if df is None:
        df = pd.DataFrame(columns=['Webpage','Message', 'Author', 'Likes', 'Dislikes', 'NumReports'])
        print("new df created")

    else:
        print("df loaded")

    ### ADD CODE: GET LENGTH FROM PAGE ###

    # FOR EVERY ISSUE
    for issue in range(start_issue, series_length + 1):
        url = f"{series_title}Issue-{issue}"
        webpage = url[33:] # path after "https://readcomiconline.to/Comic/"
        if webpage in df['Webpage'].values: # The webpage has already been scraped
            continue

        # RUN ALL WEBSCRAPING FUNCTIONS
        # A fresh driver per issue: get_comment_soup closes the window it used.
        driver = None
        try:
            print(f"Now scraping: {webpage}")
            driver = webdriver.Opera(executable_path=driver_path)
            soup = get_comment_soup(driver, url)
            json_data = soup_to_json(soup)
            df = get_messages(df, url, json_data)
            df.to_csv(csv_path) # SAVE AFTER EVERY SUCCESSFUL ISSUE SCRAPE
        # IF WEBSCRAPING FAILS
        except Exception as err: # not a bare except: Ctrl-C must still interrupt
            print(f"SCRAPING FAILED for {url}")
            print(f"  reason: {type(err).__name__}: {err}")
            fails += 1
        finally:
            # End the driver session even when this issue failed, so no
            # browser / operadriver process is left behind.
            if driver is not None:
                driver.quit()

    # IF ANY SCRAPING FAILS
    if fails > 0:
        if fails != prev_fails: #IF FAILS NOT THE SAME AS BEFORE, RUNS FUNCTION RECURSIVELY
            # Retry the SAME range: the caller's start_issue must be kept,
            # otherwise the retry would also scrape issues before it.
            df = scrape_series(series_title, series_length, start_issue=start_issue,
                               df=df, prev_fails=fails,
                               driver_path=driver_path, csv_path=csv_path)
        else:
            print(f"TOTAL SCRAPE FAILS: {fails}")

    return df

### 0.2.1 "Justice League" 2018

In [6]:
# SCRAPE SERIES (JUSTICE LEAGUE 2018)
# Issues 1..series_length are requested as series_title + "Issue-<n>"
series_length=23
series_title = "https://readcomiconline.to/Comic/Justice-League-2018/"
df_justice = scrape_series(series_title, series_length)
# Keep a per-series copy next to the running save scrape_series makes itself
df_justice.to_csv('comic_comments/justice_league_2018_comments.csv')

new df created
Now scraping: Justice-League-2018/Issue-1
Now scraping: Justice-League-2018/Issue-2
Now scraping: Justice-League-2018/Issue-3
Now scraping: Justice-League-2018/Issue-4
Now scraping: Justice-League-2018/Issue-5
Now scraping: Justice-League-2018/Issue-6
Now scraping: Justice-League-2018/Issue-7
Now scraping: Justice-League-2018/Issue-8
Now scraping: Justice-League-2018/Issue-9
Now scraping: Justice-League-2018/Issue-10
Now scraping: Justice-League-2018/Issue-11
Now scraping: Justice-League-2018/Issue-12
SCRAPING FAILED for https://readcomiconline.to/Comic/Justice-League-2018/Issue-12
Now scraping: Justice-League-2018/Issue-13
Now scraping: Justice-League-2018/Issue-14
Now scraping: Justice-League-2018/Issue-15
Now scraping: Justice-League-2018/Issue-16
Now scraping: Justice-League-2018/Issue-17
Now scraping: Justice-League-2018/Issue-18
Now scraping: Justice-League-2018/Issue-19
Now scraping: Justice-League-2018/Issue-20
Now scraping: Justice-League-2018/Issue-21
Now scrap

In [8]:
# Show the last rows that were scraped for the series
df_justice.tail()

Unnamed: 0,Webpage,Message,Author,Likes,Dislikes,NumReports
1137,Justice-League-2018/Issue-12,"i personally love how Poseidon was written, ve...",disqus_L3EKnoSckT,0,0,0
1138,Justice-League-2018/Issue-12,Another Jason Momoa cover as Aquaman. I like h...,matejsojka,0,0,0
1139,Justice-League-2018/Issue-12,Loved the art in this,MrRootbeer94,0,0,0
1140,Justice-League-2018/Issue-12,BTW how did Manta and/or the Triumvirate knew ...,disqus_rtgXlikKh3,0,0,0
1141,Justice-League-2018/Issue-12,"Luthor told Manta most likely, seeing as hes t...",abrahamgeoffdusk,0,0,0


### 0.2.2 "The Wild Storm"

In [26]:
# SCRAPE SERIES (THE WILD STORM)
# Issues 1..series_length are requested as series_title + "Issue-<n>"
series_length=22
series_title = "https://readcomiconline.to/Comic/The-Wild-Storm/"
df_wild_storm = scrape_series(series_title, series_length)
# Keep a per-series copy next to the running save scrape_series makes itself
df_wild_storm.to_csv('comic_comments/the_wild_storm_comments.csv')

new df created
Now scraping: The-Wild-Storm/Issue-1
Now scraping: The-Wild-Storm/Issue-2
Now scraping: The-Wild-Storm/Issue-3
Now scraping: The-Wild-Storm/Issue-4
SCRAPING FAILED for https://readcomiconline.to/Comic/The-Wild-Storm/Issue-4
Now scraping: The-Wild-Storm/Issue-5
Now scraping: The-Wild-Storm/Issue-6
Now scraping: The-Wild-Storm/Issue-7
Now scraping: The-Wild-Storm/Issue-8
Now scraping: The-Wild-Storm/Issue-9
Now scraping: The-Wild-Storm/Issue-10
Now scraping: The-Wild-Storm/Issue-11
Now scraping: The-Wild-Storm/Issue-12
Now scraping: The-Wild-Storm/Issue-13
Now scraping: The-Wild-Storm/Issue-14
Now scraping: The-Wild-Storm/Issue-15
Now scraping: The-Wild-Storm/Issue-16
Now scraping: The-Wild-Storm/Issue-17
Now scraping: The-Wild-Storm/Issue-18
Now scraping: The-Wild-Storm/Issue-19
Now scraping: The-Wild-Storm/Issue-20
Now scraping: The-Wild-Storm/Issue-21
Now scraping: The-Wild-Storm/Issue-22
df loaded
Now scraping: The-Wild-Storm/Issue-4


In [71]:
# Show the last rows that were scraped for the series
df_wild_storm.tail()

Unnamed: 0,Webpage,Message,Author,Likes,Dislikes,NumReports
466,The-Wild-Storm/Issue-4,I agree with what stan lee said regarding this...,LEGENDOFLEGAIA,1,0,0
467,The-Wild-Storm/Issue-4,,Guest,2,0,0
468,The-Wild-Storm/Issue-4,He literally wrote sparks dood lol sparks was ...,LEGENDOFLEGAIA,0,1,0
469,The-Wild-Storm/Issue-4,I've read all the issues so far and am pretty ...,LEGENDOFLEGAIA,0,0,0
470,The-Wild-Storm/Issue-4,WOW... just wow. I can't get enough of this. ...,disqus_IXFawpGLcU,0,0,0


### 0.2.3 "Batman" 2016

In [None]:
# SCRAPE LONGER (BATMAN 2016)
# NOTE(review): unlike the other series cells, the result is not written to a
# per-series CSV here; scrape_series only saves to
# 'comic_comments/comic_comments.csv'. Confirm how
# 'comic_comments/batman_2016_comments.csv' (read further down) is produced.
series_length = 70
series_title = "https://readcomiconline.to/Comic/Batman-2016/"
df_batman = scrape_series(series_title, series_length)

In [None]:
# SINGLE ISSUE SCRAPING FOR MISSED ISSUE
# Batman 2016 Issue 32 is actually "Batman-2016/Issue-32-2"

url = "https://readcomiconline.to/Comic/Batman-2016/Issue-32-2"
# Reload the saved Batman comments; 'Unnamed: 0' is the index column written by to_csv.
# NOTE(review): the file is read as ISO-8859-1 but to_csv below writes with
# pandas' default encoding (UTF-8) - non-ASCII comment text may get
# re-encoded on every round trip; confirm the intended encoding.
df_batman = pd.read_csv('comic_comments/batman_2016_comments.csv', index_col='Unnamed: 0', encoding = "ISO-8859-1")
# Append the missed issue to the loaded comments, then overwrite the file
df_batman = scrape_issue(url, df=df_batman)
df_batman.to_csv('comic_comments/batman_2016_comments.csv')

# 1. NLP

Let's use both TextBlob and VADER to analyze the sentiment of each comment.

## 1.0 Functions

In [96]:
def add_nlp_columns(df):
    """
    Makes sure df has the six sentiment columns, modifying it in place.

    When df has no "Compound" column yet, the columns 'Polarity',
    'Subjectivity', 'Positive', 'Negative', 'Neutral' and 'Compound' are
    created (in that order) and filled with None. When "Compound" is already
    present, df is left untouched.

    Returns None
    """
    # "Compound" is the marker: if it exists the columns were added before
    if "Compound" in df.columns:
        return None

    nlp_columns = ('Polarity', 'Subjectivity',               # TextBlob scores
                   'Positive', 'Negative', 'Neutral', 'Compound')  # Vader scores
    for column_name in nlp_columns:
        df[column_name] = None

    return None

In [103]:
def sentiment_analysis(csv_file_path, encoding="ISO-8859-1"):
    """
    Scores every comment of a webscraped csv with TextBlob and Vader.

    csv_file_path: csv of comments (comments in "Message" column, index in
                   the 'Unnamed: 0' column). The file is OVERWRITTEN with the
                   scored data.
    encoding:      codec used for both reading and writing the file
                   (default "ISO-8859-1", as before). Using the same codec in
                   both directions keeps text from being re-encoded each time
                   the file is processed.

    Each message is converted with str() before scoring, so an empty (NaN)
    message is scored as the text "nan".

    Returns an updated df with TextBlob columns ('Polarity', 'Subjectivity')
    and Vader columns ('Positive', 'Negative', 'Neutral', 'Compound') for
    each comment, all rounded to 3 decimals.
    """
    # LOAD COMMENTS DF
    df = pd.read_csv(csv_file_path, index_col='Unnamed: 0', encoding=encoding)

    # ADD SENTIMENT COLUMNS
    add_nlp_columns(df)

    # Instantiate Vader analyzer
    analyzer = SentimentIntensityAnalyzer()

    column_names = ('Polarity', 'Subjectivity', 'Positive', 'Negative', 'Neutral', 'Compound')
    scores = {name: [] for name in column_names}

    for message in df['Message']:
        text = str(message) # both analyzers take a string

        # TextBlob SCORES
        polarity, subjectivity = TextBlob(text).sentiment
        scores['Polarity'].append(round(polarity, 3))
        scores['Subjectivity'].append(round(subjectivity, 3))

        # Vader SCORES (computed once per comment, then unpacked)
        vader = analyzer.polarity_scores(text)
        scores['Positive'].append(round(vader['pos'], 3))
        scores['Negative'].append(round(vader['neg'], 3))
        scores['Neutral'].append(round(vader['neu'], 3))
        scores['Compound'].append(round(vader['compound'], 3))

    # Assign whole columns at once. The former df[col].iloc[i] = value is
    # chained assignment: it may write to a temporary copy and is a no-op
    # under pandas Copy-on-Write.
    for name in column_names:
        df[name] = scores[name]

    # WRITE TO FILE
    df.to_csv(csv_file_path, encoding=encoding)

    return df

## 1.1 Sample sentiment analysis

Let's apply sentiment scores to comments for a scraped comic series

In [106]:
# Scraped Batman comments; sentiment_analysis rewrites this file with the scores added
csv_file_path = "comic_comments/batman_2016_comments.csv"

df_comments = sentiment_analysis(csv_file_path)

# Preview the first rows, including the new sentiment columns
df_comments.head()

Unnamed: 0,Webpage,Message,Author,Likes,Dislikes,NumReports,Polarity,Subjectivity,Positive,Negative,Neutral,Compound
0,Batman-2016/Issue-1,"""Would... would they have -- Mother and Father...",disqus_MNI1UQuqIh,78,1,1,0.15,0.711,0.167,0.0,0.833,0.612
1,Batman-2016/Issue-1,dude i almost cried.,GwenpoolLove,14,0,0,0.0,0.0,0.0,0.536,0.464,-0.32
2,Batman-2016/Issue-1,I was rolling my eyes at that.,Anette_Halbestunde,13,5,0,0.0,0.0,0.0,0.0,1.0,0.0
3,Batman-2016/Issue-1,Me too. It's pretty fucking obvious they would...,disqus_h1pDVQqybe,10,1,0,0.388,0.75,0.194,0.098,0.708,0.477
4,Batman-2016/Issue-1,Holy shit! This was epic. I love the new B :TA...,aadityaphadnis,24,0,0,0.122,0.564,0.379,0.177,0.444,0.624


In [None]:
# PLOT SENTIMENTS
