In [1]:
%load_ext lab_black

In [2]:
import requests
import json
import os
import pathlib
import time
import bs4
import sqlite3
import newspaper
import spacy

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import clear_output
from urllib3.exceptions import LocationParseError
from multiprocessing import Pool
from math import floor, ceil
from fuzzywuzzy import fuzz, process
from subreddit_sql import create_connection

The first two steps in data cleaning will be determining if the content was submitted to the correct subreddit, and if that content has been submitted before. To handle the first problem, we'll determine if the domain of the article is a known satire website or not. The website [realorsatire.com](https://realorsatire.com/) maintains a curated list of websites which publish satirical articles. This website will be scraped and a list of domains will be formed. Anything in this list will be allowed in the r/TheOnion table, and anything not present will be considered appropriate for r/nottheonion.

In [3]:
def convertable_to_int(string):
    """Report whether ``int(string)`` succeeds without raising ``ValueError``.

    Returns ``True`` when the conversion works and ``False`` when it raises
    ``ValueError``. Any other exception raised by ``int`` propagates.
    """
    try:
        int(string)
    except ValueError:
        return False
    else:
        return True


def create_satire_list(timeout=30):
    """Scrape realorsatire.com and return the lower-cased names of its satire listings.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    list of str
        The text of every ``<a rel="bookmark">`` link found on the listing
        pages, stripped of surrounding whitespace and lower-cased. Empty
        entries are skipped, because an empty name would later turn into a
        ``LIKE '%%'`` pattern that matches every domain.

    Raises
    ------
    requests.HTTPError
        If any listing page answers with an HTTP error status.
    ValueError
        If no integer can be found in the title of page 2, so the number of
        pages cannot be determined.
    """
    source = "https://realorsatire.com/websites-that-are/satire/page/"

    def fetch_soup(page):
        # Fail loudly on HTTP errors instead of silently parsing an error page.
        response = requests.get(source + f"{page}/", timeout=timeout)
        response.raise_for_status()
        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        return bs4.BeautifulSoup(response.content, "html.parser")

    # The page count is taken from the largest integer in the <title> of
    # page 2 (presumably something like "Page 2 of N" -- verify against the site).
    title_tag = fetch_soup(2).find("title")
    words = title_tag.get_text().split() if title_tag is not None else []
    page_numbers = [int(word) for word in words if convertable_to_int(word)]
    if not page_numbers:
        raise ValueError(
            "Could not determine the number of listing pages from the page title"
        )
    num_pages = max(page_numbers)

    satire_domains = []
    for n in range(1, num_pages + 1):
        soup = fetch_soup(n)
        for tb in soup.find_all("a", rel="bookmark"):
            domain = tb.get_text().strip().lower()
            if domain:
                satire_domains.append(domain)

    return satire_domains


# Scrape the satire-site names once; both SQL queries further down reuse this list.
satire_list = create_satire_list()

From the r/TheOnion table, all posts with article content and appropriate domains will be fetched.

In [4]:
def pull_onion(conn, domains):
    """Return r/TheOnion rows that have article text and an accepted domain.

    Parameters
    ----------
    conn
        Open database connection exposing ``cursor()`` (a sqlite3 connection).
    domains : iterable of str
        Accepted domain fragments. A row qualifies when its ``domain`` column
        contains at least one of them (SQL ``LIKE '%fragment%'``).

    Returns
    -------
    list of tuple
        ``(title, article_text, score)`` rows sorted by ``score`` descending.
        An empty ``domains`` yields an empty list, since nothing is accepted.
    """
    domains = list(domains)
    if not domains:
        # No accepted domains means no row can qualify; also avoids emitting
        # the invalid SQL fragment "()".
        return []

    # The domain names come from a scraped web page, so they are bound as
    # parameters rather than interpolated: a stray quote can neither break
    # nor inject into the statement.
    # NOTE: SQLite caps the number of bound parameters per statement
    # (999 by default before version 3.32).
    domain_filter = " OR ".join("domain LIKE ?" for _ in domains)
    sql_cmd = f"""SELECT
                      title,
                      article_text,
                      score
                  FROM theonion
                  WHERE article_text IS NOT NULL AND
                  ({domain_filter})
                  ORDER BY score DESC;"""
    params = [f"%{d}%" for d in domains]

    c = conn.cursor()
    c.execute(sql_cmd, params)
    rows = c.fetchall()

    return rows


# Open the Reddit database; the same connection is reused for the
# r/nottheonion query later in the notebook.
conn = create_connection("../data/reddit.db")

# Load the qualifying r/TheOnion posts into a frame and summarise it.
onion_rows = pull_onion(conn, satire_list)
onion_df = pd.DataFrame(onion_rows, columns=["title", "article_text", "score"])

onion_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12946 entries, 0 to 12945
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         12946 non-null  object 
 1   article_text  12946 non-null  object 
 2   score         12824 non-null  float64
dtypes: float64(1), object(2)
memory usage: 303.5+ KB


From r/nottheonion, the same criteria need to be met, but due to the sheer number of posts we will limit it to only fetching twice the number of posts that are retrieved from r/TheOnion. This will ensure that after data cleaning a sufficient number of posts are present in both. Furthermore, by sorting the posts by their score, we help ensure that the subset of posts we do fetch are the "best" representation of r/nottheonion.

In [5]:
def pull_not_onion(conn, domains, limit):
    """Return the top-scoring r/nottheonion rows whose domain is not excluded.

    Parameters
    ----------
    conn
        Open database connection exposing ``cursor()`` (a sqlite3 connection).
    domains : iterable of str
        Domain fragments to exclude. A row is rejected when its ``domain``
        column contains any of them (SQL ``LIKE '%fragment%'``).
    limit : int
        Maximum number of rows to return.

    Returns
    -------
    list of tuple
        ``(title, article_text, score)`` rows sorted by ``score`` descending.
        With an empty ``domains`` no domain filter is applied.
    """
    domains = list(domains)

    # Build one flat AND chain. With no exclusions only the article_text test
    # remains, which avoids emitting the invalid SQL fragment "()".
    conditions = ["article_text IS NOT NULL"]
    conditions.extend("domain NOT LIKE ?" for _ in domains)
    where_clause = " AND ".join(conditions)

    # Scraped domain names and the limit are bound as parameters rather than
    # interpolated, so a stray quote can neither break nor inject into the SQL.
    # NOTE: SQLite caps the number of bound parameters per statement
    # (999 by default before version 3.32).
    sql_cmd = f"""SELECT
                      title,
                      article_text,
                      score
                  FROM nottheonion
                  WHERE {where_clause}
                  ORDER BY score DESC
                  LIMIT ?;"""
    params = [f"%{d}%" for d in domains]
    params.append(int(limit))

    c = conn.cursor()
    c.execute(sql_cmd, params)
    rows = c.fetchall()

    return rows


# Fetch twice as many r/nottheonion rows as were loaded from r/TheOnion.
not_onion_limit = 2 * len(onion_df)
not_onion_rows = pull_not_onion(conn, satire_list, not_onion_limit)
not_onion_df = pd.DataFrame(
    not_onion_rows, columns=["title", "article_text", "score"]
)

not_onion_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25892 entries, 0 to 25891
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         25892 non-null  object
 1   article_text  25892 non-null  object
 2   score         25892 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 607.0+ KB


Actually removing duplicate posts is a tricky process. pandas provides a `drop_duplicates()` method; however, it only catches exact matches and does not take into account typos or titles with slightly changed wording. Therefore, I chose to use fuzzy matching to cross-check every title against every other title. This process is slow, but thorough. Using the library fuzzywuzzy, each title pair is given a score from 0 to 100, with 100 indicating identical titles. By setting a threshold of 98, posts whose titles are very similar will be removed, but this will still leave us with sufficient data to work with.

In [6]:
# Minimum fuzzy-match score (on fuzzywuzzy's 0-100 scale) at which two titles
# are treated as duplicates of each other.
# To help ensure only true duplicates are removed, a very high threshold is used.
FUZZY_THRESHOLD = 98


def remove_duplicate(df, column, threshold):
    """Drop rows whose ``column`` value fuzzily duplicates an earlier row's value.

    Rows are visited in order. The first row of each group of near-identical
    values is kept, and every other row scoring at least ``threshold`` against
    it is dropped. A row that has been kept is never dropped later.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to de-duplicate. Its index labels are assumed to be unique.
    column : str
        Name of the text column to compare.
    threshold : int
        Minimum fuzzywuzzy score (0-100) for two values to count as duplicates.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without the duplicate rows and with a fresh 0..n-1 index.
    """
    values = df[column]
    total = len(values)

    kept = set()
    dropped = set()

    # Iterate over index labels, not positions: fuzzywuzzy reports matches by
    # label for dict-like choices, and DataFrame.drop also works on labels.
    for n, (label, value) in enumerate(values.items()):
        clear_output(wait=True)

        # A row already marked as a duplicate needs no search of its own; its
        # matches were never used to drop further rows.
        if label not in dropped:
            kept.add(label)
            # limit=None returns every match at or above the cutoff.
            # process.extract defaults to limit=5, so it only ever saw the
            # five best matches and missed larger groups of duplicates.
            matches = process.extractBests(
                value, values, score_cutoff=threshold, limit=None
            )
            for _, _, match_label in matches:
                # Skips the row itself and any row already chosen as a keeper.
                if match_label not in kept:
                    dropped.add(match_label)

        print(f"Values processed: {n + 1}/{total}")

    df = df.drop(index=list(dropped)).reset_index(drop=True)

    return df

The steps taken in this notebook will help ensure a few things. Firstly, no title or article text is null, and thus there is no text to impute (only the `score` column of the r/TheOnion data has some missing values). The corpus is sufficiently large that excluding posts without article text is an appropriate measure. Second, there is a high degree of confidence that all articles are unique, and therefore the data will have good variety for the model to learn. Lastly, by selecting a roughly equal number of posts from each subreddit (and further subsetting the data in the next notebook), a balanced corpus will be created.

Finally, the data with duplicates removed will be saved to files.

In [7]:
# Output directory for the de-duplicated data; created if it does not exist yet.
no_dupes_path = pathlib.Path("../data/duplicates_removed")
no_dupes_path.mkdir(exist_ok=True, parents=True)

onion_df = remove_duplicate(onion_df, "title", FUZZY_THRESHOLD)
# Build the file path from no_dupes_path so the directory that is created and
# the directory that is written to cannot drift apart.
onion_df.to_csv(no_dupes_path / "theonion.csv", index=False)

Values processed: 12946/12946


In [8]:
not_onion_df = remove_duplicate(not_onion_df, "title", FUZZY_THRESHOLD)
# Write next to the r/TheOnion file, reusing the output directory defined in
# the previous cell instead of repeating the path as a string.
not_onion_df.to_csv(no_dupes_path / "nottheonion.csv", index=False)

Values processed: 25892/25892
