In [1]:
# Imports
import re
import string

import numpy as np
import pandas as pd
from yarl import URL
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from markdown import markdown
from bs4 import BeautifulSoup

import scipy.sparse as sp
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.utils.validation import check_is_fitted
from sklearn.feature_extraction.text import _document_frequency

from dask_ml.model_selection import train_test_split
from dask_ml.xgboost import XGBClassifier
import nltk

import dask as d
import dask.dataframe as dd
from dask.distributed import Client

from markdown import Markdown
from io import StringIO

np.random.seed(10)

In [2]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adithyabalaji/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
client = Client()

[Dask Dashboard](localhost:8787)

# Load / Clean Data

## xkcd dataset

First we load in the xkcd dataset from https://www.explainxkcd.com

This dataset has 2388 xkcd comics run on (22 November 2020)

Each row has the following features:

* **xkcd**: The link to the official xkcd comic URL
* **xkcd_num**: The extracted comic number from the URL
* **Title**: The link to the Explain XKCD wiki page for that comic
* **Image**: Link to a backup hosted image of the XKCD comic
* **Date**: The original date of publication of the comic
* **TitleText**: Title of the comic
* **Explanation**: A community explanation of the comic deciphering the sometimes pithy
or cryptic humor
* **Transcript**: If the comic has characters speaking, this section has the text of the
comic.

In [5]:
# Process explain xkcd data
links_df = dd.read_parquet("./data/xkcd/links_df.parquet") # .set_index("Title")
# There is a bug in the data collection which is caused by this surprise:
# https://www.explainxkcd.com/wiki/index.php/Disappearing_Sunday_Update
# its a comic with the same id which he speculates will break automated system. Sure
# broke mine!
links_df = links_df[links_df["TitleText"] != "Disappearing Sunday Update"].set_index("Title")
pages_df = dd.read_parquet("./data/xkcd/pages_df.parquet", blocksize=None) # .set_index("Title")
pages_df = pages_df.drop_duplicates()
xkcd_df = dd.merge(links_df, pages_df, how='left', on="Title")
xkcd_df["xkcd_num"] = xkcd_df["xkcd"].apply(
    lambda url: int(URL(url).path.replace("/", "")), meta='str'
)
print(xkcd_df.columns)
CURR_MAX_COMIC = xkcd_df["xkcd_num"].max().compute()

Index(['Title', 'xkcd', 'Image', 'Date', 'TitleText', 'Explanation',
       'Transcript', 'xkcd_num'],
      dtype='object')


In [6]:
xkcd_df.head()

Unnamed: 0,Title,xkcd,Image,Date,TitleText,Explanation,Transcript,xkcd_num
0,https://www.explainxkcd.com/wiki/index.php/1,https://xkcd.com/1,https://www.explainxkcd.com/wiki/index.php/Fil...,2005-09-30,Barrel - Part 1,The comic shows a young boy floating in a barr...,[A boy sits in a barrel which is floating in a...,1
1,https://www.explainxkcd.com/wiki/index.php/10,https://xkcd.com/10,https://www.explainxkcd.com/wiki/index.php/Fil...,2005-09-30,Pi Equals,There are two possible references here. One is...,"[A huge π to the left, then a large equal-to s...",10
2,https://www.explainxkcd.com/wiki/index.php/100,https://xkcd.com/100,https://www.explainxkcd.com/wiki/index.php/Fil...,2006-05-10,Family Circus,The Family Circus is a comic characterized by ...,[Picture shows a pathway winding through trees...,100
3,https://www.explainxkcd.com/wiki/index.php/1000,https://xkcd.com/1000,https://www.explainxkcd.com/wiki/index.php/Fil...,2012-01-06,1000 Comics,This comic is the 1000th comic shown on xkcd c...,"[1000 characters, numerous of which have appea...",1000
4,https://www.explainxkcd.com/wiki/index.php/1001,https://xkcd.com/1001,https://www.explainxkcd.com/wiki/index.php/Fil...,2012-01-09,AAAAAA,Megan and Cueball get the idea to build a rota...,[Cueball clinging onto bed sheets while being ...,1001


## reddit dataset

Next we load in the reddit dataset which is a collection of every reference of an xkcd
url on Reddit.

This dataset has 313485 samples and 9 features. The comments are collected from 2007 to
2019, inclusive.

Each sample has the following features:

* **body**: The text in the comment body (should have an xkcd url)
* **author**: The reddit user's name
* **score**: The comment's score (should be >= 1)
* **permalink**: The permalink to the comment
* **parent_***: The previous four attributes for the child comment's parent.
* **xkcd**: The xkcd comic url extracted from the child comment
* **xkcd_num**: The comic number extracted from the URL

In [7]:
%%time

# Process reddit data
file_names =  [
    *list(map(str, range(2007, 2015))),
    *[f"{year}_{month:02d}" for year in range(2015, 2020) for month in range(1, 13)]
]
reddit_dfs = [
    dd.read_parquet(f"./data/reddit/{file_name}.parquet")
    for file_name in file_names
]
reddit_df = dd.concat(reddit_dfs, ignore_index=True)
print(reddit_df.columns)

Index(['body', 'author', 'score', 'permalink', 'xkcd', 'parent_body',
       'parent_author', 'parent_score', 'parent_permalink'],
      dtype='object')
CPU times: user 657 ms, sys: 41.5 ms, total: 699 ms
Wall time: 722 ms


In [8]:
reddit_df.tail()

Unnamed: 0,body,author,score,permalink,xkcd,parent_body,parent_author,parent_score,parent_permalink
2719,"""The recommended amount of wolves inside someo...",TheMelonBandit,38,http://reddit.com/r/notliketheothergirls/comme...,https://xkcd.com/1471/,INSIDE YOU THERE ARE TWO WOLVES. THE RECOMMEND...,DungeonCrawlingFool,730,http://reddit.com/r/notliketheothergirls/comme...
2720,https://xkcd.com/37/,Agent-008,38,http://reddit.com/r/IncreasinglyVerbose/commen...,https://xkcd.com/37/,Samsung makes weird ass emojis,DarkBlade1212,102,http://reddit.com/r/IncreasinglyVerbose/commen...
2721,[The sword is traditionally carried on the lef...,lesser_panjandrum,38,http://reddit.com/r/TrollXChromosomes/comments...,https://xkcd.com/1403/,Come again?\n\nI may be inclined to get a PhD...,Pufflehuffy,11,http://reddit.com/r/TrollXChromosomes/comments...
2722,[this is the one](https://xkcd.com/37/),Allfather2002,38,http://reddit.com/r/Damnthatsinteresting/comme...,https://xkcd.com/37/,Something something xkcd,TobyM02,47,http://reddit.com/r/Damnthatsinteresting/comme...
2723,[Relevant xkcd](https://xkcd.com/2205/),xkcloud,38,http://reddit.com/r/NoStupidQuestions/comments...,https://xkcd.com/2205/,It's called an order of magnitude calculation....,jansencheng,75,http://reddit.com/r/NoStupidQuestions/comments...


In [9]:
%%time

# Clean up reddit_df

# remove null rows in important columns
reddit_df = reddit_df[~(
        reddit_df["xkcd"].isnull()
        | reddit_df["parent_body"].isnull()
        | reddit_df["body"].isnull()
)]

# # Cannot remove individual rows in dask
# # remove malformed row
# reddit_df = reddit_df.drop(labels=[52737], axis=1)

# Clean up multiple versions of URL to singular version
# (i.e. m.xkcd, ending with slash, without slash, etc...)
reddit_df["xkcd"] = reddit_df["xkcd"].apply(
    lambda url: "https://xkcd.com/" + URL(url).path.replace("/", ""), meta=str
)
# Drop invalid comic numbers
# the convert_dtype=False is required here because some annoying people used invalid URLs
# with really large numbers
reddit_df["xkcd_url_type"] = reddit_df["xkcd"].apply(lambda url: URL(url), meta=URL)

def convert_to_num(url):
    url_num = int(url.path[1:])
    if url_num < 1 or url_num > CURR_MAX_COMIC:
        return -1
    else:
        return url_num

# Add URL --> number column
reddit_df["xkcd_num"] = reddit_df["xkcd_url_type"].apply(convert_to_num, meta=int)
reddit_df = reddit_df[
    (reddit_df["xkcd_num"] > 0)
    & ~reddit_df["xkcd_num"].isnull()
]

# naive remove samples with xkcd in parent
# likely over fit signal (e.g. reminds of this specific xkcd 33)
# or low signal... (e.g. does anyone have the xkcd link)
reddit_df = reddit_df[~reddit_df["parent_body"].str.contains("xkcd")]

def strip_markdown(sample):
    html = markdown(sample)
    return ''.join(BeautifulSoup(html).findAll(text=True))

# strip markdown from text
# technically we don't use the child body comment so we don't have to do this
# reddit_df["body"] = reddit_df["body"].apply(unmark, meta=str)
reddit_df["parent_body"] = reddit_df["parent_body"].apply(strip_markdown, meta=str)

reddit_df.compute()

CPU times: user 8.79 s, sys: 1.3 s, total: 10.1 s
Wall time: 1min 19s


Unnamed: 0,body,author,score,permalink,xkcd,parent_body,parent_author,parent_score,parent_permalink,xkcd_url_type,xkcd_num
0,Or maybe it's because we're so fucking tired o...,schizobullet,7,http://reddit.com/r/reddit.com/comments/5zioz/...,https://xkcd.com/16,Youngsters didn't see Python's Holy Grail... h...,multubunu,10,http://reddit.com/r/reddit.com/comments/5zioz/...,https://xkcd.com/16,16
1,Obligatory Snopes comic.\r\n\r\nhttp://xkcd.co...,paro,17,http://reddit.com/r/reddit.com/comments/2qnru/...,https://xkcd.com/250,"Emphasis on the unbelievable. \nNo author, no ...",reddit_doe,52,http://reddit.com/r/reddit.com/comments/2qnru/...,https://xkcd.com/250,250
2,&gt; Well I would hope that some of the women ...,MarkByers,13,http://reddit.com/r/reddit.com/comments/2yl1s/...,https://xkcd.com/322,Well I would hope that some of the women here ...,feanor512,78,http://reddit.com/r/reddit.com/comments/2yl1s/...,https://xkcd.com/322,322
3,http://xkcd.com/75/,morner,12,http://reddit.com/r/reddit.com/comments/2ogwe/...,https://xkcd.com/75,Bloody fuckin' hell? :)\nCan I mix curses?,Figs,7,http://reddit.com/r/reddit.com/comments/2ogwe/...,https://xkcd.com/75,75
5,[ObGarfield](http://xkcd.com/78/),RichardPeterJohnson,17,http://reddit.com/r/reddit.com/comments/2yikf/...,https://xkcd.com/78,Thanks. You know- I never had this problem wh...,christianjb,88,http://reddit.com/r/reddit.com/comments/2yikf/...,https://xkcd.com/78,78
...,...,...,...,...,...,...,...,...,...,...,...
2718,https://xkcd.com/767/,azephrahel,38,http://reddit.com/r/todayilearned/comments/egn...,https://xkcd.com/767,I'm worried for the day a scandal finally come...,Radidactyl,45,http://reddit.com/r/todayilearned/comments/egn...,https://xkcd.com/767,767
2719,"""The recommended amount of wolves inside someo...",TheMelonBandit,38,http://reddit.com/r/notliketheothergirls/comme...,https://xkcd.com/1471,INSIDE YOU THERE ARE TWO WOLVES. THE RECOMMEND...,DungeonCrawlingFool,730,http://reddit.com/r/notliketheothergirls/comme...,https://xkcd.com/1471,1471
2720,https://xkcd.com/37/,Agent-008,38,http://reddit.com/r/IncreasinglyVerbose/commen...,https://xkcd.com/37,Samsung makes weird ass emojis,DarkBlade1212,102,http://reddit.com/r/IncreasinglyVerbose/commen...,https://xkcd.com/37,37
2721,[The sword is traditionally carried on the lef...,lesser_panjandrum,38,http://reddit.com/r/TrollXChromosomes/comments...,https://xkcd.com/1403,Come again?\nI may be inclined to get a PhD...,Pufflehuffy,11,http://reddit.com/r/TrollXChromosomes/comments...,https://xkcd.com/1403,1403


In [10]:
%%time

# what are the most common referenced xkcds on Reddit?
# For some reason value_counts does not work with modin dataframes
print(reddit_df["xkcd"].value_counts().nlargest(15).compute())

https://xkcd.com/37      58187
https://xkcd.com/1053    16984
https://xkcd.com/917      8396
https://xkcd.com/927      5033
https://xkcd.com/386      4684
https://xkcd.com/936      3793
https://xkcd.com/1357     3338
https://xkcd.com/1013     3056
https://xkcd.com/979      3009
https://xkcd.com/327      2820
https://xkcd.com/1756     2492
https://xkcd.com/435      2416
https://xkcd.com/323      2401
https://xkcd.com/538      2264
https://xkcd.com/774      2112
Name: xkcd, dtype: int64
CPU times: user 5.03 s, sys: 661 ms, total: 5.69 s
Wall time: 1min 8s


In [12]:
%%time

# how many xkcds have never been referenced on Reddit?
xkcds = dd.from_pandas(pd.Series(range(1, CURR_MAX_COMIC+1), name="xkcds"), npartitions=1)
# reddit_set = set(reddit_df["xkcd_num"].tolist())
num = (~xkcds.isin(reddit_df["xkcd_num"].unique().compute().tolist())).sum().compute()
print(f"Number of unreferenced xkcds: {num}")
print(f"Percentage of total: {num / len(xkcds) * 100:.2f}%")

Number of unreferenced xkcds: 312
Percentage of total: 12.66%
CPU times: user 4.35 s, sys: 547 ms, total: 4.9 s
Wall time: 1min 1s


In [13]:
%%time

# simple tfidf model that uses the explanations from explain xkcd
tfidf = TfidfVectorizer(strip_accents='ascii', stop_words='english', ngram_range=(1, 6), min_df=0.03)
exp_vec = tfidf.fit_transform(xkcd_df['Explanation'].compute())
reddit_vec = tfidf.transform(reddit_df['parent_body'].compute())

CPU times: user 45.6 s, sys: 2.06 s, total: 47.7 s
Wall time: 1min 45s


In [15]:
%%time

y = reddit_df["xkcd_num"].values.compute().reshape((-1, 1))
# subtract 1 from y so that the xkcd numbers are 0 indexed
y -= 1

CPU times: user 5 s, sys: 635 ms, total: 5.64 s
Wall time: 1min 7s


In [16]:
cos_y_hat = cosine_similarity(reddit_vec, exp_vec)

In [17]:
def accuracy_n(y, y_hat, n=1):
    """Calculate the top-n accuracy given predicted class probabilities"""
    # arg sort along the rows
    top_n = np.argsort(y_hat, 1)[:, -n:]
    return np.mean(np.fromiter((
        1 if y[k] in top_n[k]
        else 0
        for k in range(len(top_n))
    ), dtype=np.int8))

In [18]:
%%time

top_1 = accuracy_n(y, cos_y_hat)
top_5 = accuracy_n(y, cos_y_hat, n=5)
print(f"Top-1 Acc: {top_1*100:.3f}%")
print(f"Top-5 Acc: {top_5*100:.3f}%")

Top-1 Acc: 0.020%
Top-5 Acc: 0.119%
CPU times: user 55 s, sys: 13.1 s, total: 1min 8s
Wall time: 1min 9s


In [19]:
# BM25
class BM25Transformer(BaseEstimator, TransformerMixin):
    """
    Parameters
    ----------
    use_idf : boolean, optional (default=True)
    k1 : float, optional (default=2.0)
    b : float, optional (default=0.75)
    References
    ----------
    Okapi BM25: a non-binary model - Introduction to Information Retrieval
    http://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
    """
    def __init__(self, use_idf=True, k1=2.0, b=0.75):
        self.use_idf = use_idf
        self.k1 = k1
        self.b = b

    def fit(self, X):
        """
        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            document-term matrix
        """
        if not sp.issparse(X):
            X = sp.csc_matrix(X)
        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X)
            idf = np.log((n_samples - df + 0.5) / (df + 0.5))
            self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features)
        return self

    def transform(self, X, copy=True):
        """
        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            document-term matrix
        copy : boolean, optional (default=True)
        """
        if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
            # preserve float family dtype
            X = sp.csr_matrix(X, copy=copy)
        else:
            # convert counts or binary occurrences to floats
            X = sp.csr_matrix(X, dtype=np.float64, copy=copy)

        n_samples, n_features = X.shape

        # Document length (number of terms) in each row
        # Shape is (n_samples, 1)
        dl = X.sum(axis=1)
        # Number of non-zero elements in each row
        # Shape is (n_samples, )
        sz = X.indptr[1:] - X.indptr[0:-1]
        # In each row, repeat `dl` for `sz` times
        # Shape is (sum(sz), )
        # Example
        # -------
        # dl = [4, 5, 6]
        # sz = [1, 2, 3]
        # rep = [4, 5, 5, 6, 6, 6]
        rep = np.repeat(np.asarray(dl), sz)
        # Average document length
        # Scalar value
        avgdl = np.average(dl)
        # Compute BM25 score only for non-zero elements
        data = X.data * (self.k1 + 1) / (X.data + self.k1 * (1 - self.b + self.b * rep / avgdl))
        X = sp.csr_matrix((data, X.indices, X.indptr), shape=X.shape)

        if self.use_idf:
            check_is_fitted(self, '_idf_diag', 'idf vector is not fitted')

            expected_n_features = self._idf_diag.shape[0]
            if n_features != expected_n_features:
                raise ValueError("Input has n_features=%d while the model"
                                 " has been trained with n_features=%d" % (
                                     n_features, expected_n_features))
            # *= doesn't work
            X = X * self._idf_diag

        return X

In [20]:
re_stopwords = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
# remove stop words and punctuation
replace_vec = np.vectorize(
    lambda item: re_stopwords.sub('', item).translate(str.maketrans('', '', string.punctuation))
)

class StopWordRemover(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return replace_vec(X)

StopWordRemover().fit_transform(np.array([
    ["This is a test", "hello %world this is a test."],
    ["another one", "of how well"],
    ["hello world, today is a good day.", "this works."]
]))

array([['This test', 'hello world test'],
       ['another one', 'well'],
       ['hello world today good day', 'works']], dtype='<U26')

In [21]:
%%time

# TODO: Look into dask_ml to replace these custom transformers so
#       they can be a lot faster
p = Pipeline([
    ('stop', StopWordRemover()),
    ('count_vec', CountVectorizer(ngram_range=(1, 6))),
    ('bm25', BM25Transformer()),
])

exp_vec2 = p.fit_transform(xkcd_df['Explanation'])
reddit_vec2 = p.transform(reddit_df['parent_body'])

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


CPU times: user 1min 55s, sys: 22.1 s, total: 2min 17s
Wall time: 4min 30s


In [22]:
cos_y_hat2 = cosine_similarity(reddit_vec2, exp_vec2)
top_1 = accuracy_n(y, cos_y_hat2)
top_5 = accuracy_n(y, cos_y_hat2, n=5)
print(f"Top-1 Acc: {top_1*100:.3f}%")
print(f"Top-5 Acc: {top_5*100:.3f}%")

Top-1 Acc: 0.032%
Top-5 Acc: 0.157%


In [23]:
%%time

# This takes about 10 minutes right now
X_train_raw, X_test_raw, y_train, y_test = train_test_split(reddit_df['parent_body'], reddit_df["xkcd_num"] - 1, test_size=0.25)
xgb_pipe = clone(p)
X_train = xgb_pipe.fit_transform(X_train_raw)
X_test = xgb_pipe.transform(X_test_raw)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


CPU times: user 3min 24s, sys: 30.9 s, total: 3min 55s
Wall time: 7min 56s


In [26]:
eval_set = [(X_train, y_train), (X_test, y_test)]

# TODO: Fix bug attribute to_delayed not found (basically everything works up until this point)

# clf = XGBClassifier()
# clf.fit(X_train, y_train, eval_set=eval_set)

dask.dataframe.core.Series

In [None]:
# clf.score(X_test_raw, y_test)