In [1]:
# Imports
from pprint import pprint

import numpy as np
from yarl import URL
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from markdown import markdown
from bs4 import BeautifulSoup

In [2]:
import os
import ray

os.environ["MODIN_ENGINE"] = "ray"
import modin.pandas as pd

2020-12-18 21:54:24,313	INFO services.py:1090 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


# Load / Clean Data

## xkcd dataset

First we load in the xkcd dataset from https://www.explainxkcd.com

This dataset has 2388 xkcd comics run on (22 November 2020)

Each row has the following features:

* **xkcd**: The link to the official xkcd comic URL
* **xkcd_num**: The extracted comic number from the URL
* **Title**: The link to the Explain XKCD wiki page for that comic
* **Image**: Link to a backup hosted image of the XKCD comic
* **Date**: The original date of publication of the comic
* **TitleText**: Title of the comic
* **Explanation**: A community explanation of the comic deciphering the sometimes pithy
or cryptic humor
* **Transcript**: If the comic has characters speaking, this section has the text of the
comic.

In [3]:
# Process explain xkcd data
links_df = pd.read_csv("./data/xkcd/links_df.csv") # .set_index("Title")
# There is a bug in the data collection which is caused by this surprise:
# https://www.explainxkcd.com/wiki/index.php/Disappearing_Sunday_Update
# its a comic with the same id which he speculates will break automated system. Sure
# broke mine!
links_df = links_df[links_df["TitleText"] != "Disappearing Sunday Update"].set_index("Title")
pages_df = pd.read_csv("./data/xkcd/pages_df.csv") # .set_index("Title")
pages_df = pages_df.drop_duplicates()
xkcd_df = pd.merge(links_df, pages_df, how='left', on="Title", validate="one_to_one")
xkcd_df["xkcd_num"] = xkcd_df["xkcd"].apply(
    lambda url: int(URL(url).path.replace("/", ""))
)
print(xkcd_df.columns)
CURR_MAX_COMIC = xkcd_df["xkcd_num"].max()



Index(['Title', 'xkcd', 'Image', 'Date', 'TitleText', 'Explanation',
       'Transcript', 'xkcd_num'],
      dtype='object')


In [3]:
xkcd_df.head()

Unnamed: 0,Title,xkcd,Image,Date,TitleText,Explanation,Transcript,xkcd_num
0,https://www.explainxkcd.com/wiki/index.php/1,https://xkcd.com/1,https://www.explainxkcd.com/wiki/index.php/Fil...,2005-09-30,Barrel - Part 1,The comic shows a young boy floating in a barr...,[A boy sits in a barrel which is floating in a...,1
1,https://www.explainxkcd.com/wiki/index.php/2,https://xkcd.com/2,https://www.explainxkcd.com/wiki/index.php/Fil...,2005-09-30,Petit Trees (sketch),This comic does not present a particular point...,[Two trees are growing on opposite sides of a ...,2
2,https://www.explainxkcd.com/wiki/index.php/3,https://xkcd.com/3,https://www.explainxkcd.com/wiki/index.php/Fil...,2005-09-30,Island (sketch),This comic does not present a particular point...,[A color sketch of an island.],3
3,https://www.explainxkcd.com/wiki/index.php/4,https://xkcd.com/4,https://www.explainxkcd.com/wiki/index.php/Fil...,2005-09-30,Landscape (sketch),This comic does not present a particular point...,[A sketch of a landscape with sun on the horiz...,4
4,https://www.explainxkcd.com/wiki/index.php/5,https://xkcd.com/5,https://www.explainxkcd.com/wiki/index.php/Fil...,,Blown apart,This comic is a mathematical and technical jok...,[A black number 70 sees a red package and a li...,5


## reddit dataset

Next we load in the reddit dataset which is a collection of every reference of an xkcd
url on Reddit.

This dataset has 313485 samples and 9 features. The comments are collected from 2007 to
2019, inclusive.

Each sample has the following features:

* **body**: The text in the comment body (should have an xkcd url)
* **author**: The reddit user's name
* **score**: The comment's score (should be >= 1)
* **permalink**: The permalink to the comment
* **parent_***: The previous four attributes for the child comment's parent.
* **xkcd**: The xkcd comic url extracted from the child comment
* **xkcd_num**: The comic number extracted from the URL

In [4]:
%%time

# Process reddit data
file_names =  [
    *list(map(str, range(2007, 2015))),
    *[f"{year}_{month:02d}" for year in range(2015, 2020) for month in range(1, 13)]
]
reddit_dfs = [
    pd.read_csv(f"./data/reddit/{file_name}.csv")
    for file_name in file_names
]
reddit_df = pd.concat(reddit_dfs, ignore_index=True)
print(reddit_df.columns)

Index(['body', 'author', 'score', 'permalink', 'xkcd', 'parent_body',
       'parent_author', 'parent_score', 'parent_permalink'],
      dtype='object')
CPU times: user 2.08 s, sys: 451 ms, total: 2.53 s
Wall time: 4.46 s


In [5]:
reddit_df.tail()

Unnamed: 0,body,author,score,permalink,xkcd,parent_body,parent_author,parent_score,parent_permalink
313480,[The sword is traditionally carried on the lef...,lesser_panjandrum,38,http://reddit.com/r/TrollXChromosomes/comments...,https://xkcd.com/1403/,Come again?\n\nI may be inclined to get a PhD...,Pufflehuffy,11,http://reddit.com/r/TrollXChromosomes/comments...
313481,[Hardest Refresh](https://xkcd.com/1854/),ultimate_doge06,38,http://reddit.com/r/furry_irl/comments/ei71rs/...,https://xkcd.com/1854/,How do I delete the internet?,Zorxs,41,http://reddit.com/r/furry_irl/comments/ei71rs/...
313482,https://xkcd.com/387/,unquietwiki,38,http://reddit.com/r/AskReddit/comments/earnak/...,https://xkcd.com/387/,"""That was cool and all, but what was going on ...",Ytterbro,117,http://reddit.com/r/AskReddit/comments/earnak/...
313483,[Around 2026](https://xkcd.com/2014/),Skeptophile,38,http://reddit.com/r/space/comments/ei2dka/c/fc...,https://xkcd.com/2014/,Is the James Webb telescope going to launch an...,MasterofMistakes007,363,http://reddit.com/r/space/comments/ei2dka/c/fc...
313484,[Relevant xkcd](https://xkcd.com/2205/),xkcloud,38,http://reddit.com/r/NoStupidQuestions/comments...,https://xkcd.com/2205/,It's called an order of magnitude calculation....,jansencheng,75,http://reddit.com/r/NoStupidQuestions/comments...


In [6]:
%%time

# Clean up reddit_df

# remove null rows in important columns
reddit_df = reddit_df[~(
        reddit_df["xkcd"].isnull()
        | reddit_df["parent_body"].isnull()
        | reddit_df["body"].isnull()
)]
# remove malformed row
reddit_df = reddit_df.drop(index=52737)
# Clean up multiple versions of URL to singular version
# (i.e. m.xkcd, ending with slash, without slash, etc...)
reddit_df["xkcd"] = reddit_df["xkcd"].apply(
    lambda url: "https://xkcd.com/" + URL(url).path.replace("/", "")
)
# Drop invalid comic numbers
# the convert_dtype=False is required here because some annoying people used invalid URLs
# with really large numbers
mask = reddit_df["xkcd"].apply(
    lambda url: int(URL(url).path[1:]) < CURR_MAX_COMIC, convert_dtype=False
).values.astype(bool)
reddit_df = reddit_df[mask]

# Add URL --> number column
reddit_df["xkcd_num"] = reddit_df["xkcd"].apply(lambda url: int(URL(url).path[1:]))

# naive remove samples with xkcd in parent
# likely over fit signal (e.g. reminds of this specific xkcd 33)
# or low signal... (e.g. does anyone have the xkcd link)
reddit_df = reddit_df[~reddit_df["parent_body"].str.contains("xkcd")]

def strip_markdown(sample):
    html = markdown(sample)
    return ''.join(BeautifulSoup(html).findAll(text=True))

# strip markdown from text
reddit_df["body"] = reddit_df["body"].apply(strip_markdown)
reddit_df["parent_body"] = reddit_df["parent_body"].apply(strip_markdown)

# reset index numbering
reddit_df.reset_index(drop=True, inplace=True)
print(len(reddit_df))

266244
CPU times: user 21.5 s, sys: 2.4 s, total: 23.9 s
Wall time: 30.8 s


In [9]:
# what are the most common referenced xkcds on Reddit?
# For some reason value_counts does not work with modin dataframes
print(reddit_df["xkcd"]._to_pandas().value_counts().nlargest(15))

https://xkcd.com/37      58187
https://xkcd.com/1053    16984
https://xkcd.com/917      8396
https://xkcd.com/927      5033
https://xkcd.com/386      4684
https://xkcd.com/936      3793
https://xkcd.com/1357     3338
https://xkcd.com/1013     3056
https://xkcd.com/979      3008
https://xkcd.com/327      2820
https://xkcd.com/1756     2492
https://xkcd.com/435      2416
https://xkcd.com/323      2401
https://xkcd.com/538      2264
https://xkcd.com/774      2112
Name: xkcd, dtype: int64


In [10]:
# how many xkcds have never been referenced on Reddit?
xkcds = set(range(1, CURR_MAX_COMIC))
reddit_set = set(reddit_df["xkcd_num"].tolist())
num = len(xkcds - reddit_set)
print(f"Number of unreferenced xkcds: {num}")
print(f"Percentage of total: {num / len(xkcds) * 100:.2f}%")

To request implementation, send an email to feature_requests@modin.org.


Number of unreferenced xkcds: 234
Percentage of total: 9.80%


In [12]:
%%time

# simple tfidf model that uses the explanations from explain xkcd
tfidf = TfidfVectorizer(strip_accents='ascii', stop_words='english', ngram_range=(1, 6), min_df=0.03)
exp_vec = tfidf.fit_transform(xkcd_df['Explanation'].values)
reddit_vec = tfidf.transform(reddit_df['parent_body'].values)

CPU times: user 49.7 s, sys: 3.35 s, total: 53.1 s
Wall time: 3min 53s


In [14]:
%%time

y = reddit_df["xkcd_num"].values.reshape((-1, 1))
cos_y_hat = cosine_similarity(reddit_vec, exp_vec)

CPU times: user 19.4 s, sys: 6.6 s, total: 26 s
Wall time: 29 s


In [21]:
def accuracy_n(y, y_hat, n=1):
    """Calculate the top-n accuracy given predicted class probabilities"""
    # arg sort along the rows
    top_n = np.argsort(y_hat, 1)[:, -n:]
    return np.mean(np.fromiter((
        1 if y[k] in top_n[k]
        else 0
        for k in range(len(top_n))
    ), dtype=np.int8))

In [22]:
%%time

top_1 = accuracy_n(y, cos_y_hat)
top_5 = accuracy_n(y, cos_y_hat, n=5)
print(f"Top-1 Acc: {top_1*100:.3f}%")
print(f"Top-5 Acc: {top_5*100:.3f}%")

Top-1 Acc: 0.029%
Top-5 Acc: 0.161%
CPU times: user 45.8 s, sys: 5.81 s, total: 51.6 s
Wall time: 54.3 s


In [32]:
# BM25

import scipy.sparse as sp
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.feature_extraction.text import _document_frequency

class BM25Transformer(BaseEstimator, TransformerMixin):
    """
    Parameters
    ----------
    use_idf : boolean, optional (default=True)
    k1 : float, optional (default=2.0)
    b : float, optional (default=0.75)
    References
    ----------
    Okapi BM25: a non-binary model - Introduction to Information Retrieval
    http://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
    """
    def __init__(self, use_idf=True, k1=2.0, b=0.75):
        self.use_idf = use_idf
        self.k1 = k1
        self.b = b

    def fit(self, X):
        """
        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            document-term matrix
        """
        if not sp.issparse(X):
            X = sp.csc_matrix(X)
        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X)
            idf = np.log((n_samples - df + 0.5) / (df + 0.5))
            self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features)
        return self

    def transform(self, X, copy=True):
        """
        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            document-term matrix
        copy : boolean, optional (default=True)
        """
        if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
            # preserve float family dtype
            X = sp.csr_matrix(X, copy=copy)
        else:
            # convert counts or binary occurrences to floats
            X = sp.csr_matrix(X, dtype=np.float64, copy=copy)

        n_samples, n_features = X.shape

        # Document length (number of terms) in each row
        # Shape is (n_samples, 1)
        dl = X.sum(axis=1)
        # Number of non-zero elements in each row
        # Shape is (n_samples, )
        sz = X.indptr[1:] - X.indptr[0:-1]
        # In each row, repeat `dl` for `sz` times
        # Shape is (sum(sz), )
        # Example
        # -------
        # dl = [4, 5, 6]
        # sz = [1, 2, 3]
        # rep = [4, 5, 5, 6, 6, 6]
        rep = np.repeat(np.asarray(dl), sz)
        # Average document length
        # Scalar value
        avgdl = np.average(dl)
        # Compute BM25 score only for non-zero elements
        data = X.data * (self.k1 + 1) / (X.data + self.k1 * (1 - self.b + self.b * rep / avgdl))
        X = sp.csr_matrix((data, X.indices, X.indptr), shape=X.shape)

        if self.use_idf:
            check_is_fitted(self, '_idf_diag', 'idf vector is not fitted')

            expected_n_features = self._idf_diag.shape[0]
            if n_features != expected_n_features:
                raise ValueError("Input has n_features=%d while the model"
                                 " has been trained with n_features=%d" % (
                                     n_features, expected_n_features))
            # *= doesn't work
            X = X * self._idf_diag

        return X

In [31]:
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

re_stopwords = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
# remove stop words and punctuation
replace_vec = np.vectorize(
    lambda item: re_stopwords.sub('', item).translate(str.maketrans('', '', string.punctuation))
)

class StopWordRemover(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return replace_vec(X)

StopWordRemover().fit_transform(np.array([
    ["This is a test", "hello %world this is a test."],
    ["another one", "of how well"],
    ["hello world, today is a good day.", "this works."]
]))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adithyabalaji/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


array([['This test', 'hello world test'],
       ['another one', 'well'],
       ['hello world today good day', 'works']], dtype='<U26')

In [35]:
%%time

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

p = Pipeline([
    ('stop', StopWordRemover()),
    ('count_vec', CountVectorizer(ngram_range=(1, 6))),
    ('bm25', BM25Transformer()),
])

exp_vec2 = p.fit_transform(xkcd_df['Explanation'].values)
reddit_vec2 = p.transform(reddit_df['parent_body'].values)



CPU times: user 2min 1s, sys: 19.4 s, total: 2min 20s
Wall time: 4min 9s


In [36]:
cos_y_hat2 = cosine_similarity(reddit_vec2, exp_vec2)
top_1 = accuracy_n(y, cos_y_hat2)
top_5 = accuracy_n(y, cos_y_hat2, n=5)
print(f"Top-1 Acc: {top_1*100:.3f}%")
print(f"Top-5 Acc: {top_5*100:.3f}%")

Top-1 Acc: 0.035%
Top-5 Acc: 0.210%
