In [174]:
# Imports
import numpy as np
import pandas as pd
from yarl import URL
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score

In [96]:
# Process explain xkcd data
links_df = pd.read_csv("./data/xkcd/links_df.csv") # .set_index("Title")
# There is a bug in the data collection which is caused by this surprise:
# https://www.explainxkcd.com/wiki/index.php/Disappearing_Sunday_Update
# its a comic with the same id which he speculates will break automated system. Sure
# broke mine!
links_df = links_df[links_df["TitleText"] != "Disappearing Sunday Update"].set_index("Title")
pages_df = pd.read_csv("./data/xkcd/pages_df.csv") # .set_index("Title")
pages_df = pages_df.drop_duplicates()
xkcd_df = pd.merge(links_df, pages_df, how='left', on="Title", validate="one_to_one")

xkcd_df.head()

Unnamed: 0,Title,xkcd,Image,Date,TitleText,Explanation,Transcript
0,https://www.explainxkcd.com/wiki/index.php/1,https://xkcd.com/1,https://www.explainxkcd.com/wiki/index.php/Fil...,2005-09-30,Barrel - Part 1,The comic shows a young boy floating in a barr...,[A boy sits in a barrel which is floating in a...
1,https://www.explainxkcd.com/wiki/index.php/2,https://xkcd.com/2,https://www.explainxkcd.com/wiki/index.php/Fil...,2005-09-30,Petit Trees (sketch),This comic does not present a particular point...,[Two trees are growing on opposite sides of a ...
2,https://www.explainxkcd.com/wiki/index.php/3,https://xkcd.com/3,https://www.explainxkcd.com/wiki/index.php/Fil...,2005-09-30,Island (sketch),This comic does not present a particular point...,[A color sketch of an island.]
3,https://www.explainxkcd.com/wiki/index.php/4,https://xkcd.com/4,https://www.explainxkcd.com/wiki/index.php/Fil...,2005-09-30,Landscape (sketch),This comic does not present a particular point...,[A sketch of a landscape with sun on the horiz...
4,https://www.explainxkcd.com/wiki/index.php/5,https://xkcd.com/5,https://www.explainxkcd.com/wiki/index.php/Fil...,,Blown apart,This comic is a mathematical and technical jok...,[A black number 70 sees a red package and a li...


In [168]:
# Process reddit data
file_names =  [
    *list(map(str, range(2007, 2015))),
    *[f"{year}_{month:02d}" for year in range(2015, 2020) for month in range(1, 13)]
]
reddit_dfs = [
    pd.read_csv(f"./data/reddit/{file_name}.csv")
    for file_name in file_names
]
reddit_df = pd.concat(reddit_dfs, ignore_index=True)
reddit_df.head()

Unnamed: 0,body,author,score,permalink,xkcd,parent_body,parent_author,parent_score,parent_permalink
0,Or maybe it's because we're so fucking tired o...,schizobullet,7,http://reddit.com/r/reddit.com/comments/5zioz/...,http://xkcd.com/16/,Youngsters didn't see Python's Holy Grail... h...,multubunu,10,http://reddit.com/r/reddit.com/comments/5zioz/...
1,Obligatory Snopes comic.\r\n\r\nhttp://xkcd.co...,paro,17,http://reddit.com/r/reddit.com/comments/2qnru/...,http://xkcd.com/250/,Emphasis on the unbelievable. \r\n\r\nNo autho...,reddit_doe,52,http://reddit.com/r/reddit.com/comments/2qnru/...
2,Somehow I think [this might apply](http://www....,wacky,1,http://reddit.com/r/reddit.com/comments/5z3ab/...,http://www.xkcd.com/322/,This is not a conversation I expected to see o...,LoveGoblin,1,http://reddit.com/r/reddit.com/comments/5z3ab/...
3,"uh... http://xkcd.com/207/\r\n\r\nyep, that's ...",bgstratt,4,http://reddit.com/r/reddit.com/comments/2grzt/...,http://xkcd.com/207/,http://xkcd.com/about/\r\n\r\nits the third qu...,afedele,6,http://reddit.com/r/reddit.com/comments/2grzt/...
4,Time to ruin the joke:\r\n\r\nhttp://xkcd.com/...,craigus,1,http://reddit.com/r/programming/comments/2hmdw...,http://xkcd.com/303/,Have you tried balancing on an office chair wh...,kmactane,20,http://reddit.com/r/programming/comments/2hmdw...


In [169]:
# Clean up reddit_df
CURR_MAX_COMIC = 2400
# remove null rows
reddit_df = reddit_df[~(reddit_df["xkcd"].isnull() | reddit_df["parent_body"].isnull())]
# remove malformed row
reddit_df = reddit_df.drop(index=52737)
# Clean up multiple versions to singular version
reddit_df["xkcd"] = reddit_df["xkcd"].apply(
    lambda url: "https://xkcd.com/" + URL(url).path.replace("/", "")
)
# Drop invalid comic numbers
mask = reddit_df["xkcd"].apply(lambda url: int(URL(url).path[1:]) < CURR_MAX_COMIC, convert_dtype=False).values.astype(bool)
reddit_df = reddit_df[mask]
reddit_df.reset_index(drop=True, inplace=True)

In [170]:
# Some quick maths
reddit_df["xkcd"].value_counts().nlargest(15)

tfidf = TfidfVectorizer(strip_accents='ascii', stop_words='english', ngram_range=(1, 6), min_df=0.03)
exp_vec = tfidf.fit_transform(xkcd_df['Explanation'])
reddit_vec = tfidf.transform(reddit_df['parent_body'])

In [171]:
sim = cosine_similarity(reddit_vec, exp_vec)

In [175]:
def accuracy_n(y, y_hat, n=1):
    pass

y = reddit_df["xkcd"].apply(lambda url: int(URL(url).path[1:])).values.reshape((-1, 1))

acc = accuracy_score(np.argmax(sim, 1), y)
n = 5
topn = np.argsort(sim, 1)[:, -n:]
argmax_labels = y.ravel()
topn_acc = np.mean(np.array([
    1 if argmax_labels[k] in topn[k]
    else 0
    for k in range(len(topn))
]))
print(acc)
print(topn_acc)


0.0003893049630000734
0.001611467264877353
