# Subreddit Overlap Data

In [None]:
import json
import sqlite3
import sqlalchemy
import subprocess
import numpy as np
import pandas as pd
from os.path import isfile
from sqlalchemy import create_engine
from matplotlib import pyplot as plt

# Render matplotlib figures inline in the notebook, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Path to the SQLite database that holds the `reddit_comments` table.
DB_NAME = "data/RC_2020_jan-june.db"

# sqlite3.connect() silently creates an empty database when the file is missing,
# which would only surface later as a confusing "no such table" error.
# Fail fast with a clear message instead.
if not isfile(DB_NAME):
    raise FileNotFoundError("SQLite database not found: {}".format(DB_NAME))

conn = sqlite3.connect(DB_NAME)

Take a look at the raw data

In [None]:
# Pull the first 100 comments to see what the raw table looks like.
# The "index" column is removed before display (presumably a DataFrame index
# that was stored alongside the data -- confirm against the loader).
df = (
    pd.read_sql_query("""SELECT * FROM reddit_comments LIMIT 100""", conn)
    .drop(columns=["index"])
)
df.head()

Count the number of distinct users (comment authors) in each subreddit, rank the subreddits by that count, and save the result as a new table, `subr_users`.

In [None]:
# Build `subr_users`: one row per subreddit with
#   authors      -- number of distinct (subreddit, author) groups, i.e. how many
#                   different authors commented there
#   rank_authors -- dense rank of that count, 1 = most authors (ties share a rank)
#
# IF NOT EXISTS keeps the cell re-runnable: a plain CREATE TABLE raises
# "table subr_users already exists" the second time the notebook is run.
# The existing table is then reused as-is, so run
# `DROP TABLE subr_users` first if `reddit_comments` has changed.
#
# Note: `HAVING cnt > 0` never filters anything (every group has at least one
# row); it is kept so the query mirrors the thresholded overlap query.
QUERY1 = """
CREATE TABLE IF NOT EXISTS subr_users AS
    SELECT subreddit, authors, DENSE_RANK() OVER (ORDER BY authors DESC) AS rank_authors
    FROM (SELECT subreddit, SUM(1) as authors
         FROM (SELECT subreddit, author, COUNT(1) as cnt
             FROM reddit_comments
             GROUP BY subreddit, author HAVING cnt > 0)
         GROUP BY subreddit) t
    ORDER BY authors DESC;
"""

c = conn.cursor()
c.execute(QUERY1)
conn.commit()

In [None]:
# Load the per-subreddit author counts.
# The ORDER BY is explicit because the plots that follow use the row position
# as the rank axis, and a bare SELECT gives no ordering guarantee.
df = pd.read_sql_query("""SELECT * FROM subr_users ORDER BY authors DESC""", conn)
df.head()

In [None]:
# Two side-by-side views of the per-subreddit author counts,
# both with a log-scaled y axis.
fig, (ax_curve, ax_hist) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: author count against row position in `df`.
ax_curve.set_title("authors")
ax_curve.plot(df.index, df.authors)
ax_curve.set_yscale("log")

# Right panel: histogram of the author counts (50 bins).
ax_hist.set_title("authors hist")
ax_hist.hist(df.authors, bins=50)
ax_hist.set_yscale("log")

plt.show()

In [None]:
# Zoom in on rows 200 through 2200 of the author-count curve (linear y axis).
# NOTE(review): this is a positional slice; the overlap query filters on
# `rank_authors`, a dense rank, which may select a different set of
# subreddits when author counts tie -- confirm the two are meant to match.
band = slice(200, 2201)

fig, ax = plt.subplots(figsize=(16, 6))
ax.set_title("authors")
ax.plot(df.index[band], df.authors[band])
plt.show()

Using the table `subr_users`, we restrict the "from" subreddits to those whose `rank_authors` lies between 201 and 2200, and then count, for each pair of subreddits, the users who authored more than 10 comments in both. The result is a table called `overlapping_subr_users`, where each row contains a from subreddit, a to subreddit, and the number of unique commenters that the two subreddits have in common.

In [None]:
# Build `overlapping_subr_users`: for every ordered pair of different subreddits,
# the number of authors who are "active" in both.
#   t1 -- (subreddit, author) pairs with more than 10 comments, restricted to
#         subreddits whose rank_authors is strictly between 200 and 2201
#   t2 -- (subreddit, author) pairs with more than 10 comments, any subreddit
# The two sides are joined on author, and same-subreddit pairs are excluded.
# Each (subreddit, author) pair occurs once per side, so SUM(1) per group is the
# number of distinct shared authors.
#
# Both output subreddit columns are selected under the same name; the second is
# presumably exposed by SQLite as "subreddit:1" (see the rename applied when the
# table is read back) -- confirm.
#
# IF NOT EXISTS keeps the cell re-runnable: a plain CREATE TABLE raises
# "table overlapping_subr_users already exists" on a second run. The existing
# table is then reused as-is, so run `DROP TABLE overlapping_subr_users` first
# to force a rebuild.
QUERY2 = """
CREATE TABLE IF NOT EXISTS overlapping_subr_users AS
    SELECT t1.subreddit, t2.subreddit, SUM(1) AS NumOverlaps
    FROM (SELECT subreddit, author, COUNT(1) AS cnt
         FROM reddit_comments
         WHERE subreddit IN (SELECT subreddit FROM subr_users
           WHERE rank_authors>200 AND rank_authors<2201)
         GROUP BY subreddit, author HAVING cnt > 10) t1
    JOIN (SELECT subreddit, author, COUNT(1) as cnt
         FROM reddit_comments
         GROUP BY subreddit, author HAVING cnt > 10) t2
    ON t1.author=t2.author
    WHERE t1.subreddit!=t2.subreddit
    GROUP BY t1.subreddit, t2.subreddit
"""

c = conn.cursor()
c.execute(QUERY2)
conn.commit()

Read in the result of the second query as a dataframe. Edit the column names and store it for later

In [None]:
# Read the overlap table back into a DataFrame.
df = pd.read_sql_query("""SELECT * FROM overlapping_subr_users""", conn)

# Give the two subreddit columns distinct, descriptive names before the frame
# is written to CSV. Keys that are not present are ignored by rename(), so this
# is safe even if the table already carries the t1_/t2_ names.
df = df.rename(columns={"subreddit": "t1_subreddit", "subreddit:1": "t2_subreddit"})

In [None]:
# Show the first rows of the overlap DataFrame as the cell output.
df.head()

In [None]:
# Number of distinct subreddits on each side of the overlap pairs.
# Columns are addressed by position (0 = from side, 1 = to side) so the cell
# works whether or not the subreddit columns have been renamed.
print("t1_subreddit unique subreddits: {}".format(len(df.iloc[:, 0].unique())))
print("t2_subreddit unique subreddits: {}".format(len(df.iloc[:, 1].unique())))

In [None]:
# Persist the overlap table as CSV (without the DataFrame index) for later use.
overlaps_csv_path = "data/RC_2020_jan-june_subreddit_overlaps.csv"
df.to_csv(overlaps_csv_path, index=False)

Close the database connection

In [None]:
# Release the SQLite connection opened at the top of the notebook.
conn.close()