In [1]:
from convokit import Corpus, download, HyperConvo
import pandas as pd
import numpy as np
from collections import Counter
from scipy import stats
import json
from datetime import datetime
import pickle as pk


In [3]:
# Reconnect to each stored subreddit corpus and keep, per subreddit, a
# dataframe of the utterances whose text has more than five
# whitespace-separated words.
dfs = {}
for subreddit in ("Christianity", "TrueChristian"):
    corpus = Corpus.reconnect_to_db(f'subreddit-{subreddit}')
    df = corpus.get_utterances_dataframe(
        selector=lambda utt: len(utt.text.split()) > 5
    )
    dfs[subreddit] = df

In [4]:
# Distinct values of the 'speaker' column, keyed by subreddit name.
speakers = {
    name: dfs[name]['speaker'].unique()
    for name in ("Christianity", "TrueChristian")
}

In [7]:
# Speakers that appear in both subreddits.
common_speakers = set(speakers["Christianity"]) & set(speakers["TrueChristian"])

In [13]:
# Drop the "[deleted]" placeholder speaker from the shared set. `discard`
# (unlike `remove`) does not raise KeyError when the entry is already gone,
# so this cell can be re-run safely.
common_speakers.discard("[deleted]")


In [34]:
# Number of speakers present in both subreddits.
print(f"{len(common_speakers)}")


12068


In [16]:
# For each subreddit, count utterances per shared speaker and print summary
# statistics of those per-speaker counts.
for subreddit in ("TrueChristian", "Christianity"):
    df = dfs[subreddit]
    print(subreddit)
    shared_rows = df.loc[df['speaker'].isin(common_speakers)]
    utterances_per_speaker = shared_rows.groupby('speaker').size()
    print(utterances_per_speaker.describe())

TrueChristian
count    12068.000000
mean        33.587753
std        166.506911
min          1.000000
25%          1.000000
50%          4.000000
75%         16.000000
max       5730.000000
dtype: float64
Christianity
count    12068.000000
mean       224.120815
std       1031.910221
min          1.000000
25%          4.000000
50%         17.000000
75%         78.000000
max      27546.000000
dtype: float64


In [18]:
# All utterances written by speakers active in both subreddits, with the
# TrueChristian rows first and the Christianity rows after them.
# The per-subreddit slices are collected and concatenated once, instead of
# growing a frame inside the loop from an empty seed DataFrame (which
# re-copies the accumulated rows on every iteration).
common_speakers_utt = pd.concat(
    [
        dfs[subreddit].loc[dfs[subreddit]['speaker'].isin(common_speakers)]
        for subreddit in ("TrueChristian", "Christianity")
    ]
)

In [20]:
# Length of each utterance in whitespace-separated words.
common_speakers_utt['text_len'] = common_speakers_utt['text'].map(lambda text: len(text.split()))

In [21]:
# Summary statistics of utterance length for shared speakers, per subreddit.
text_len_by_subreddit = common_speakers_utt.groupby('meta.subreddit')['text_len']
text_len_by_subreddit.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
meta.subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Christianity,2704690.0,70.181515,112.102642,6.0,17.0,36.0,77.0,6846.0
TrueChristian,405337.0,87.611289,143.714178,6.0,20.0,44.0,95.0,6034.0


In [23]:
# Add two columns to each subreddit dataframe (in place):
#   - 'text_len': utterance length in whitespace-separated words
#   - 'speaker in both TrueChristian and Christianity': whether the author
#     belongs to `common_speakers`
for subreddit in ("TrueChristian", "Christianity"):
    df = dfs[subreddit]
    df['text_len'] = df['text'].apply(lambda t: len(t.split()))
    # Vectorized membership test, matching the `.isin(common_speakers)`
    # filters used elsewhere in this notebook, rather than a per-row lambda.
    df['speaker in both TrueChristian and Christianity'] = df['speaker'].isin(common_speakers)

In [26]:
# Let pandas render up to 1000 rows before truncating displayed output.
pd.options.display.max_rows = 1000

In [30]:
# Utterance-length statistics in r/Christianity, split by whether the author
# also posts in r/TrueChristian.
print("r/Christianity")
christianity_text_len = dfs['Christianity'].groupby('speaker in both TrueChristian and Christianity')['text_len']
christianity_text_len.describe()

r/Christianity


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
speaker in both TrueChristian and Christianity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,3407322.0,71.217512,109.151058,6.0,18.0,37.0,81.0,6891.0
True,2704690.0,70.181515,112.102642,6.0,17.0,36.0,77.0,6846.0


In [31]:
# Utterance-length statistics in r/TrueChristian, split by whether the author
# also posts in r/Christianity.
print("r/TrueChristian")
truechristian_text_len = dfs['TrueChristian'].groupby('speaker in both TrueChristian and Christianity')['text_len']
truechristian_text_len.describe()

r/TrueChristian


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
speaker in both TrueChristian and Christianity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,79717.0,87.839608,134.983852,6.0,20.0,45.0,101.0,4463.0
True,405337.0,87.611289,143.714178,6.0,20.0,44.0,95.0,6034.0


In [33]:
# Number of distinct speakers in the r/TrueChristian dataframe.
truechristian_speakers = dfs['TrueChristian']['speaker']
truechristian_speakers.nunique()

19547

In [38]:
# Compatibility shim: publish `collections.abc.Hashable` under the name
# `collections.Hashable` BEFORE importing numpy_ml; the statement order matters.
# NOTE(review): presumably numpy_ml (or one of its dependencies) still looks up
# the old `collections.Hashable` alias, which newer Python versions no longer
# provide — confirm, and drop the shim once the dependency is fixed.
import collections.abc
collections.Hashable = collections.abc.Hashable
from numpy_ml.preprocessing.nlp import tokenize_words, ngrams, strip_punctuation
# Store a token list per utterance in a new 'tokens' column: the text is passed
# through strip_punctuation, then tokenize_words with filter_stopwords=False.
# NOTE(review): other tokenize_words options (e.g. lowercasing) are left at the
# library defaults — verify they match what downstream cells expect.
common_speakers_utt['tokens'] = common_speakers_utt['text'].apply(lambda t: tokenize_words(strip_punctuation(t), filter_stopwords=False))



In [39]:
# Vocabulary: every distinct token across all shared-speaker utterances.
tokens = {
    token
    for utterance_tokens in common_speakers_utt['tokens']
    for token in utterance_tokens
}

In [40]:
# Size of the vocabulary.
vocab_size = len(tokens)
vocab_size

926401

In [41]:
# Flag utterances that contain the first-person pronoun "I".
# NOTE(review): numpy_ml's tokenize_words appears to lowercase tokens by
# default, in which case the token is stored as 'i' and a test for uppercase
# 'I' alone is always False — confirm against the tokenizer's documentation.
# Both casings are accepted here so the check works either way.
common_speakers_utt['tokens'].apply(lambda toks: 'I' in toks or 'i' in toks)

id
1dhq02     False
1dgp1n     False
1dg7gp     False
1dksc4     False
1djqmd     False
           ...  
e8tjyac    False
e8tk2dh    False
e8tk2wn    False
e8tke19    False
e8tkhxv    False
Name: tokens, Length: 3110027, dtype: bool

In [None]:
for tok in tokens:
    