In [None]:
import pandas as pd
import numpy as np

The files we import here should be Reddit data retrieved from Google BigQuery that has been run through the LIWC software. The data should be formatted with the Reddit metadata (i.e., post content, author, subreddit, url) in the first few columns, followed by the LIWC variables (e.g., WC, WPS, they, tentative).

In [None]:
# NOTE: pass the CSV file path as the first argument of each pd.read_csv() call before running this cell.
posts_data = pd.read_csv() # filepath to Reddit posts which have been through the LIWC software
comments_data = pd.read_csv() # filepath to Reddit comments which have been through the LIWC software
# Posts and comments are loaded separately because they contain different headings.

If these have come straight from the LIWC software, you may need to rename the headings. 

In [None]:
# Preview the first rows of the comments table to see which headings need renaming.
comments_data.head()

In [None]:
# Preview the first rows of the posts table to see which headings need renaming.
posts_data.head()

In [None]:
# Map the generic spreadsheet-style headings onto the Reddit metadata names
# that the rest of the notebook refers to.
comment_headings = {'B': 'author', 'C': 'body', 'D': 'subreddit'}
post_headings = {'B': 'author', 'C': 'title', 'D': 'selftext', 'E': 'subreddit', 'F': 'url'}

comments_data.rename(columns=comment_headings, inplace=True)
posts_data.rename(columns=post_headings, inplace=True)


## Data Preparation

Here we remove posts and comments that are not suitable for the analysis. This includes posts/comments which: 
(i) have been removed or deleted, 
(ii) are from authors who have been removed or deleted, 
(iii) contain the word 'bot' (in the text or in the author name), as bots often identify themselves, 
(iv) link to URLs outside Reddit (posts only), 
(v) contain fewer words than the set word-count limit (often 50).

Note: In the LIWC software there is a variable titled 'body', you may need to rename this variable so as not to confuse the LIWC variable with the 'body' column referring to the body of text in the comments. 

In [None]:
def clean_posts(df, wordcount):
    """Filter a LIWC-scored posts table down to the rows usable for analysis.

    Rows are removed, in this order, when:
      * ``selftext`` is exactly '[removed]' or '[deleted]';
      * ``author`` is exactly '[removed]' or '[deleted]';
      * ``selftext`` contains the substring ' bot ';
      * ``author`` contains 'AutoModerator' or 'bot';
      * ``url`` does not contain 'reddit' (a missing url counts as no match);
      * ``WC`` is not strictly greater than ``wordcount``.

    The number of remaining rows is printed after each stage.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'selftext', 'author', 'url' and 'WC'.
    wordcount : int or float
        Exclusive lower bound on 'WC'.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, with their original index labels. The input
        frame is not modified.
    """
    print('Posts starting:', df.shape)
    df = df.loc[df['selftext'] != '[removed]']
    df = df.loc[df['selftext'] != '[deleted]']
    print('selftext removed/deleted: ', len(df))
    df = df.loc[df['author'] != '[removed]']
    df = df.loc[df['author'] != '[deleted]']
    print('author removed/deleted: ', len(df))
    df = df[~df.selftext.str.contains(' bot ', na=False)]
    print('text with no bots: ', len(df))
    df = df[~df.author.str.contains('AutoModerator', na=False)]
    df = df[~df.author.str.contains('bot', na=False)]
    print('author no bots: ', len(df))
    print(df.shape)
    # na=False: without it a missing url produces NA in the mask, and boolean
    # indexing with an NA-containing mask raises instead of filtering.
    df = df[df['url'].str.contains("reddit", na=False)]
    print('URLs removed: ', len(df))
    df = df.loc[df['WC'] > wordcount]
    print('WC removed: ', len(df))
    return df

def clean_comments(df, wordcount):
    """Filter a LIWC-scored comments table down to the rows usable for analysis.

    Drops, in order: comments whose 'body' is a removed/deleted placeholder,
    comments whose 'author' is a removed/deleted placeholder, comments whose
    'body' contains ' bot ', comments whose 'author' contains 'AutoModerator'
    or 'bot', and comments whose 'WC' is not strictly greater than
    ``wordcount``. The remaining row count is printed after each stage.

    Returns the filtered frame; the input frame is left untouched.
    """
    placeholders = ['[removed]', '[deleted]']
    print('Comments starting:', df.shape)

    kept = df[~df['body'].isin(placeholders)]
    print('body removed/delted: ', len(kept))

    kept = kept[~kept['author'].isin(placeholders)]
    print('author removed/deleted: ', len(kept))

    mentions_bot = kept['body'].str.contains(' bot ', na=False)
    kept = kept[~mentions_bot]
    print('text with no bots: ', len(kept))

    automated_author = (
        kept['author'].str.contains('AutoModerator', na=False)
        | kept['author'].str.contains('bot', na=False)
    )
    kept = kept[~automated_author]
    print('author no bots: ', len(kept))

    long_enough = kept['WC'] > wordcount
    kept = kept[long_enough]
    print('WC removed: ', len(kept))
    print(kept.shape)
    return kept

In [None]:
# The cleaning functions keep rows with WC strictly greater than the threshold,
# so 49 retains texts of 50 or more words (for integer word counts).
cleancomments = clean_comments(comments_data, 49)
cleanposts = clean_posts(posts_data, 49)

Here, we drop any duplicate posts/comments in each forum and find out how many posts/comments we have remaining in each forum.

In [None]:
# The list of forums is taken from the posts table, before de-duplication.
forums = cleanposts.subreddit.unique()

# De-duplicate once, up front: the operation does not depend on the forum, so
# repeating it on every loop iteration was redundant. Re-assigning instead of
# using inplace=True avoids mutating a frame that was produced by filtering.
# Note that duplicates are detected across the whole table, not per forum.
cleanposts = cleanposts.drop_duplicates(['selftext', 'title'])
cleancomments = cleancomments.drop_duplicates('body')

# Number of posts / comments remaining in each forum, in the order of `forums`.
post_lengths = []
comment_lengths = []

for forum in forums:
    posts_filtered = cleanposts.loc[cleanposts['subreddit'] == forum]
    print(forum, 'posts : ', len(posts_filtered))
    post_lengths.append(len(posts_filtered))
    comments_filtered = cleancomments.loc[cleancomments['subreddit'] == forum]
    print(forum, 'comments : ', len(comments_filtered))
    comment_lengths.append(len(comments_filtered))



Next, we take a sample of data from each forum which matches the forum with the lowest number of posts/comments. That is, if one forum has only 10,000 comments, we will take 10,000 comments from all forums in order to ensure equal sample sizes. This is important as it impacts how we interpret our later AUCs. Please note that there are also other methods available to ensure equal sample sizes such as creating synthetic data. 

In [None]:
# Seeded generator so the balanced sample is the same on every run.
sample_rng = np.random.RandomState(8)

# LIWC style variables kept for the analysis, plus the forum label.
allstyle = ['WPS', 'Sixltr','i', 'we', 'you', 'shehe', 'they', 'ipron',
       'article', 'prep', 'auxverb', 'adverb', 'conj', 'negate', 'verb',
       'adj', 'compare', 'interrog', 'number', 'quant', 'affect','insight', 'cause',
       'discrep', 'tentat', 'certain', 'differ','see', 'hear',
       'feel', 'focuspast', 'focuspresent', 'focusfuture', 'motion', 'space',
       'time','swear', 'netspeak', 'assent', 'nonflu', 'filler', 'subreddit']

# Every forum is down-sampled to the size of the smallest forum.
n_posts = min(post_lengths, default=0)
n_comments = min(comment_lengths, default=0)

forum_samples = []
for forum in forums:
    posts_filtered = cleanposts.loc[cleanposts['subreddit'] == forum]
    print(forum, 'posts : ', len(posts_filtered))
    posts_sample = posts_filtered.sample(n_posts, random_state=sample_rng)
    comments_filtered = cleancomments.loc[cleancomments['subreddit'] == forum]
    print(forum, 'comments : ', len(comments_filtered))
    comments_sample = comments_filtered.sample(n_comments, random_state=sample_rng)
    # Keep only columns relevant to analysis
    forum_samples.append(pd.concat([comments_sample[allstyle], posts_sample[allstyle]]))

# Concatenate once rather than growing the frame inside the loop.
# ignore_index=True gives every row a unique label: posts and comments come
# from separate tables, so their original index labels can collide.
totalsample = pd.concat(forum_samples, ignore_index=True) if forum_samples else pd.DataFrame()


## Data Analysis

The first step is to create the test and training sets. 

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from mpl_toolkits import mplot3d
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score

In [None]:
# Seeded generator so the train/test split is the same on every run.
split_rng = np.random.RandomState(8)

# Half of each forum's balanced sample goes to the test set.
test_train_size = (min(post_lengths) + min(comment_lengths)) // 2
print(test_train_size)

test_parts = []
train_parts = []

for forum in forums:
    # reset_index gives each row a unique label. Posts and comments come from
    # separate tables and can share index labels, so the split below must not
    # rely on the labels carried over from those tables.
    filtered = totalsample.loc[totalsample['subreddit'] == forum].reset_index(drop=True)
    print(forum)
    test = filtered.sample(test_train_size, random_state=split_rng)
    # Training rows are exactly the rows not drawn for the test set.
    train = filtered.drop(test.index)
    print('train = ', len(train), 'test=', (len(test)))
    test_parts.append(test)
    train_parts.append(train)

# Concatenate once rather than growing the frames inside the loop.
testdata = pd.concat(test_parts, ignore_index=True) if test_parts else pd.DataFrame()
traindata = pd.concat(train_parts, ignore_index=True) if train_parts else pd.DataFrame()

print(len(traindata), len(testdata))

Next, we run the Extra Trees classifier. Here we use 5-fold cross-validation. 

In [None]:
def run_classifier(identity1, identity2):
    """Train and evaluate an Extra Trees classifier separating two forums.

    Reads the module-level ``traindata`` and ``testdata`` frames. Rows whose
    'subreddit' equals ``identity1`` are labelled 0 and rows whose 'subreddit'
    equals ``identity2`` are labelled 1; the 'subreddit' column itself is
    dropped from the features.

    Prints 5-fold cross-validated accuracy and ROC AUC on the training rows,
    then fits on all training rows and prints a classification report for
    the test rows.

    Parameters
    ----------
    identity1, identity2 : values compared against the 'subreddit' column.

    Returns
    -------
    tuple
        ``(auc, acc)``: ROC AUC and accuracy on the test rows.
    """
    train1 = traindata.loc[traindata['subreddit'] == identity1].drop(['subreddit'], axis=1)
    train2 = traindata.loc[traindata['subreddit'] == identity2].drop(['subreddit'], axis=1)
    test1 = testdata.loc[testdata['subreddit'] == identity1].drop(['subreddit'], axis=1)
    test2 = testdata.loc[testdata['subreddit'] == identity2].drop(['subreddit'], axis=1)

    print(len(train1), len(train2), len(test1), len(test2))

    # Labels: 0 for identity1 rows (stacked first), 1 for identity2 rows.
    y_train = np.ones((train1.shape[0] + train2.shape[0]))
    y_train[:train1.shape[0]] = 0
    X_train = np.vstack((train1.values, train2.values))

    y_test = np.ones((test1.shape[0] + test2.shape[0]))
    y_test[:test1.shape[0]] = 0
    X_test = np.vstack((test1.values, test2.values))

    et = ExtraTreesClassifier(n_estimators=300, max_depth=None,
                              min_samples_split=2, random_state=8,
                              n_jobs=-1)

    for score in ['accuracy', 'roc_auc']:
        result = cross_val_score(et, X_train, y_train, cv=5, scoring=score)
        print("ExtraTrees for %s" % score)
        print("%0.3f (+/-%0.03f)" % (np.mean(result), np.std(result)*2))

    print(X_train.shape)
    et.fit(X_train, y_train)

    y_pred = et.predict(X_test)
    # Probability of class 1 (identity2), used as the ranking score for the AUC.
    s = et.predict_proba(X_test)[:, 1]
    print(classification_report(y_test, y_pred))
    print()

    auc = roc_auc_score(y_test, s)
    acc = accuracy_score(y_test, y_pred)
    print('ExtraTrees  AUC %g   Accuracy %g' % (auc, acc))
    return auc, acc

In [None]:
# Square table of test-set AUCs: rows are the first forum of a pair, columns the second.
analysisdf = pd.DataFrame(columns=forums, index=forums)

# Evaluate every ordered pair of forums. Iterating over `forums` directly keeps
# the column position in range; a manual counter running up to len(forums)
# indexes one past the end of the array.
for other in forums:
    for forum in forums:
        print(forum, other)
        auc, acc = run_classifier(forum, other)
        analysisdf.at[forum, other] = auc

print(analysisdf)

Now that we have the AUCs for each pair of identities, we can run the multidimensional scaling analysis.

In [None]:
# Convert the AUC table to a float matrix and display it as a heat map.
D = np.array(analysisdf.values, dtype=float)
plt.imshow(D)

Here, we should see a symmetrical heat map of the AUCs from the different identity pairs. To check they are symmetrical, run the code below. It should return True. 

In [None]:
def check_symmetric(a, tol=1e-8):
    """Return whether every entry of ``a`` differs from the matching entry of ``a.T`` by less than ``tol``."""
    gap = np.abs(a - a.T)
    within_tol = gap < tol
    return np.all(within_tol)
# Expected to evaluate to True; False means some D[i, j] and D[j, i] differ by at least the tolerance.
check_symmetric(D)


In [None]:
from sklearn.metrics import pairwise_distances
from sklearn.manifold import MDS
from sklearn import preprocessing
from mpl_toolkits import mplot3d
    
%matplotlib inline

In [None]:
# D (the pairwise AUC matrix) is passed as a precomputed dissimilarity matrix
# and embedded in two dimensions; random_state fixes the MDS initialisation.
model = MDS(n_components=2, dissimilarity='precomputed', random_state=1)
mds_out = model.fit_transform(D)

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))
ax.set_aspect('equal')
# No colour data (`c=`) is supplied, so colour-limit arguments (vmin/vmax)
# would have no effect and are omitted.
ax.scatter(mds_out[:, 0], mds_out[:, 1], s=20)

# Label each point with its forum name, offset slightly from the marker.
for i, txt in enumerate(analysisdf.columns.values):
    ax.annotate(txt, xy=(mds_out[i, 0], mds_out[i, 1]), xytext=(10, 10),
                fontsize=15, va='top',
                xycoords='data', textcoords='offset points')

# Ticks and labels
ax.set_xlabel('MDS dimension 1', fontsize=15)
ax.set_ylabel('MDS dimension 2', fontsize=15)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.tick_params(axis='both', labelsize=15)
plt.tight_layout()




In [None]:
# Re-fit MDS with three components. This rebinds `model` and `mds_out`, so the
# 2-D results are no longer available under those names after this cell runs.
model = MDS(n_components=3, dissimilarity='precomputed', random_state=1)
mds_out = model.fit_transform(D)

In [None]:
fig = plt.figure(figsize=(10,10))
ax = plt.axes(projection='3d')

# Split the embedding into its three coordinate columns; points are coloured by the third.
xs, ys, zs = mds_out[:, 0], mds_out[:, 1], mds_out[:, 2]
ax.scatter3D(xs, ys, zs, s=100, c=zs)

ax.view_init(azim=20, elev=30)

# Write each forum name at the position of its point.
for txt, x_pos, y_pos, z_pos in zip(analysisdf.columns.values, xs, ys, zs):
    ax.text(x_pos, y_pos, z_pos, txt, fontsize=18)