# Library

In [1]:
import pandas as pd

In [2]:
import datetime as dt

# Function

In [3]:
#Choose comments within 60 days of bot implementation
def within_60days(df, year, month, day, window_days=30):
    """Keep rows whose ``created_utc`` lies within a window around a date.

    The window is centred on midnight UTC of ``year-month-day`` and extends
    ``window_days`` days in each direction (30 by default, i.e. 60 days in
    total). Both bounds are inclusive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``created_utc`` column that is compared against epoch
        seconds.
    year, month, day : int
        Calendar date of the centre of the window.
    window_days : int, optional
        Half-width of the window in days.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that fall inside the window.
    """
    # Anchor the date in UTC: a naive datetime's .timestamp() is interpreted in
    # the machine's local timezone, which would shift the window by the local
    # UTC offset and make the result depend on where the notebook is run.
    centre = dt.datetime(year=year, month=month, day=day, tzinfo=dt.timezone.utc)
    half_span = dt.timedelta(days=window_days)

    after = int((centre - half_span).timestamp())
    before = int((centre + half_span).timestamp())

    in_window = (df['created_utc'] >= after) & (df['created_utc'] <= before)
    return df[in_window]

In [65]:
def comm_report(df):
    """Print summary counts for a comments frame.

    Reports, in order: the total number of rows, the number of rows with
    ``post == 0`` and ``post == 1``, the number of rows whose ``body`` is
    ``'[deleted]'`` or ``'[removed]'``, and the number of rows authored by
    ``'AutoModerator'``. Nothing is returned.
    """
    total = len(df)
    print(f'This df originally has {total} comments.')

    # rows flagged 0 / 1 in the 'post' column
    post_flag = df['post']
    num_pre = int((post_flag == 0).sum())
    num_post = int((post_flag == 1).sum())
    print(f'Pre:{num_pre}, Post:{num_post}')

    # bodies replaced by the deleted / removed placeholders
    placeholder_body = df['body'].isin(['[deleted]', '[removed]'])
    num_del = int(placeholder_body.sum())
    print(f'{num_del} comments were deleted/removed.')

    # rows authored by AutoModerator
    by_automod = df['author'] == 'AutoModerator'
    num_automod = int(by_automod.sum())
    print(f'Automod posted {num_automod} comments.')

In [62]:
def subm_report(df):
    """Print summary counts for a submissions frame.

    Reports, in order: the total number of rows, the number of rows with
    ``post == 0`` and ``post == 1``, the number of rows with a non-null
    ``selftext``, the number of rows whose ``selftext`` is ``'[deleted]'`` or
    ``'[removed]'``, the number of rows authored by ``'AutoModerator'``, and
    the number of rows whose ``domain`` is an image host (``i.redd.it``,
    ``i.imgur.com``, ``imgur.com``), split by ``post == 0`` / ``post == 1``.
    Nothing is returned.
    """
    image_domains = ['i.redd.it', 'i.imgur.com', 'imgur.com']

    #print num of sub
    print(f'This df has {len(df)} submissions.')

    #print num of pre and post
    is_pre = df['post'] == 0
    is_post = df['post'] == 1
    print(f'Pre:{int(is_pre.sum())}, Post:{int(is_post.sum())}')

    #print num of submissions whose selftext is not null
    num_with_text = int(df['selftext'].notna().sum())
    print(f'{num_with_text} submissions has content')

    #print no of del submissions
    # The message previously said "comments" although this report is about
    # submissions.
    num_del = int(df['selftext'].isin(['[deleted]', '[removed]']).sum())
    print(f'{num_del} submissions were deleted or removed.')

    #print subm by AutoMod
    num_automod = int((df['author'] == 'AutoModerator').sum())
    print(f'Automod posted {num_automod} submissions.')

    #print subm hosted on an image domain, pre and post
    is_image = df['domain'].isin(image_domains)
    num_meme_pre = int((is_image & is_pre).sum())
    print(f'{num_meme_pre} submissions are images pre.')

    num_meme_post = int((is_image & is_post).sum())
    print(f'{num_meme_post} submissions are images post.')


In [72]:
def clean_comments(source_dir,save_dir,year,month,day):
    """Load, window, report on, filter and save a comments CSV.

    Steps: read ``source_dir``, drop duplicate rows, keep a fixed set of
    columns, keep rows inside the window returned by ``within_60days`` for the
    given date, convert the epoch columns to datetimes, print ``comm_report``,
    drop deleted/removed bodies and AutoModerator rows, print ``num_case``,
    and write the result to ``save_dir``.

    Parameters
    ----------
    source_dir : str
        Path of the CSV to read.
    save_dir : str
        Path of the CSV to write (``utf-8-sig`` encoded, index included).
    year, month, day : int
        Date passed through to ``within_60days``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame that was written to ``save_dir``.
    """
    keep_cols = ['body','author','created_utc','retrieved_on','permalink','parent_id','subreddit','score','post']

    df = pd.read_csv(source_dir).drop_duplicates()

    #choose relevant vars; reindex already returns exactly these columns (a
    #column absent from the file is created as NaN), so no second selection
    df = df.reindex(columns = keep_cols)

    #filter comments within 60 days of bot implementation
    #.copy() so the column assignments below act on an independent frame
    #instead of a filtered slice (avoids SettingWithCopyWarning)
    df = within_60days(df=df,year=year,month=month,day=day).copy()

    #change epoch time to human time
    df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')
    df['retrieved_on'] = pd.to_datetime(df['retrieved_on'], unit='s')
    # NOTE(review): 'updated_utc' is not among the selected columns, so this
    # creates it from the already-converted 'retrieved_on'. Kept as-is so the
    # saved CSV schema does not change -- confirm this column is intended.
    df['updated_utc'] = pd.to_datetime(df['retrieved_on'], unit='s')

    #print numbers
    comm_report(df)

    #filter out deleted and removed comments, and AutoModerator comments
    is_placeholder = df['body'].isin(['[deleted]', '[removed]'])
    is_automod = df['author'] == 'AutoModerator'
    df = df[(~is_placeholder) & (~is_automod)]

    #moare report
    num_case(df)

    #write csv
    df.to_csv(save_dir,encoding = 'utf-8-sig')

    return df


In [73]:
def clean_subm(source_dir,save_dir,year,month,day):
    """Load, window, report on, filter and save a submissions CSV.

    Steps: read ``source_dir``, drop duplicate rows, keep a fixed set of
    columns, keep rows inside the window returned by ``within_60days`` for the
    given date, print ``subm_report``, drop rows that are deleted/removed,
    authored by AutoModerator, hosted on an image domain, or have a null
    ``selftext``, convert ``created_utc`` and ``retrieved_on`` to datetimes,
    print ``num_case``, and write the result to ``save_dir``.

    Parameters
    ----------
    source_dir : str
        Path of the CSV to read.
    save_dir : str
        Path of the CSV to write (``utf-8-sig`` encoded, index included).
    year, month, day : int
        Date passed through to ``within_60days``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame that was written to ``save_dir``.
    """
    keep_cols = ['author','author_flair_text','created_utc','retrieved_on','domain','full_link','id','is_reddit_media_domain','permalink','is_video','locked','num_comments','subreddit','subreddit_id','score','selftext','subreddit_subscribers','title','total_awards_received','updated_utc','removed_by','poll_data','post']
    image_domains = ['i.redd.it', 'i.imgur.com', 'imgur.com']

    df = pd.read_csv(source_dir).drop_duplicates()

    #choose relevant vars; reindex already returns exactly these columns (a
    #column absent from the file is created as NaN), so no second selection
    df = df.reindex(columns = keep_cols)

    #filter submissions within 60 days of bot implementation
    df = within_60days(df=df,year=year,month=month,day=day)

    #print numbers
    subm_report(df)

    #filter out deleted/removed, AutoModerator, image-hosted and empty submissions
    is_placeholder = df['selftext'].isin(['[deleted]', '[removed]'])
    is_automod = df['author'] == 'AutoModerator'
    is_image = df['domain'].isin(image_domains)
    has_text = df['selftext'].notna()
    #.copy() so the column assignments below act on an independent frame
    #instead of a filtered slice (avoids SettingWithCopyWarning)
    df = df[(~is_placeholder) & (~is_automod) & (~is_image) & has_text].copy()

    #change epoch time to human time
    df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')
    df['retrieved_on'] = pd.to_datetime(df['retrieved_on'], unit='s')

    #moare report
    num_case(df)

    #write csv
    df.to_csv(save_dir,encoding = 'utf-8-sig')

    return df

In [78]:
def num_case(df):
    """Print how many rows have ``post == 0``, ``post == 1``, and their sum."""
    flag = df['post']
    num_pre = int((flag == 0).sum())
    num_post = int((flag == 1).sum())
    total = num_pre + num_post
    print(f'Pre: {num_pre}, Post: {num_post}, Total: {total}')

# Apply to subreddits

## r/exfds
Nov 23 2020

In [74]:
# Clean the r/exfds comments around 2020-11-23 and save the result.
exfds_comments = clean_comments(
    source_dir='./data/exfds/exfds_comments.csv',
    save_dir='./data/exfds/exfds_clean_comments.csv',
    year=2020,
    month=11,
    day=23,
)

This df originally has 600 comments.
Pre:468, Post:132
21 comments were deleted/removed.
Automod posted 0 comments.
Post: 127, Pre: 452, Total: 579


In [75]:
# Clean the r/exfds submissions around 2020-11-23 and save the result.
exfds_subm = clean_subm(
    source_dir='./data/exfds/exfds_subm.csv',
    save_dir='./data/exfds/exfds_clean_subm.csv',
    year=2020,
    month=11,
    day=23,
)

This df has 47 submissions.
Pre:35, Post:12
27 submissions has content
7 comments were deleted or removed.
Automod posted 0 submissions.
10 submissions are images pre.
3 submissions are images post.
Post: 5, Pre: 15, Total: 20


## r/WitchesVSPatriarchy

Dec 22, 2020

In [76]:
# Clean the r/WitchesVSPatriarchy comments around 2020-12-22.
# The cleaned file is saved next to its source data; it was previously written
# into ./data/exfds/, the directory of a different subreddit.
wvsp_comments = clean_comments(
    source_dir='./data/witchesvspatriarchy/wvsp_comments.csv',
    save_dir='./data/witchesvspatriarchy/wvsp_clean_comments.csv',
    year=2020,
    month=12,
    day=22,
)

  df = pd.read_csv(source_dir)


This df originally has 43894 comments.
Pre:23015, Post:20879
9560 comments were deleted/removed.
Automod posted 0 comments.
Post: 15610, Pre: 18724, Total: 34334


In [79]:
# Clean the r/WitchesVSPatriarchy submissions around 2020-12-22.
# The cleaned file is saved next to its source data; it was previously written
# into ./data/exfds/, the directory of a different subreddit.
wvsp_subm = clean_subm(
    source_dir='./data/witchesvspatriarchy/wvsp_subm.csv',
    save_dir='./data/witchesvspatriarchy/wvsp_clean_subm.csv',
    year=2020,
    month=12,
    day=22,
)

This df has 3075 submissions.
Pre:1664, Post:1411
664 submissions has content
251 comments were deleted or removed.
Automod posted 0 submissions.
1004 submissions are images pre.
894 submissions are images post.
Pre: 238, Post: 175, Total: 413


  df = pd.read_csv(source_dir)


# Manual as reference

### After comments

In [None]:
fds_comments_after = pd.read_csv('./data/fds_comments_after.csv')

In [None]:
#Only choose what relevant
fds_comments_after_df = fds_comments_after[['body','author','created_utc','retrieved_on','permalink','parent_id','subreddit','score']]

In [None]:
#Change epoch time to human time 
fds_comments_after_df['created_utc'] = pd.to_datetime(fds_comments_after_df['created_utc'], unit='s')
fds_comments_after_df['retrieved_on'] = pd.to_datetime(fds_comments_after_df['retrieved_on'], unit='s')

In [None]:
#Get month (Oct and Nov)
# Two conditions, to check if data is what I want
fds_comments_after_df[(fds_comments_after_df['created_utc'].dt.month == 10) & (fds_comments_after_df['created_utc'].dt.day == 28)]

In [None]:
#Get month (Oct and Nov) - 1 mo after bot; Dec - 2 mo after bot; Jan - 3 mo after bot
fds_comments_after_1mo_df = fds_comments_after_df[(fds_comments_after_df['created_utc'].dt.month == 10) | (fds_comments_after_df['created_utc'].dt.month == 11)]
fds_comments_after_2mo_df = fds_comments_after_df[fds_comments_after_df['created_utc'].dt.month == 12]
fds_comments_after_3mo_df = fds_comments_after_df[fds_comments_after_df['created_utc'].dt.month == 1]

In [None]:
#Save datasets
fds_comments_after_1mo_df.to_csv('fds_comments_after_1mo.csv')
fds_comments_after_2mo_df.to_csv('fds_comments_after_2mo.csv')
fds_comments_after_3mo_df.to_csv('fds_comments_after_3mo.csv')

### Before comments
Replicate the steps above

In [None]:
fds_comments_before = pd.read_csv('fds_comments_before.csv')

In [None]:
fds_comments_before_df = fds_comments_before[['body','author','created_utc','retrieved_on','permalink','parent_id','subreddit','score']]

In [None]:
#Change epoch time to human time 
fds_comments_before_df['created_utc'] = pd.to_datetime(fds_comments_before_df['created_utc'], unit='s')
fds_comments_before_df['retrieved_on'] = pd.to_datetime(fds_comments_before_df['retrieved_on'], unit='s')

In [None]:
#Check if the date range is what I want
fds_comments_before_df[(fds_comments_before_df['created_utc'].dt.month == 10) & (fds_comments_before_df['created_utc'].dt.day == 27)]

In [None]:
#Get month
fds_comments_before_1mo_df = fds_comments_after_df[fds_comments_after_df['created_utc'].dt.month == 10]
fds_comments_before_2mo_df = fds_comments_after_df[fds_comments_after_df['created_utc'].dt.month == 9]
fds_comments_before_3mo_df = fds_comments_after_df[fds_comments_after_df['created_utc'].dt.month == 8]
fds_comments_before_4mo_df = fds_comments_after_df[fds_comments_after_df['created_utc'].dt.month == 7]

In [None]:
#Save datasets
fds_comments_before_1mo_df.to_csv('fds_comments_before_1mo.csv')
fds_comments_before_2mo_df.to_csv('fds_comments_before_2mo.csv')
fds_comments_before_3mo_df.to_csv('fds_comments_before_3mo.csv')
fds_comments_before_4mo_df.to_csv('fds_comments_before_4mo.csv')

### Submissions before

In [None]:
fds_sub_b4 = pd.read_csv('./data/fds_submissions_before.csv')

In [None]:
fds_sub_b4_df = fds_sub_b4[['author','author_flair_text','created_utc','retrieved_on','domain','full_link','id','is_reddit_media_domain','permalink','is_video','locked','num_comments','subreddit','subreddit_id','score','selftext','subreddit_subscribers','title','total_awards_received','updated_utc','removed_by','poll_data']]

In [None]:
#Change to human date
fds_sub_b4_df['created_utc'] = pd.to_datetime(fds_sub_b4_df['created_utc'], unit='s')
fds_sub_b4_df['retrieved_on'] = pd.to_datetime(fds_sub_b4_df['retrieved_on'], unit='s')
fds_sub_b4_df['updated_utc'] = pd.to_datetime(fds_sub_b4_df['updated_utc'], unit='s')

In [None]:
#Check if the date range is what I want
fds_sub_b4_df[(fds_sub_b4_df['created_utc'].dt.month == 7) & (fds_sub_b4_df['created_utc'].dt.day == 27)].head()

In [None]:
#Get month
fds_subm_before_1mo_df = fds_sub_b4_df[fds_sub_b4_df['created_utc'].dt.month == 10]
fds_subm_before_2mo_df = fds_sub_b4_df[fds_sub_b4_df['created_utc'].dt.month == 9]
fds_subm_before_3mo_df = fds_sub_b4_df[fds_sub_b4_df['created_utc'].dt.month == 8]
fds_subm_before_4mo_df = fds_sub_b4_df[fds_sub_b4_df['created_utc'].dt.month == 7]

In [None]:
#Save datasets
fds_subm_before_1mo_df.to_csv('./data/fds_subm_before_1mo.csv')
fds_subm_before_2mo_df.to_csv('./data/fds_subm_before_2mo.csv')
fds_subm_before_3mo_df.to_csv('./data/fds_subm_before_3mo.csv')
fds_subm_before_4mo_df.to_csv('./data/fds_subm_before_4mo.csv')

### Submissions after

In [None]:
fds_sub_after = pd.read_csv('./data/fds_submissions_after.csv')

In [None]:
fds_sub_after_df = fds_sub_after[['author','author_flair_text','created_utc','retrieved_on','domain','full_link','id','is_reddit_media_domain','permalink','is_video','locked','num_comments','subreddit','subreddit_id','score','selftext','subreddit_subscribers','title','total_awards_received','updated_utc','removed_by']]

In [None]:
#Change to human date
fds_sub_after_df['created_utc'] = pd.to_datetime(fds_sub_after_df['created_utc'], unit='s')
fds_sub_after_df['retrieved_on'] = pd.to_datetime(fds_sub_after_df['retrieved_on'], unit='s')
fds_sub_after_df['updated_utc'] = pd.to_datetime(fds_sub_after_df['updated_utc'], unit='s')

In [None]:
fds_sub_after_df[(fds_sub_after_df['created_utc'].dt.month == 10) & (fds_sub_after_df['created_utc'].dt.day == 28)].head()

In [None]:
#Get month
fds_subm_after_1mo_df = fds_sub_after_df[(fds_sub_after_df['created_utc'].dt.month == 11) &(fds_sub_after_df['created_utc'].dt.month == 10)]
fds_subm_after_2mo_df = fds_sub_after_df[fds_sub_after_df['created_utc'].dt.month == 12]
fds_subm_after_3mo_df = fds_sub_after_df[fds_sub_after_df['created_utc'].dt.month == 1]

In [None]:
#Save datasets
fds_subm_after_1mo_df.to_csv('./data/fds_subm_after_1mo.csv')
fds_subm_after_2mo_df.to_csv('./data/fds_subm_after_2mo.csv')
fds_subm_after_3mo_df.to_csv('./data/fds_subm_after_3mo.csv')

## Anomaly Analysis
For Comments: Deleted comments

For Submissions: Deleted submissions, non-text content

In [None]:
#Import BEFORE data. Dataset already in this notebook
fds_b4_1mo =  pd.read_csv('./data/fds_comments_before_1mo.csv')

#Number of deleted and remove
len(fds_b4_1mo[(fds_b4_1mo['body'] == '[deleted]') | (fds_b4_1mo['body'] == '[removed]')])/len(fds_b4_1mo)*100

In [None]:
#Import AFTER data
fds_aft_1mo =  pd.read_csv('./data/fds_comments_after_1mo.csv')

#Number of deleted and remove
len(fds_aft_1mo[(fds_aft_1mo['body'] == '[deleted]') | (fds_aft_1mo['body'] == '[removed]')])/len(fds_aft_1mo)*100

### Number of deleted comments - Other months

In [None]:
#Import BEFORE data - Jul
fds_jul =  pd.read_csv('./data/fds_comments_before_4mo.csv')

#Number of deleted and remove
len(fds_jul[(fds_jul['body'] == '[deleted]') | (fds_jul['body'] == '[removed]')])/len(fds_jul)*100

In [None]:
#Import BEFORE data - Aug
fds_aug =  pd.read_csv('./data/fds_comments_before_3mo.csv')

#Number of deleted and remove
len(fds_aug[(fds_aug['body'] == '[deleted]') | (fds_aug['body'] == '[removed]')])/len(fds_aug)*100

In [None]:
#Import BEFORE data - Sep
fds_sep =  pd.read_csv('./data/fds_comments_before_2mo.csv')

#Number of deleted and remove
len(fds_sep[(fds_sep['body'] == '[deleted]') | (fds_sep['body'] == '[removed]')])/len(fds_sep)*100

In [None]:
#Import AFTER data - Dec
fds_dec =  pd.read_csv('./data/fds_comments_after_2mo.csv')

#Number of deleted and remove
len(fds_dec[(fds_dec['body'] == '[deleted]') | (fds_dec['body'] == '[removed]')])/len(fds_dec)*100

In [None]:
#Import AFTER data - Jan
fds_jan =  pd.read_csv('./data/fds_comments_after_3mo.csv')

#Number of deleted and remove
len(fds_jan[(fds_jan['body'] == '[deleted]') | (fds_jan['body'] == '[removed]')])/len(fds_jan)*100