In [2]:
import pandas as pd
import matplotlib.pyplot as plt

text_comments Data Loading + Cleaning

In [3]:
# Open the comments CSV as a chunked reader (1,000,000 rows per chunk) instead of
# loading the whole file at once; the chunks are consumed further down.
text_reader = pd.read_csv(
    "data/text_comments.csv",
    lineterminator='\n',
    chunksize=1000000,
    iterator=True,
)

In [None]:
# text_comments.csv is very large, so we split it into 2 time-based dataframes: before Feb 1, 2020, and on or after Feb 1, 2020
# Each dataframe will have id, link_id, body, and created_utc
# Bot Detection: The dataframes will not contain author names; before dropping that column, we remove rows whose author name contains "bot" or "mod"
# Removed Comments: If a comment has body [deleted] or [removed], we remove it as well
# It's easier to deal with 2 smaller datasets, but if necessary we can concatenate the pre- and post-covid dataframes later on

In [4]:
# Accumulator for the per-chunk dataframes; its entries are pruned later on.
lst: list = []
# Scratch name that gets rebound to each chunk's dataframe while the CSV is read.
temp = pd.DataFrame()

In [5]:
# Walk the chunked reader and keep, for every chunk, only the columns that are
# used downstream; one dataframe per chunk is collected in `lst`.
keep_cols = ["id", "link_id", "author", "body", "created_utc"]
for chunk in text_reader:
    temp = chunk[keep_cols]
    lst.append(temp)
# time: 5m

In [6]:
# Number of per-chunk dataframes collected in `lst`.
len(lst)

41

In [22]:
# Row count of each pruned dataframe, kept so we can check that no rows go missing by the end.
lst_shape = []

In [31]:
# Prune every per-chunk dataframe in `lst`: remove deleted/removed comments,
# remove comments from bot/mod accounts, drop the author column, and cast
# created_utc to int.


def _prune_comments(frame):
    """Return a cleaned version of one chunk of comments.

    Rows are removed when the body is "[removed]" or "[deleted]", or when the
    lower-cased author name contains "bot" or "mod". The author column is then
    dropped and created_utc is cast to int.

    A frame that no longer has an author column is treated as already pruned
    and returned unchanged, so re-running the cell does not raise a KeyError.
    """
    if "author" not in frame.columns:
        return frame

    body = frame["body"]
    is_removed = (body == "[removed]") | (body == "[deleted]")
    # na=False: a missing author name counts as "not a bot", stated explicitly
    # instead of leaving NaN in the mask for pandas to coerce.
    # NOTE(review): this is a substring match, so names such as "robot" or
    # "modern" are removed too -- confirm that is intended.
    is_bot_or_mod = frame["author"].str.lower().str.contains("bot|mod", na=False)

    return (
        frame[~(is_removed | is_bot_or_mod)]
        .drop(columns=["author"])  # we're not keeping the author column
        .astype({"created_utc": int})  # transform date into int
    )


# Start the tally from scratch so that a re-run does not double count rows.
lst_shape.clear()

for i in range(len(lst)):
    print(i)
    lst[i] = _prune_comments(lst[i])
    lst_shape.append(lst[i].shape[0])
    # time: 10m
    

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40


In [32]:
# Total number of comments left after pruning (36449566 on the recorded run).
sum(lst_shape)

36449566

In [33]:
# Two lists of per-chunk dataframes, holding the comments split at Feb 1, 2020 UTC
# Start of the pandemic: February 1, 2020
# created_utc is a UNIX timestamp. Feb 1, 2020 00:00:00 UTC = 1580515200

# Comments created before the cutoff
lst_precovid = []
# Comments created on or after the cutoff
lst_postcovid = []

In [34]:
# Split every dataframe in `lst` at the pandemic cutoff and collect the two
# halves in `lst_precovid` / `lst_postcovid`.

# Feb 1, 2020 00:00:00 UTC as a UNIX timestamp
COVID_START_UTC = 1580515200

# Empty both lists first so that re-running the cell does not append every
# chunk a second time (which would duplicate rows in the later concat).
lst_precovid.clear()
lst_postcovid.clear()

for i in range(len(lst)):
    print(i)

    # Evaluate the comparison once. created_utc was cast to int during
    # pruning, so it holds no NaN and the negated mask is exactly ">= cutoff".
    is_precovid = lst[i]["created_utc"] < COVID_START_UTC

    # Pre-covid
    lst_precovid.append(lst[i][is_precovid])
    # Post-covid
    lst_postcovid.append(lst[i][~is_precovid])

    # time: 7-10m

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40


In [36]:
# Concatenate all the dataframes that have comments before Feb 1, 2020 UTC into 1 dataframe.
# The row labels carried by the per-chunk frames are kept as they are (ignore_index is not set).
df_comments_precovid = pd.concat(lst_precovid) # time: 2-3m

In [37]:
# Concatenate all the dataframes that have comments on or after Feb 1, 2020 UTC into 1 dataframe.
# The row labels carried by the per-chunk frames are kept as they are (ignore_index is not set).
df_comments_postcovid = pd.concat(lst_postcovid) # time: 3min

In [40]:
# Preview the pre-covid comments (the recorded output is a truncated view of the frame).
df_comments_precovid

Unnamed: 0,id,link_id,body,created_utc
3058,t1_fg500j1,t3_ewseet,I really love how precise she is with makeup a...,1580514977
3062,t1_fd2eb6n,t3_ejq7ek,That part of the whole story is extremely susp...,1578145985
3069,t1_fd3xequ,t3_ejuxan,"Prep: If it's a constant-work period, I let pe...",1578162447
3070,t1_fd4llkb,t3_ek1pnm,Can you repost linking the actual video and no...,1578168690
3076,t1_fd63yhw,t3_ek3ifu,This gave me weird al vibes and I’m living for...,1578185200
...,...,...,...,...
40721545,t1_ew3s0p8,t3_cmccmo,"Did people forget Finn had to face Corbin, Jin...",1565092947
40721546,t1_ennpzrt,t3_bp2f3w,I hope,1557949685
40721547,t1_f02psre,t3_d3f1tn,I will literally cry if you're right dude xD L...,1568335635
40721548,t1_f5t4bhw,t3_dp6jz9,Wow. That's incredibly hostile for no reason.,1572450141


In [42]:
# (rows, columns) of the post-covid comments; (22814508, 4) on the recorded run.
df_comments_postcovid.shape

(22814508, 4)

In [None]:
# Sanity check on the sizes: the raw data has 40721550 rows in total (index 0 through 40721549)
# After pruning: 22814508 (post-covid) + 13635058 (pre-covid) = sum(lst_shape) = 36449566

In [71]:
# df_comments_precovid.to_csv('data/df_comments_pre.csv') # time: 
# df_comments_postcovid.to_csv('data/df_comments_post.csv') # time: 

text_submissions Data Loading + Cleaning

In [10]:
# Load all submissions in one go.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up with a mix of parsed
# types. NOTE(review): the recorded output of this cell echoes the source line
# the way a mixed-type DtypeWarning does, and created_utc is known to contain
# at least one non-numeric value ("CPTSD") -- confirm the warning is gone.
text_submissions_df = pd.read_csv("data/text_submissions.csv", low_memory=False)

  text_submissions_df = pd.read_csv("data/text_submissions.csv")


In [11]:
# Preview the raw submissions (the recorded output is a truncated view of the frame).
text_submissions_df

Unnamed: 0,id,author,created_utc,domain,is_self,score,selftext,title,subreddit
0,t3_npxigk,All_Consuming_Void,1622563615,self.BeautyGuruChatter,True,0.0,[removed],Hyram launches his own brand,BeautyGuruChatter
1,t3_nqj6bf,AutoModerator,1622631621,self.BeautyGuruChatter,True,38.0,What are the influencers trying to influence y...,What I'm not gonna buy Wednesday - Anti-haul,BeautyGuruChatter
2,t3_nk0btr,barrahhhh,1621869439,reddit.com,False,144.0,,Plouise goes off in facebook group for 'bullying',BeautyGuruChatter
3,t3_nrbybs,[deleted],1622722260,self.BeautyGuruChatter,True,2.0,[deleted],Is youtube algorithm against Susan Yara? She g...,BeautyGuruChatter
4,t3_nl0ebd,carlosShook,1621977767,vm.tiktok.com,False,0.0,,Sephora steals concept from Huntr Faulknr afte...,BeautyGuruChatter
...,...,...,...,...,...,...,...,...,...
3496175,t3_ae5pzt,middlefinger22,1547030976,self.yakuzagames,True,1.0,And they won't have so much changes as Kiwami2.,"Why it takes so long to release 3, 4 and 5 on ...",yakuzagames
3496176,t3_cer9jj,AutoModerator,1563449362,self.nrl,True,15.0,|NRL| ...,Round 18: Broncos vs Bulldogs | Post Match Dis...,nrl
3496177,t3_bqidb1,nuke8960,1558280683,i.redd.it,False,35.0,,Gotta spread the word,BirdsArentReal
3496178,t3_cqvbwk,anon3212,1565900167,self.ShadowBan,True,1.0,Am I shadowbanned?,Test,ShadowBan


In [12]:
# Drop columns that aren't useful for us, to make the dataframe smaller.
# errors="ignore" keeps the cell re-runnable: text_submissions_df is rebound
# here, so on a second run 'score' is already gone and a plain drop would
# raise a KeyError.
text_submissions_df = text_submissions_df.drop(columns=["score"], errors="ignore")

In [13]:
# A row carries the text "CPTSD" in created_utc instead of a timestamp (its
# other columns are nan as well), so keep everything except that row.
has_valid_date = text_submissions_df["created_utc"] != "CPTSD"
text_submissions_df = text_submissions_df[has_valid_date]

In [14]:
# Cast created_utc to int. astype with a column mapping returns a new frame,
# so nothing is assigned into the boolean-filtered slice produced by the
# previous step (column assignment on such a slice can raise
# SettingWithCopyWarning).
text_submissions_df = text_submissions_df.astype({"created_utc": int})

In [15]:
# Start of the pandemic: February 1, 2020
# created_utc is a UNIX timestamp. Feb 1, 2020 00:00:00 UTC = 1580515200
covid_start_utc = 1580515200
submission_times = text_submissions_df["created_utc"]

# Submissions before the cutoff vs. on or after it
text_submissions_df_pre = text_submissions_df[submission_times < covid_start_utc]
text_submissions_df_post = text_submissions_df[submission_times >= covid_start_utc]

In [84]:
# TODO (Q): Is bot detection also necessary for posts (text_submissions), or only for text_comments?

In [None]:
# Write out the 4 updated CSVs: the cleaned comments and submissions data, each split into 2 files (pre- and post-covid),
# because smaller files are easier to load and work with
# text_submissions_df_pre.to_csv('data/text_submissions_pre.csv')
# text_submissions_df_post.to_csv("data/text_submissions_post.csv")