# Title of the proj

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# https://huggingface.co/bhadresh-savani/distilbert-base-uncased-emotion?text=I+feel+a+bit+let+down
# from transformers import pipeline

# from tqdm import tqdm # prgress bar

## Loading Comments and Submissions Data

In [5]:
text_submissions_df = pd.read_csv("data/text_submissions.csv")

MemoryError: Unable to allocate 512. KiB for an array with shape (65536,) and data type int64

In [2]:
text_comments_df = pd.read_csv("data/text_comments.csv", iterator=True, chunksize=1000000,lineterminator='\n')

In [None]:
# text_comments.csv is very large, so going to split it into 2 time-based dataframes: before and after Feb 1, 2020 
# Each dataframe will have id, linkid, body, and created_utc
# Bot Detection: The dataframes will not contain author names but we will also remove rows with bot or mod in the author names
# Removed Comments: If a comment has body [deleted] or [removed] then, we remove those as well
# It's easier to deal with 2 smaller datasets but if necessary, we can concatenate the pre and post covid dataframes later on

In [3]:
temp = pd.DataFrame() # Will temporarily store dataframes for each chunk (not changing the data) 
lst = [] # will store dataframes after they have been pruned
count = 0
for chunk in text_comments_df: 
    print(count)
    # Add each chunk of txt_comments to a df and append it to a list
    temp = pd.DataFrame()
    temp = chunk[["id", "link_id", "author", "body", "created_utc"]]
    lst.append(temp)
    count+=1
    # time: 5m-10

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40


In [None]:
len(lst)

In [None]:
lst_shape = [] # will store how many rows are in each small dataframe - so that we can keep track that we're not missing rows by the end

# Modify each dataframe in lst by removing deleted comments, removing comments from bots, dropping the author column, and changing created_utc into int type
for i in range(len(lst)):
    print(i, end=' ')

    # remove usernames that indicate it's a bot and comments that are [removed] and [deleted]
    lst[i] = lst[i][~((lst[i]["body"] == "[removed]") | (lst[i]["body"] == "[deleted]") | lst[i]["author"].str.lower().str.contains("bot|mod"))]
    # we're not keeping author column
    lst[i] = lst[i].drop(columns=['author'])
    # transform date into int
    lst[i]['created_utc'] = lst[i]['created_utc'].astype(int)
    lst_shape.append(lst[i].shape[0])
    # time: 10m
    

In [None]:
sum(lst_shape) # the total number of comments after pruning: 36449566

In [None]:
# dataframes that will only have comments split based on Feb 01, 2022 UTC
# start of the pandemic: February 1, 2020 
# created_at is a UNIX timestamp. Feb 1, 2020 00:00:00 UTC = 1580515200

lst_precovid, lst_postcovid = [], []
start_of_covid = 1580515200

# For each dataframe in lst, create two dataframes that has comments based on dates and append it to the correct list
for i in range(len(lst)): 
    print(i)

    # Pre-covid 
    lst_precovid.append(lst[i][lst[i]["created_utc"] < start_of_covid])
    # Post-covid 
    lst_postcovid.append(lst[i][lst[i]["created_utc"] >= start_of_covid])

    # time: 7-10m

In [None]:
# Concatenate all the dataframes that have comments before Feb 1, 2020 UTC into 1 dataframe
df_comments_precovid = pd.concat(lst_precovid) # time: 2-3m

# Concatenate all the dataframes that have comments on or after Feb 1, 2020 UTC into 1 dataframe
df_comments_postcovid = pd.concat(lst_postcovid) # time: 3min

In [None]:
# df_comments_precovid
# df_comments_postcovid.shape # (22814508, 4)
# check the sizes are the same: 40721549 index. 40721550 rows total
# 22814508+13635058 = sum(lst_shape) = 36449566
# df_comments_pre.to_csv('data/df_comments_pre.csv') # time: 
# df_comments_postcovid.to_csv('data/df_comments_post.csv') # time: 

## Data Cleaning and Preprocessing

In [None]:
# Dropping columns that aren't useful for us in order to make the dataframes smaller
text_submissions_df=text_submissions_df.drop(columns=['score'])

# Drop submissions with title or text that is empty or na
text_submissions_df.drop(text_submissions_df[text_submissions_df['title'].isna() | (text_submissions_df['title'] == '')].index, inplace=True)
text_submissions_df.drop(text_submissions_df[text_submissions_df['selftext'].isna() | (text_submissions_df['selftext'] == '')].index, inplace=True)
# text_submissions_df.dropna(subset=['title'], inplace=True) # Drop na or empty titles

# Drop this row because it has an invalid date and other columns are nan
text_submissions_df=text_submissions_df[~(text_submissions_df['created_utc'] == "CPTSD")]

text_submissions_df['created_utc'] = text_submissions_df['created_utc'].astype(int)

### Bot detection

In [None]:
# TODO (Q): Bot Detection not necessary for Posts? just for text_comments?

### Separating data to pre-pandemic and pandemic

In [None]:
# start of the pandemic as of February 1, 2020
# created_at is a UNIX timestamp. Feb 1, 2020 00:00:00 UTC = 1580515200
start_of_pandemic = 1580515200
text_submissions_df_pre = text_submissions_df[text_submissions_df["created_utc"] < start_of_pandemic]
text_submissions_df_post = text_submissions_df[text_submissions_df["created_utc"] >= start_of_pandemic]

In [None]:
# Downloading the 4 updated cvs which have cleaned up the data and split the comments and submissions csv files into 2 
# because they are smaller files, thus easier to load up and deal with
# text_submissions_df_pre.to_csv('data/text_submissions_pre.csv')
# text_submissions_df_post.to_csv("data/text_submissions_post.csv")

### Detecting COVID related posts
We detect submissions related to covid by searching in the their title for covid related words (with no case sensitivity).

In [None]:
covid_related_words = ['pandemic', 'covid']
search_pattern = '|'.join(covid_related_words)
covid_submissions = text_submissions_df[text_submissions_df['title'].str.contains(search_pattern, case=False)]

# printing the title of covid related posts
covid_submissions.head(5)

Unnamed: 0,id,author,created_utc,domain,is_self,selftext,title,subreddit
21,t3_krer7q,[deleted],1609902052,reddit.com,False,[deleted],James Charles calls out youtuber JennxPenn for...,BeautyGuruChatter
44,t3_fjm72g,ButterscotchFog,1584370462,i.redd.it,False,,Shout out to creators whose content continues ...,BeautyGuruChatter
45,t3_fko0g4,[deleted],1584530758,i.redd.it,False,[deleted],(UPDATE) Mykie/GlamandGore addressing her firs...,BeautyGuruChatter
103,t3_jzy1ls,[deleted],1606191804,self.BeautyGuruChatter,True,[removed],"I’m just Curious as to why BGs, celebs, models...",BeautyGuruChatter
126,t3_j1dj17,[deleted],1601305059,jezebel.com,False,[deleted],"Jeffree Star Took Covid-19 Loans, Months After...",BeautyGuruChatter


## Exploratory Analysis

In [None]:
# sample - delete later
classifier = pipeline("text-classification",model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True)
prediction = classifier(["I love using transformers.", "I hate you so much"],)
prediction

[[{'label': 'sadness', 'score': 0.007511448580771685},
  {'label': 'joy', 'score': 0.8325576186180115},
  {'label': 'love', 'score': 0.014407153241336346},
  {'label': 'anger', 'score': 0.13556727766990662},
  {'label': 'fear', 'score': 0.008320360444486141},
  {'label': 'surprise', 'score': 0.0016362066380679607}],
 [{'label': 'sadness', 'score': 0.048631247133016586},
  {'label': 'joy', 'score': 0.007470840588212013},
  {'label': 'love', 'score': 0.03859551250934601},
  {'label': 'anger', 'score': 0.902003288269043},
  {'label': 'fear', 'score': 0.0024381077382713556},
  {'label': 'surprise', 'score': 0.00086104596266523}]]

### Average submission sentiment score pre-pandemic
We want to find the average score (0-1) for each sentiment (sadness, joy, love, anger, fear, surprise) for submissions and comments for posts pre-pandemic. 

In [None]:
# TODO for now we do it for submissions - figure it out for comments as well? - takes multiple hours
# TODO even for submissions selftext it's taken 10-11 hours.. so just doing it with title right now
# TODO also there's a limit to the characters for this classifier

# for now take 5% of text_submissions_df_pre (100% of the data takes >10 hours)
ten_percent = int(len(text_submissions_df_pre) * 0.05)
text_submissions_df_pre_subset = text_submissions_df_pre.head(ten_percent)

# add new columns in submissions dataframe for each emotion and set all to NaN
emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
for emotion in emotions:
    text_submissions_df_pre_subset[emotion] = float('NaN')

def get_scores_and_update(row):
    scores = classifier(row['title'])[0]
    for score in scores:
        emotion = score['label']
        row[emotion] = score['score']
    return row
  
# extract score for each emotion and set the score to the particular column
total_rows = len(text_submissions_df_pre_subset)
with tqdm(total=total_rows) as pbar:
  for index, row in text_submissions_df_pre_subset.iterrows():
      scores = classifier(row['title'])[0]
      for score in scores:
          emotion = score['label']
          text_submissions_df_pre_subset.at[index, emotion] = score['score']
      
      pbar.update(1)
      pbar.set_description(f'Processing: {((index + 1) / total_rows) * 100:.2f}%')  # Display progress percentage

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_submissions_df_pre_subset[emotion] = float('NaN')
Processing: 4252.75%: 100%|██████████| 33518/33518 [16:04<00:00, 34.74it/s]


In [None]:
for emotion in emotions:
  print('average', emotion, 'score:', text_submissions_df_pre_subset[emotion].mean())

average sadness score: 0.08333218761260267
average joy score: 0.39390795732316913
average love score: 0.014010731028408201
average anger score: 0.35626362880260765
average fear score: 0.14184019473937085
average surprise score: 0.0106453003270222
