# Title of the proj

In [37]:
import pandas as pd
import matplotlib.pyplot as plt

# https://huggingface.co/bhadresh-savani/distilbert-base-uncased-emotion?text=I+feel+a+bit+let+down
from transformers import pipeline

from tqdm import tqdm # prgress bar

## Loading Comments and Submissions Data

In [56]:
text_submissions_df = pd.read_csv("text_submissions.csv")
text_comments_df = pd.read_csv("text_comments.csv", iterator=True, chunksize=1000000,lineterminator='\n')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [12]:
# instead of looping over we can do get_chunk which gets the next 1M chunk of data in comments
# to set this back to the beginning of the data, recreate the df iterator with the line above
text_comments_df.get_chunk()

Unnamed: 0,id,score,link_id,author,subreddit,body,created_utc
0,t1_ftjl56l,4,t3_gzv6so,mega_trex,BeautyGuruChatter,Does anyone have a good cruelty free one? The ...,1591755558
1,t1_ftjpxmc,6,t3_gzv6so,[deleted],BeautyGuruChatter,(stares at my soft glam i've had for like 3 ye...,1591758382
2,t1_gzzxfyt,22,t3_nodb9e,divadream,BeautyGuruChatter,When Jen‚Äôs initial reactions came out to the s...,1622398357
3,t1_gzzy7nc,92,t3_no6qaj,Ziegenkoennenfliegen,BeautyGuruChatter,I think you mean a \n>Highschool *fucking* bully,1622398743
4,t1_h00tpbp,82,t3_nolx7p,meowrottenralph,BeautyGuruChatter,Ugh. I was honestly hoping that this brand wou...,1622414834
...,...,...,...,...,...,...,...
999995,t1_gworr24,1,t3_n3aw7j,[deleted],FreeKarma4U,[removed],1619977051
999996,t1_gworra8,1,t3_n3azvb,[deleted],FreeKarma4U,[removed],1619977054
999997,t1_gzyn0mk,1,t3_no81im,leusee_xx,FreeKarma4U,Upvoted. Please upvote my latest post üôèüèª,1622369532
999998,t1_gzyn2ee,1,t3_no85gx,[deleted],FreeKarma4U,[removed],1622369584


In [11]:
text_submissions_df

Unnamed: 0,id,author,created_utc,domain,is_self,score,selftext,title,subreddit
0,t3_npxigk,All_Consuming_Void,1622563615,self.BeautyGuruChatter,True,0.0,[removed],Hyram launches his own brand,BeautyGuruChatter
1,t3_nqj6bf,AutoModerator,1622631621,self.BeautyGuruChatter,True,38.0,What are the influencers trying to influence y...,What I'm not gonna buy Wednesday - Anti-haul,BeautyGuruChatter
2,t3_nk0btr,barrahhhh,1621869439,reddit.com,False,144.0,,Plouise goes off in facebook group for 'bullying',BeautyGuruChatter
3,t3_nrbybs,[deleted],1622722260,self.BeautyGuruChatter,True,2.0,[deleted],Is youtube algorithm against Susan Yara? She g...,BeautyGuruChatter
4,t3_nl0ebd,carlosShook,1621977767,vm.tiktok.com,False,0.0,,Sephora steals concept from Huntr Faulknr afte...,BeautyGuruChatter
...,...,...,...,...,...,...,...,...,...
3496175,t3_ae5pzt,middlefinger22,1547030976,self.yakuzagames,True,1.0,And they won't have so much changes as Kiwami2.,"Why it takes so long to release 3, 4 and 5 on ...",yakuzagames
3496176,t3_cer9jj,AutoModerator,1563449362,self.nrl,True,15.0,|NRL| ...,Round 18: Broncos vs Bulldogs | Post Match Dis...,nrl
3496177,t3_bqidb1,nuke8960,1558280683,i.redd.it,False,35.0,,Gotta spread the word,BirdsArentReal
3496178,t3_cqvbwk,anon3212,1565900167,self.ShadowBan,True,1.0,Am I shadowbanned?,Test,ShadowBan


## Data Cleaning and Preprocessing

In [57]:
# Dropping columns that aren't useful for us in order to make the dataframes smaller
text_submissions_df=text_submissions_df.drop(columns=['score'])

# Drop submissions with title or text that is empty or na
text_submissions_df.drop(text_submissions_df[text_submissions_df['title'].isna() | (text_submissions_df['title'] == '')].index, inplace=True)
text_submissions_df.drop(text_submissions_df[text_submissions_df['selftext'].isna() | (text_submissions_df['selftext'] == '')].index, inplace=True)
# text_submissions_df.dropna(subset=['title'], inplace=True) # Drop na or empty titles

# Drop this row because it has an invalid date and other columns are nan
text_submissions_df=text_submissions_df[~(text_submissions_df['created_utc'] == "CPTSD")]

text_submissions_df['created_utc'] = text_submissions_df['created_utc'].astype(int)

### Bot detection

In [None]:
# Bot Detection

### Separating data to pre-pandemic and pandemic

In [60]:
# start of the pandemic as of February 1, 2020
# created_at is a UNIX timestamp. Feb 1, 2020 00:00:00 UTC = 1580515200
start_of_pandemic = 1580515200
text_submissions_df_pre = text_submissions_df[text_submissions_df["created_utc"] < start_of_pandemic]
text_submissions_df_post = text_submissions_df[text_submissions_df["created_utc"] >= start_of_pandemic]

### Detecting COVID related posts
We detect submissions related to covid by searching in the their title for covid related words (with no case sensitivity).

In [24]:
covid_related_words = ['pandemic', 'covid']
search_pattern = '|'.join(covid_related_words)
covid_submissions = text_submissions_df[text_submissions_df['title'].str.contains(search_pattern, case=False)]

# printing the title of covid related posts
covid_submissions.head(5)

Unnamed: 0,id,author,created_utc,domain,is_self,selftext,title,subreddit
21,t3_krer7q,[deleted],1609902052,reddit.com,False,[deleted],James Charles calls out youtuber JennxPenn for...,BeautyGuruChatter
44,t3_fjm72g,ButterscotchFog,1584370462,i.redd.it,False,,Shout out to creators whose content continues ...,BeautyGuruChatter
45,t3_fko0g4,[deleted],1584530758,i.redd.it,False,[deleted],(UPDATE) Mykie/GlamandGore addressing her firs...,BeautyGuruChatter
103,t3_jzy1ls,[deleted],1606191804,self.BeautyGuruChatter,True,[removed],"I‚Äôm just Curious as to why BGs, celebs, models...",BeautyGuruChatter
126,t3_j1dj17,[deleted],1601305059,jezebel.com,False,[deleted],"Jeffree Star Took Covid-19 Loans, Months After...",BeautyGuruChatter


## Exploratory Analysis

In [30]:
# sample - delete later
classifier = pipeline("text-classification",model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True)
prediction = classifier(["I love using transformers.", "I hate you so much"],)
prediction

[[{'label': 'sadness', 'score': 0.007511448580771685},
  {'label': 'joy', 'score': 0.8325576186180115},
  {'label': 'love', 'score': 0.014407153241336346},
  {'label': 'anger', 'score': 0.13556727766990662},
  {'label': 'fear', 'score': 0.008320360444486141},
  {'label': 'surprise', 'score': 0.0016362066380679607}],
 [{'label': 'sadness', 'score': 0.048631247133016586},
  {'label': 'joy', 'score': 0.007470840588212013},
  {'label': 'love', 'score': 0.03859551250934601},
  {'label': 'anger', 'score': 0.902003288269043},
  {'label': 'fear', 'score': 0.0024381077382713556},
  {'label': 'surprise', 'score': 0.00086104596266523}]]

### Average sentiment score pre-pandemic
We want to find the average score (0-1) for each sentiment (sadness, joy, love, anger, fear, surprise) for submissions and comments for posts pre-pandemic. 

In [72]:
# TODO for now we do it for submissions - figure it out for comments as well? - takes multiple hours
# TODO even for submissions selftext it's taken 10-11 hours.. so just doing it with title right now
# TODO also there's a limit to the characters for this classifier

# for now take 5% of text_submissions_df_pre (100% of the data takes >10 hours)
ten_percent = int(len(text_submissions_df_pre) * 0.05)
text_submissions_df_pre_subset = text_submissions_df_pre.head(ten_percent)

# add new columns in submissions dataframe for each emotion and set all to NaN
emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
for emotion in emotions:
    text_submissions_df_pre_subset[emotion] = float('NaN')

def get_scores_and_update(row):
    scores = classifier(row['title'])[0]
    for score in scores:
        emotion = score['label']
        row[emotion] = score['score']
    return row
  
# extract score for each emotion and set the score to the particular column
total_rows = len(text_submissions_df_pre_subset)
with tqdm(total=total_rows) as pbar:
  for index, row in text_submissions_df_pre_subset.iterrows():
      scores = classifier(row['title'])[0]
      for score in scores:
          emotion = score['label']
          text_submissions_df_pre_subset.at[index, emotion] = score['score']
      
      pbar.update(1)
      pbar.set_description(f'Processing: {((index + 1) / total_rows) * 100:.2f}%')  # Display progress percentage

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_submissions_df_pre_subset[emotion] = float('NaN')
Processing: 4252.75%: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 33518/33518 [16:04<00:00, 34.74it/s]


In [73]:
for emotion in emotions:
  print('average', emotion, 'score:', text_submissions_df_pre_subset[emotion].mean())

average sadness score: 0.08333218761260267
average joy score: 0.39390795732316913
average love score: 0.014010731028408201
average anger score: 0.35626362880260765
average fear score: 0.14184019473937085
average surprise score: 0.0106453003270222


In [None]:
# Downloading the 4 updated cvs which have cleaned up the data and split the comments and submissions csv files into 2 
# because they are smaller files, thus easier to load up and deal with
# text_submissions_df_pre.to_csv('data/text_submissions_pre.csv')
# text_submissions_df_post.to_csv("data/text_submissions_post.csv")