In [1]:
import pandas as pd
import re

In [2]:
# Input: raw comments CSV, loaded into comments_df further down.
comments_file = 'csv_files/Comments/AITA_comments_2023_12.csv'
# Input: raw submissions CSV, loaded into sub_df further down.
submissions_file = 'csv_files/Submissions/AITA_submissions_2023_12.csv'
# Output: destination of the labelled submissions written by the last cell.
output_file = 'csv_files/Labelled_NA/2023_12.csv'

# Comments - The top level comments will be used for labelling the posts dataset.

In [3]:
# Load the raw comments dump.
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed dtypes.
# NOTE(review): the warning fragment in this cell's saved output looks like a
# pandas DtypeWarning for this read - confirm it disappears after this change.
comments_df = pd.read_csv(comments_file, low_memory=False)
comments_df.shape

  comments_df = pd.read_csv(comments_file)


(1248172, 72)

In [4]:
# List every column of the raw comments frame before it is narrowed down.
comments_df.columns

Index(['permalink', 'associated_award', 'created', 'subreddit_name_prefixed',
       'downs', 'num_reports', 'author_flair_type', 'comment_type',
       'treatment_tags', 'score_hidden', 'author_cakeday', 'subreddit_id',
       'parent_id', 'banned_by', 'body', 'mod_note', 'total_awards_received',
       'gildings', 'approved_at_utc', 'can_mod_post', 'created_utc',
       'all_awardings', '_meta', 'collapsed', 'gilded', 'locked', 'saved',
       'ups', 'awarders', 'author_fullname', 'collapsed_reason',
       'distinguished', 'author_flair_text', 'no_follow', 'mod_reason_by',
       'retrieved_on', 'stickied', 'link_id', 'author_premium',
       'top_awarded_type', 'author_flair_template_id', 'subreddit_type',
       'author_is_blocked', 'mod_reason_title', 'replies', 'name',
       'user_reports', 'author_flair_background_color', 'likes', 'subreddit',
       'is_submitter', 'edited', 'id', 'removal_reason', 'controversiality',
       'banned_at_utc', 'editable', 'can_gild', 'author',


## Drop all columns except link_id, parent_id and body

In [5]:
# Only these three fields are used by the cells that follow.
kept_comment_columns = ['link_id', 'parent_id', 'body']
comments_df = comments_df[kept_comment_columns]

## Filter out only the top level comments

In [6]:
# Rows whose parent_id equals their link_id are the top-level comments.
is_top_level = comments_df['link_id'] == comments_df['parent_id']

# After filtering, parent_id carries no extra information, so it is dropped.
comments_df = comments_df[is_top_level].drop(columns=['parent_id'])
comments_df.shape

(694569, 2)

In [7]:
# Peek at the text of the first few top-level comments.
comments_df['body'].head()

0    I think many people make the mistake of thinki...
1    NTA, HOWEVER, windfalls like that could easily...
2    Nta. Sue him. I once heard a guy sued a family...
5     Pretty sure I read this story a month or so ago.
8    Yta \n\nI live on a road where the limit is 35...
Name: body, dtype: object

In [8]:
# Keep only comments that still have real text: the body must be present (not NaN)
# and must not be one of the '[removed]' / '[deleted]' placeholders.
placeholder_bodies = ['[removed]', '[deleted]']
has_real_body = comments_df['body'].notna() & ~comments_df['body'].isin(placeholder_bodies)
comments_df = comments_df[has_real_body]
comments_df.shape

(635433, 2)

In [9]:
# Phrases used to recognise automatically generated comments.
bot_str_1 = 'I am a bot, and this action was performed automatically'
bot_str_2 = 'Your post has been removed'

# A comment is treated as bot-generated when its body contains either phrase.
body_text = comments_df['body']
is_bot_comment = body_text.str.contains(bot_str_1) | body_text.str.contains(bot_str_2)

comments_df = comments_df[~is_bot_comment]
print('Shape after filtering unwanted rows:')
comments_df.shape

Shape after filtering unwanted rows:


(581184, 2)

In [10]:
# Remove columns where all values are NaN
comments_df = comments_df.loc[:, ~comments_df.isna().all(axis=0)]

# Drop duplicate rows (identical values in every remaining column), keeping the
# first occurrence. The result is reassigned rather than using inplace=True so
# the cell follows the same "new frame per step" pattern as the cells above.
comments_df = comments_df.drop_duplicates(keep='first')
comments_df.shape

(572436, 2)

In [11]:
# Preview the cleaned comments that will be labelled next.
comments_df.head()

Unnamed: 0,link_id,body
0,t3_187qp3s,I think many people make the mistake of thinki...
1,t3_187pj8a,"NTA, HOWEVER, windfalls like that could easily..."
2,t3_187n1wl,Nta. Sue him. I once heard a guy sued a family...
5,t3_187qgh4,Pretty sure I read this story a month or so ago.
8,t3_187n1wl,Yta \n\nI live on a road where the limit is 35...


In [12]:
def categorize_comments(data_frame):
    """Label every comment in ``data_frame['body']`` by the verdict acronyms it contains.

    Two columns are added to ``data_frame`` in place:

    * ``labels``: the matched labels, sorted and joined with ``', '``
      (for example ``'ntj, ytj'``), or ``'undefined'`` when nothing matched.
    * ``final_label``: the single matched label, ``'multiple'`` when more than
      one label matched, or ``'undefined'`` when nothing matched.

    Matching is case-insensitive and only accepts whole words.

    Args:
        data_frame: DataFrame with a ``body`` column holding the comment text.
            Values that are not strings (such as NaN or None) are labelled
            ``'undefined'``.

    Returns:
        None. ``data_frame`` is modified in place.
    """
    # Whole-word acronym patterns, compiled once instead of on every row.
    # 'nah' is grouped with the 'ntj' acronyms and the 'esh'/'ehs'/'eah' variants
    # with the 'ytj' acronyms; 'info' is not matched by any pattern.
    patterns = (
        (
            re.compile(r'\bnta\b|\bywnbta\b|\byntah\b|\bynta\b|\bnah\b', re.IGNORECASE),
            'ntj',
        ),
        (
            re.compile(
                r'\beah\b|\behs\b|\besh\b|\byta\b|\byat\b|\byah\b|\bah\b|\bywbta\b|\btah\b|\bytah\b',
                re.IGNORECASE,
            ),
            'ytj',
        ),
    )

    def assign_label(text):
        """Return the ``(labels, final_label)`` pair for one comment body."""
        # Missing or non-text bodies cannot contain a verdict.
        if not isinstance(text, str):
            return 'undefined', 'undefined'

        found_labels = {label for pattern, label in patterns if pattern.search(text)}
        if not found_labels:
            return 'undefined', 'undefined'

        labels_str = ', '.join(sorted(found_labels))  # Sort for consistency
        final_label = 'multiple' if len(found_labels) > 1 else labels_str
        return labels_str, final_label

    # Assign the results positionally so they always line up with the rows of the
    # frame that was passed in. This does not rely on any notebook-level variable
    # and also works for empty frames and frames with a non-unique index.
    results = [assign_label(text) for text in data_frame['body']]
    data_frame['labels'] = [labels for labels, _ in results]
    data_frame['final_label'] = [final for _, final in results]

In [13]:
# Adds the 'labels' and 'final_label' columns to comments_df in place.
categorize_comments(comments_df)

# Display the updated DataFrame
comments_df[['body', 'final_label']].head()

Unnamed: 0,body,final_label
0,I think many people make the mistake of thinki...,undefined
1,"NTA, HOWEVER, windfalls like that could easily...",multiple
2,Nta. Sue him. I once heard a guy sued a family...,ntj
5,Pretty sure I read this story a month or so ago.,undefined
8,Yta \n\nI live on a road where the limit is 35...,ytj


In [14]:
# Drop the raw text ('body') and the intermediate 'labels' column;
# the remaining columns are link_id and final_label.
comments_df = comments_df.drop(columns=['body', 'labels'])

In [15]:
# Label distribution, still including the 'undefined' and 'multiple' buckets.
comments_df['final_label'].value_counts()

ntj          277885
ytj          144784
undefined    137238
multiple      12529
Name: final_label, dtype: int64

In [16]:
# Comments without exactly one recognised verdict cannot vote, so rows labelled
# 'undefined' or 'multiple' are discarded.
unusable_labels = ['undefined', 'multiple']
comments_df = comments_df[~comments_df['final_label'].isin(unusable_labels)]
comments_df.shape

(422669, 2)

In [17]:
# Check the number of comments for each label that remains after filtering
comments_df['final_label'].value_counts()

ntj    277885
ytj    144784
Name: final_label, dtype: int64

In [18]:
# Number of labelled top-level comments per submission (link_id).
comments_df['link_id'].value_counts()

t3_18ed9aq    2360
t3_18jbjgm    2324
t3_18mbv9j    2198
t3_18j5z07    1924
t3_18l7q59    1886
              ... 
t3_18lqr53       1
t3_18lhtdz       1
t3_18lrgo4       1
t3_18lrt9b       1
t3_18uzmbf       1
Name: link_id, Length: 15329, dtype: int64

In [19]:
# Get the label with the maximum occurrence for each submission (link_id)

In [20]:
# Step 1: Count how many comments carry each label for each link_id
counts = comments_df.groupby(['link_id', 'final_label']).size().reset_index(name='count')

# Step 2: Keep the label with the highest count for each link_id.
# Rows are ordered by descending count within each link_id and drop_duplicates
# keeps the first row per link_id. 'final_label' is an explicit last sort key so
# a tie is always resolved the same way (the alphabetically first label wins)
# instead of depending on the order in which the rows happen to arrive.
result_df = (
    counts
    .sort_values(by=['link_id', 'count', 'final_label'], ascending=[True, False, True])
    .drop_duplicates(subset='link_id', keep='first')
)

# result_df now has one row per link_id; after dropping the helper 'count'
# column it contains only 'link_id' and 'final_label'.
result_df = result_df.drop(columns='count')


In [21]:
# Preview the label selected for each submission.
result_df.head()

Unnamed: 0,link_id,final_label
0,t3_142jic8,ntj
2,t3_143yx8q,ntj
3,t3_144u0s6,ntj
4,t3_149bfca,ntj
5,t3_149ssj1,ytj


In [22]:
# Spot check: look up the labelled comments for one hard-coded submission id.
# NOTE(review): the saved output for this cell is empty - confirm the check is still needed.
comments_df[comments_df['link_id'] == 't3_1agspp0']

Unnamed: 0,link_id,final_label


# Submissions

In [23]:
# Load the raw submissions dump.
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed dtypes.
# NOTE(review): the warning fragment in this cell's saved output looks like a
# pandas DtypeWarning for this read - confirm it disappears after this change.
sub_df = pd.read_csv(submissions_file, low_memory=False)
sub_df.shape

  sub_df = pd.read_csv(submissions_file)


(32970, 111)

In [24]:
# Preview the raw submissions frame.
sub_df.head()

Unnamed: 0,_meta,all_awardings,allow_live_comments,approved_at_utc,approved_by,archived,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,...,url,user_reports,view_count,visited,whitelist_status,wls,author_cakeday,link_flair_template_id,post_hint,preview
0,{'retrieved_2nd_on': 1701518430},[],False,,,False,Akziong511,,,[],...,https://www.reddit.com/r/AmItheAsshole/comment...,[],,False,some_ads,7,,,,
1,{'retrieved_2nd_on': 1701518465},[],False,,,False,AdAlternative6171,,,[],...,https://www.reddit.com/r/AmItheAsshole/comment...,[],,False,some_ads,7,,,,
2,"{'removal_type': 'moderator', 'retrieved_2nd_o...",[],False,,,False,Objective_Mistake611,,,[],...,https://www.reddit.com/r/AmItheAsshole/comment...,[],,False,some_ads,7,,,,
3,"{'is_edited': True, 'retrieved_2nd_on': 170151...",[],False,,,False,Soggy_Rope_,,,[],...,https://www.reddit.com/r/AmItheAsshole/comment...,[],,False,some_ads,7,,,,
4,"{'removal_type': 'moderator', 'retrieved_2nd_o...",[],False,,,False,Lacy7357,,,[],...,https://www.reddit.com/r/AmItheAsshole/comment...,[],,False,some_ads,7,,,,


In [25]:
# List the columns available in the raw submissions frame.
sub_df.columns

Index(['_meta', 'all_awardings', 'allow_live_comments', 'approved_at_utc',
       'approved_by', 'archived', 'author', 'author_flair_background_color',
       'author_flair_css_class', 'author_flair_richtext',
       ...
       'url', 'user_reports', 'view_count', 'visited', 'whitelist_status',
       'wls', 'author_cakeday', 'link_flair_template_id', 'post_hint',
       'preview'],
      dtype='object', length=111)

## Drop all columns except selftext and name (the submission id column)

In [26]:
# 'selftext' is the post text and 'name' is the key used for the merge below.
kept_submission_columns = ['selftext', 'name']
sub_df = sub_df[kept_submission_columns]
sub_df.shape

(32970, 2)

In [27]:
# Attach the post text to every labelled link_id. The left join keeps all rows
# of result_df, including ids that have no matching submission.
sub_df = pd.merge(
    left=result_df,
    right=sub_df,
    how='left',
    left_on='link_id',
    right_on='name',
)
sub_df.shape

(15329, 4)

In [28]:
# Keep only submissions that still have real text: 'selftext' must be present
# (not NaN) and must not be one of the '[removed]' / '[deleted]' placeholders.
placeholder_selftexts = ['[removed]', '[deleted]']
has_real_selftext = sub_df['selftext'].notna() & ~sub_df['selftext'].isin(placeholder_selftexts)
sub_df = sub_df[has_real_selftext]
sub_df.shape

(10626, 4)

In [29]:
# NOTE(review): repeats the shape check that ends the previous cell; candidate for removal.
sub_df.shape

(10626, 4)

In [30]:
# Preview the labelled submissions that are about to be saved.
sub_df.head()

Unnamed: 0,link_id,final_label,selftext,name
655,t3_187xeed,ytj,I have never been on here asking for help befo...,t3_187xeed
656,t3_187xeha,ytj,While mv girlfriend 29F was working in Montrea...,t3_187xeha
657,t3_187xh0d,ntj,I'm the most junior member of a small team (bo...,t3_187xh0d
658,t3_187xmox,ntj,"We lived in 4 single bedroom dorm, my roommate...",t3_187xmox
659,t3_187xooh,ntj,So I’m a bit of a mess. Due to my mental healt...,t3_187xooh


# Save Labelled File

In [31]:
# Write the labelled submissions to output_file; index=False leaves the
# DataFrame index out of the CSV.
sub_df.to_csv(output_file, index=False)