## Label Dataset 

This notebook enables users to manually label the dataset as one of these three labels: 

- Positive: Positively promoting the brand
- Negative: Negatively promoting the brand
- Neutral: Doesn't fit into either of these categories

In [1]:
%%capture
pip install -r ../requirements.txt

In [2]:
import sys

# set this on the path so that we can reference the commong data locations
sys.path.append("../scripts/")

In [3]:
from access_data import authenticate_google_drive, grab_google_drive_folder_data

drive = authenticate_google_drive()
df = grab_google_drive_folder_data(drive=drive,filename="reddit_filtered_data.csv")

Successfully loaded 'reddit_filtered_data.csv' into a DataFrame!


In [4]:
df.head() 

Unnamed: 0,submission_id,subreddit_topic,search_query,title,text,url,score,num_comments,username,created_at,data_pull_date,days_since_post_date,combine_text,flag_picture_posts
0,1gum4oc,FirstTimeHomeBuyer,Rocket Mortgage,Is rocket mortgage that bad?,I’m in the market to buy a house now (21m). I ...,https://www.reddit.com/r/FirstTimeHomeBuyer/co...,0,21,Valuable-Pilot-2818,2024-11-19 01:59:25,2025-04-02,134,Is rocket mortgage that bad?. I’m in the marke...,False
1,1541el1,FirstTimeHomeBuyer,Rocket Mortgage,WARNING- do NOT work with rocket mortgage!!!,EDIT: there is an extreme lack of transparency...,https://www.reddit.com/r/FirstTimeHomeBuyer/co...,594,471,justmeAlonekitty,2023-07-19 17:31:01,2025-04-02,623,WARNING- do NOT work with rocket mortgage!!!. ...,False
2,1hqgmvb,FirstTimeHomeBuyer,Rocket Mortgage,Avoid Rocket Mortgage,I have a RM mortgage and they call me once a y...,https://www.reddit.com/r/FirstTimeHomeBuyer/co...,607,191,maz4499,2024-12-31 15:12:06,2025-04-02,92,Avoid Rocket Mortgage. I have a RM mortgage an...,False
3,1doitpg,FirstTimeHomeBuyer,Rocket Mortgage,Rocket Mortgage Everybody,Took all of my financial information and would...,https://www.reddit.com/gallery/1doitpg,0,23,varicoseballs,2024-06-25 22:31:06,2025-04-02,281,Rocket Mortgage Everybody. Took all of my fina...,False
4,1b3xvqv,FirstTimeHomeBuyer,Rocket Mortgage,Rocket Mortgage: An honest review,As a first time home buyer and a first generat...,https://www.reddit.com/r/FirstTimeHomeBuyer/co...,36,71,Separate_Bar685,2024-03-01 16:04:32,2025-04-02,397,Rocket Mortgage: An honest review. As a first ...,False


Create a folder labeled: `labeled_data/` in here locally if you'd like to run this. 

In [5]:
import ipywidgets as widgets
import pandas as pd
from IPython.display import display, clear_output

class show_visual(): 
    def __init__(self, df, file_number=1, reviewer="aserban"):
        self.df = df
        self.labels = {}  # Store labeled data
        self.batch_size = 5
        self.current_index = 0  # Track progress
        self.label_options = [ 'Unknown', 'Positive', 'Negative', 'Neutral']
        self.label_widgets = []

        self.next_button = widgets.Button(description="Next Batch", button_style='success', layout=widgets.Layout(width='200px', height='50px'))
        self.save_button = widgets.Button(description="Save & Exit", button_style='danger', layout=widgets.Layout(width='200px', height='50px'))
        self.reviewer = reviewer
        self.file_name = f"labeled_data/{reviewer}_labeled_data_{file_number}.csv"

        self.selected_columns = ['submission_id', 'subredit_topic', 'search_query', 'combine_text', 'url', 'label', 'reviewer']
        
    # Function to display the current batch
    def show_batch(self):
        clear_output(wait=True)
        if self.current_index >= len(self.df):
            print("All samples labeled!")
            return
        
        batch_end = min(self.current_index + self.batch_size, len(self.df))
        current_batch = self.df.iloc[self.current_index:batch_end]
        
        self.label_widgets.clear()        
        for i, row in current_batch.iterrows():
            dropdown = widgets.Dropdown(
                options=self.label_options,
                description=f"Label: ",
                style={'description_width': 'initial'},
                layout=widgets.Layout(width='700px')
            )
            self.label_widgets.append((i, dropdown))
            print(f"Post Number Index: {i}")

            display(widgets.VBox([
                widgets.HTML(f"Brand: <b><font color='red'>{row['search_query']}</font></b><br><br>{row['combine_text']} <br>"),
                widgets.Label(f"{row['url']}", layout=widgets.Layout(width='1000px', word_wrap='break-word')),
                dropdown
            ]))
        
        display(widgets.HBox([self.next_button, self.save_button]))

    # Function to determine whether to save labels and proceed
    def save_labels(self,continue_labeling=True):
        for index, dropdown in self.label_widgets:
            self.labels[index] = dropdown.value
        
        self.current_index += self.batch_size
        if continue_labeling:
            self.show_batch()
        else:
            self.save_data()

    # Function to save labeled data locally
    def save_data(self):
        labeled_df = self.df.copy()
        labeled_df['label'] = labeled_df.index.map(self.labels)
        labeled_df['reviewer'] = self.reviewer

        selected_df = labeled_df[~labeled_df['label'].isna()].reset_index(drop=True)
        # selected columns 
        selected_df = selected_df[self.selected_columns]

        selected_df.to_csv(self.file_name, index=False)
        print(f"Data saved as '{self.file_name}'")


    def start_manually_labelling(self): 
        
        # Bind buttons
        self.next_button.on_click(lambda _: self.save_labels(True))
        self.save_button.on_click(lambda _: self.save_labels(False))

        # Start labeling
        self.show_batch()

In [6]:
# sv = show_visual(df=df, file_number=1, reviewer="aserban")
# sv.start_manually_labelling()

In [7]:
# sv = show_visual(df=df.sample(20, random_state=42), file_number=2, reviewer="aserban")
# sv.start_manually_labelling()

In [8]:
# sv = show_visual(df=df.sample(100, random_state=13), file_number=3, reviewer="aserban")
# sv.start_manually_labelling()

In [9]:
# sv = show_visual(df=df.sample(100, random_state=7), file_number=4, reviewer="aserban")
# sv.start_manually_labelling()

In [10]:
# file_path_check = "labeled_data/aserban_labeled_data_3.csv"
# check_df = pd.read_csv(file_path_check)

In [11]:
# check_df.shape

# Combined Labeled Data 

After we saved the files locally, we the imported it into another folder within google drive, and this is how we combiend the data.


In [12]:
import sys
sys.path.append("../../scripts/")

from access_data import authenticate_google_drive, grab_google_drive_folder_data

drive = authenticate_google_drive()

In [13]:
import json 

labeled_data_folder_location_file = "credentials/google_drive_labeled_data_folder_id.json"


with open(labeled_data_folder_location_file, 'r') as file:
    google_drive_credentials = json.load(file)
folder_id = google_drive_credentials["folder_id"]

In [14]:
file_list = drive.ListFile({'q': f"'{folder_id}' in parents and trashed=false"}).GetList()

In [15]:
# Iterate through all files and combine
combined_labeled_data = pd.DataFrame()
for file in file_list: 
    file_name = file['title']        
    file_df = grab_google_drive_folder_data(drive=drive,credential_file=labeled_data_folder_location_file,filename=file_name)
    file_df = file_df[file_df['label'] != 'Unknown'].reset_index(drop=True)
    if combined_labeled_data.empty : 
        combined_labeled_data = file_df 
    else: 
        combined_labeled_data  = pd.concat([combined_labeled_data, file_df], ignore_index=True)

Successfully loaded 'chrismca_labeled_data_5.csv' into a DataFrame!
Successfully loaded 'chrismca_labeled_data_10.csv' into a DataFrame!
Successfully loaded 'chrismca_labeled_data_9.csv' into a DataFrame!
Successfully loaded 'chrismca_labeled_data_8.csv' into a DataFrame!
Successfully loaded 'chrismca_labeled_data_7.csv' into a DataFrame!
Successfully loaded 'chrismca_labeled_data_6.csv' into a DataFrame!
Successfully loaded 'chrismca_labeled_data_3.csv' into a DataFrame!
Successfully loaded 'chrismca_labeled_data_2.csv' into a DataFrame!
Successfully loaded 'chrismca_labeled_data_4.csv' into a DataFrame!
Successfully loaded 'chriscma_labeled_data_1.csv' into a DataFrame!
Successfully loaded 'aserban_labeled_data_3.csv' into a DataFrame!
Successfully loaded 'aserban_labeled_data_2.csv' into a DataFrame!
Successfully loaded 'aserban_labeled_data_1.csv' into a DataFrame!


In [16]:
# Leverage combine_text from df rather than labeled_data
combined_labeled_data = combined_labeled_data.rename(columns={'subredit_topic': 'subreddit_topic'})
columns = ['submission_id', 'subreddit_topic', 'search_query', 'label', 'reviewer']

combined_labeled_data = combined_labeled_data[columns]
df_subset = df[['submission_id', 'subreddit_topic', 'search_query', 'combine_text', 'url']]

labeled_data = combined_labeled_data.merge(df_subset, on=['submission_id', 'subreddit_topic', 'search_query'], how='left')
labeled_data = labeled_data[['submission_id', 'subreddit_topic', 'search_query', 'combine_text', 'url', 'label', 'reviewer']]


In [17]:
# Drop where it is in labeled but not in df 
labeled_data = labeled_data[~labeled_data['combine_text'].isna()].reset_index(drop=True)

In [18]:
# Make sure there are no duplicates in labeling 
combined_labeled_data_grouped = pd.DataFrame(labeled_data.groupby(['submission_id','subreddit_topic','search_query', 'combine_text', 'url']).agg({'label': ['count', 'min', 'max']})).reset_index()
combined_labeled_data_grouped.columns = ['_'.join(col) for col in combined_labeled_data_grouped.columns]

# Check if there are instances that the labels are not the same 
combined_labeled_data_grouped[combined_labeled_data_grouped['label_count'] > 1]
combined_labeled_data_grouped['same_value'] = combined_labeled_data_grouped['label_min'] == combined_labeled_data_grouped['label_max']
print("Number of conflicting labels:  ", combined_labeled_data_grouped[~combined_labeled_data_grouped['same_value']].shape[0])

# Drop instances where the label was conflicting
conflicting_submission_ids = list(combined_labeled_data_grouped[~combined_labeled_data_grouped['same_value']]['submission_id_'])
labeled_data = labeled_data[~labeled_data['submission_id'].isin(conflicting_submission_ids)].reset_index(drop=True)

# The ones that have more than 1 and are the same value , coalesce those 
labeled_data = labeled_data.drop_duplicates(
    subset=['submission_id', 'subreddit_topic', 'search_query', 'combine_text', 'url'], 
    keep='first'
).reset_index(drop=True)


Number of conflicting labels:   1


In [19]:
combined_labeled_data_grouped['same_value'] = combined_labeled_data_grouped['label_min'] == combined_labeled_data_grouped['label_max']
print("Number of conflicting labels:  ", combined_labeled_data_grouped[~combined_labeled_data_grouped['same_value']].shape[0])

# Drop instances where the label was conflicting
conflicting_submission_ids = list(combined_labeled_data_grouped[~combined_labeled_data_grouped['same_value']]['submission_id_'])
labeled_data = labeled_data[~labeled_data['submission_id'].isin(conflicting_submission_ids)].reset_index(drop=True)

# The ones that have more than 1 and are the same value , coalesce those 
labeled_data = labeled_data.drop_duplicates(
    subset=['submission_id', 'subreddit_topic', 'search_query', 'combine_text', 'url'], 
    keep='first'
).reset_index(drop=True)


Number of conflicting labels:   1


In [20]:
# See the distribution of the labels 
print(labeled_data['label'].value_counts(dropna=False) )
print('\n')
print(labeled_data['label'].value_counts(normalize=True))

Neutral     78
Negative    15
Positive     2
Name: label, dtype: int64


Neutral     0.821053
Negative    0.157895
Positive    0.021053
Name: label, dtype: float64


In [21]:
print("Final file size: ",  labeled_data.shape[0])

Final file size:  95


In [22]:
from access_data import  save_google_drive_data

labeled_data_folder_location_file = "credentials/google_drive_folder_id.json"
file_name_combined = "combined_labeled_data.csv"

# Save the data in the Google Drive location
save_google_drive_data(drive=drive, 
                       dataframe =labeled_data, 
                       filename=file_name_combined)

File 'combined_labeled_data.csv' uploaded successfully to folder 1Ktcv4eaR7kH0teyGuLph4LSYWxI1qkIS!
