## Prepare the dataset 

Prepare the dataset so that it can be used to label the data.

- Posts only posted in 2024 through 2025 
- Combine the title and text 
- Remove spaces in combine_text
- Add in identifier column


In [1]:
%%capture
pip install -r ../../requirements.txt

In [2]:
# Install the required packages
import sys 
import json #needed to translate JSON data
import requests #needed to perform HTTP GET and POST requests
import pandas as pd
import pprint # allows us to print more readable JSON data
from datetime import datetime 
import time 
import io

# NLP
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Need this set to None otherwise text columns will truncate!
pd.set_option('display.max_colwidth', None) 

In [3]:
import sys

# set this on the path so that we can reference the commong data locations
sys.path.append("../../scripts/")
from process_text_data import text_embeddings, compute_similarity_scores

In [4]:
from data_collection import authenticate_google_drive, grab_google_drive_folder_data

drive = authenticate_google_drive('../0_data_collection/credentials/google_drive_client_secret.json')
df = grab_google_drive_folder_data(drive=drive,credential_file="../0_data_collection/credentials/google_drive_folder_id.json",filename="reddit_data.csv")

Successfully loaded 'reddit_data.csv' into a DataFrame!


In [5]:
# Convert the 'timestamp' column from object (string) to datetime
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')

# Filter data between 2024 and 2025
filtered_df = df[(df['created_at'].dt.year >= 2023) & (df['created_at'].dt.year <= 2025)]


In [6]:
## Combine text and remove spacing 
filtered_df['combine_text'] = filtered_df['title']+". "+ filtered_df['text']
filtered_df['combine_text'] = filtered_df['combine_text'].str.strip().str.replace(r'\s+', ' ', regex=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['combine_text'] = filtered_df['title']+". "+ filtered_df['text']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['combine_text'] = filtered_df['combine_text'].str.strip().str.replace(r'\s+', ' ', regex=True)


In [7]:
# Some duplicate posts since the same post may mention two brands 
print('Number of unique submission ids: ', filtered_df['submission_id'].nunique())
print('Number of total submission ids: ', filtered_df['submission_id'].count())

# Mini investigation that shows the posts have two different brands 
# submission_id_dup = filtered_df['submission_id'].value_counts().head()
# submission_id_example = submission_id_dup.index[0]
# filtered_df[filtered_df['submission_id'] == submission_id_example].head()

Number of unique submission ids:  3484
Number of total submission ids:  3546


In [8]:
# create column that constructs the url for the Reddit Post 
filtered_df['url'] = "https://www.reddit.com/r/FirstTimeHomeBuyer/"+filtered_df['submission_id']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['url'] = "https://www.reddit.com/r/FirstTimeHomeBuyer/"+filtered_df['submission_id']


In [9]:
from data_collection import authenticate_google_drive, save_google_drive_data


In [10]:
credentials_path="../0_data_collection/credentials/google_drive_client_secret.json"
folder_path="../0_data_collection/credentials/google_drive_folder_id.json"

In [11]:
# Grab the Google Drive object
drive = authenticate_google_drive(credentials_path=credentials_path)


In [12]:
# Save the data in the Google Drive location
save_google_drive_data(drive=drive, 
                       credential_file=folder_path,  
                       dataframe =filtered_df, 
                       filename="reddit_filtered_data.csv")

File 'reddit_filtered_data.csv' uploaded successfully to folder 1kJ6TrI9MVT5mfnnYvS-OpRMJFVbIQ6Tl!
