# DATA SELECTION AND LABELLING

In [15]:
%pip install tabulate

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
import pandas as pd

## 1. LOADING AND BASIC PREPROCESSING

In [19]:
# Read the raw data extracted from Reddit
raw_data = pd.read_csv('../Data/new_combined_dataset.csv')


In [20]:
# Declare each field's data type
raw_data['post_id'] = raw_data['post_id'].astype(str)
raw_data['comment_id'] = raw_data['comment_id'].astype(str)
raw_data['post_title'] = raw_data['post_title'].astype(str)
raw_data['post_body'] = raw_data['post_body'].astype(str)
raw_data['post_author'] = raw_data['post_author'].astype(str)
raw_data['comment_body'] = raw_data['comment_body'].astype(str)
raw_data['comment_author'] = raw_data['comment_author'].astype(str)
raw_data['query'] = raw_data['query'].astype(str)

raw_data['subreddit'] = raw_data['subreddit'].astype('category')
raw_data['query'] = raw_data['query'].astype('category')

# Fill NaN values with 0 before converting to int
raw_data['number_of_comments'] = raw_data['number_of_comments'].fillna(0).astype(int)
raw_data['number_of_upvotes'] = raw_data['number_of_upvotes'].fillna(0).astype(int)

raw_data['readable_datetime'] = pd.to_datetime(raw_data['readable_datetime'])

print(raw_data.head())

   post_id   subreddit                                        post_title  \
0  1002dom  technology  ChatGPT Caused 'Code Red' at Google, Report Says   
1  1002dom  technology  ChatGPT Caused 'Code Red' at Google, Report Says   
2  1002dom  technology  ChatGPT Caused 'Code Red' at Google, Report Says   
3  1002dom  technology  ChatGPT Caused 'Code Red' at Google, Report Says   
4  1002dom  technology  ChatGPT Caused 'Code Red' at Google, Report Says   

  post_body  number_of_comments   readable_datetime post_author comment_id  \
0       nan                 370 2023-01-01 00:03:33    slakmehl    j2far1e   
1       nan                 370 2023-01-01 00:03:33    slakmehl    j2f5vg2   
2       nan                 370 2023-01-01 00:03:33    slakmehl    j2f9y5m   
3       nan                 370 2023-01-01 00:03:33    slakmehl    j2f7njc   
4       nan                 370 2023-01-01 00:03:33    slakmehl    j2fna2c   

                                        comment_body  number_of_upvotes  \

## 2. HEURISTIC FILTERING

In [21]:
# Create combined text field and replace None/NaN with empty string
raw_data["text"] = raw_data.apply(
	lambda row: (
		str(row['comment_body']).strip() if pd.notna(row['comment_body'])
		else ""), 
	axis=1
).fillna("")

# Print count
print("Number of empty texts:", (raw_data["text"] == "").sum())
print("Number of NA texts:", raw_data["text"].isna().sum())
print("Total records:", len(raw_data))

# Drop rows where text is empty
raw_data = raw_data[raw_data["text"] != ""]

# Add a new row for all unique posts (get post_id, comments, and all from first record of the post, exclude any comment fields)
posts = raw_data.groupby("post_id").first().reset_index()
posts = posts.drop(columns=["comment_id", "comment_body", "comment_author","text"])

print("\nNumber of posts:", len(posts))

# Make post text the as post title and post body
posts["text"] = posts["post_title"] + " " + posts["post_body"]

# concat raw data and posts
raw_data = pd.concat([posts, raw_data], ignore_index=True)

# Remove any duplicate rows
raw_data = raw_data.drop_duplicates()

# Display first few rows and value counts of empty strings
print("\n After adding new row for all unique posts")
print("\nNumber of empty texts:", (raw_data["text"] == "").sum())
print("Number of NA texts:", raw_data["text"].isna().sum())
print("Total records:", len(raw_data))

Number of empty texts: 0
Number of NA texts: 0
Total records: 54966

Number of posts: 9575

 After adding new row for all unique posts

Number of empty texts: 0
Number of NA texts: 0
Total records: 64541


In [22]:
# No.of Posts and Comments
print("\nNumber of Posts and Comments:")
print(raw_data["post_id"].nunique())
print(raw_data["comment_id"].count())


Number of Posts and Comments:
9575
54966


In [23]:
filtered_data = raw_data.copy()

###  S1: Filter to past 5 year

In [24]:
from datetime import datetime, timedelta

In [25]:
cutoff_date = datetime.now() - timedelta(days=5*365)

filtered_data = filtered_data[filtered_data["readable_datetime"] > cutoff_date]

print("\nNumber of Posts and Comments after filtering:")
print(filtered_data["post_id"].nunique())
print(filtered_data["comment_id"].count())

# Print date time range in the data
print("\nDate time range:")
print(filtered_data["readable_datetime"].min())
print(filtered_data["readable_datetime"].max())


Number of Posts and Comments after filtering:
9544
54905

Date time range:
2020-05-01 13:26:05
2025-01-30 16:11:12


In [26]:
# Remove all missing records where text is empty
filtered_data = filtered_data[filtered_data["text"] != ""]
filtered_data = filtered_data[filtered_data["text"].notna()]
filtered_data = filtered_data[filtered_data["text"] != "nan"]
filtered_data = filtered_data[filtered_data["text"] != "None"]

### S2: Text Length

Short Texts: Extremely short texts (e.g., those with only one or two words) might not provide enough context and could be noise.

Excessively Long Texts: Conversely, texts that far exceed the typical length for your domain might be off-topic or contain noise.

In [27]:
from tabulate import tabulate


min_words = 3

# Calculate word counts for each text
word_counts = filtered_data['text'].str.split().str.len()

print("\nWord count statistics:")
print(tabulate([word_counts.describe()], headers='keys'))
print("\n")


# Set max_words as the upper quartile (75th percentile) plus 1.5 times IQR
Q3 = word_counts.quantile(0.75)
Q1 = word_counts.quantile(0.25)
IQR = Q3 - Q1
max_words = int(Q3 + 1.5 * IQR)

print(f"Max words set to: {max_words}")


Word count statistics:
  count     mean      std    min    25%    50%    75%    max
-------  -------  -------  -----  -----  -----  -----  -----
  63852  53.8432  139.942      1     10     22     55   5827


Max words set to: 122


In [28]:
print(filtered_data["post_id"].nunique())
print(filtered_data["comment_id"].isna().sum())

# First recalculate word counts since filtered_data has been modified since last count
word_counts = filtered_data['text'].str.split().str.len()

# Filter based on min and max words
filtered_data = filtered_data[word_counts.between(min_words, max_words)]

print("\nNumber of Posts and Comments after filtering by word count:")
print(filtered_data["post_id"].nunique())
print(filtered_data["comment_id"].isna().sum())

9544
9544

Number of Posts and Comments after filtering by word count:
9332
7066


## 2. SEMANTIC SEARCH WITH SENTENCE TRANSFORMERS