# Data Preprocessing

In [1]:
import pandas as pd
from datetime import datetime, timedelta

### Cleaning the data

In [2]:
# Load the combined Reddit dump, drop malformed / boilerplate rows, and write
# the result to cleaned_data.csv for the preprocessing step.
POST_ID_LENGTH = 7            # only rows whose post_id has exactly this many characters are kept
MAX_DUPLICATE_COMMENTS = 50   # comment bodies repeated more often than this are dropped

data = pd.read_csv('new_combined_dataset.csv')

# Keep rows with a post_id of the expected length and a non-null subreddit.
# .copy() makes an independent frame, so the column assignments below do not
# write into a slice of the loaded frame (avoids SettingWithCopyWarning).
valid_rows = (data['post_id'].str.len() == POST_ID_LENGTH) & data['subreddit'].notnull()
data = data[valid_rows].copy()

# Flatten line breaks so each text field sits on a single line.
# regex=False: '\n' is a literal newline here, not a pattern.
for column in ('post_title', 'post_body', 'comment_body'):
    data[column] = data[column].str.replace('\n', ' ', regex=False)

# Round-trip through UTF-8, silently dropping characters that cannot be encoded.
for column in ('post_body', 'comment_body'):
    data[column] = data[column].str.encode('utf-8', 'ignore').str.decode('utf-8')

data['number_of_upvotes'] = data['number_of_upvotes'].fillna(0)

# Drop rows without a comment body and the '[deleted]' / '[removed]' placeholders.
# Missing bodies are removed explicitly so every later filter only sees strings.
comment = data['comment_body']
data = data[comment.notna() & ~comment.isin(['[deleted]', '[removed]'])]

# Drop comment bodies that occur more than MAX_DUPLICATE_COMMENTS times.
occurrences = data['comment_body'].map(data['comment_body'].value_counts())
data = data[occurrences <= MAX_DUPLICATE_COMMENTS]

# Drop comments matching any of these patterns.
# NOTE: the last pattern matches every comment containing a "[...]" pair, which
# also covers the "[ Removed by Reddit ]" pattern before it.
noise_patterns = [
    r'^Hey\s+/u/\w+.*?$',
    r'^.*?if you have any questions or concerns.*?$',
    r'\[ Removed by Reddit \]',
    r'^.*?\[.*?\].*?$',
]
for pattern in noise_patterns:
    # na=False: a missing value never counts as a match (and keeps the mask boolean).
    is_noise = data['comment_body'].str.contains(pattern, regex=True, na=False)
    data = data[~is_noise]

# Store the cleaned data in a new CSV file
data.to_csv('cleaned_data.csv', index=False)


### Data Preprocessing

In [3]:
# Load the cleaned dataset and give every column an explicit dtype.
raw_data = pd.read_csv(
    'cleaned_data.csv',
    engine='pyarrow',    # parse with the pyarrow engine
    encoding='utf-8',    # specify encoding
)

# Identifier / free-text columns: cast present values to str but keep missing
# values missing. A bare astype(str) would turn NaN/None into the literal
# strings 'nan'/'None', which pd.notna() and fillna() can no longer detect.
text_columns = [
    'post_id',
    'comment_id',
    'post_title',
    'post_body',
    'post_author',
    'comment_body',
    'comment_author',
    'query',
]
for column in text_columns:
    present = raw_data[column].notna()
    raw_data[column] = raw_data[column].astype(str).where(present)

# Low-cardinality labels are stored as categoricals.
for column in ('subreddit', 'query'):
    raw_data[column] = raw_data[column].astype('category')

# Fill NaN values with 0 before converting to int
for column in ('number_of_comments', 'number_of_upvotes'):
    raw_data[column] = raw_data[column].fillna(0).astype(int)

raw_data['readable_datetime'] = pd.to_datetime(raw_data['readable_datetime'])

print(raw_data.head())

   post_id   subreddit                                        post_title  \
0  1002dom  technology  ChatGPT Caused 'Code Red' at Google, Report Says   
1  1002dom  technology  ChatGPT Caused 'Code Red' at Google, Report Says   
2  1002dom  technology  ChatGPT Caused 'Code Red' at Google, Report Says   
3  1002dom  technology  ChatGPT Caused 'Code Red' at Google, Report Says   
4  1002dom  technology  ChatGPT Caused 'Code Red' at Google, Report Says   

  post_body  number_of_comments   readable_datetime post_author comment_id  \
0      None                 370 2023-01-01 00:03:33    slakmehl    j2far1e   
1      None                 370 2023-01-01 00:03:33    slakmehl    j2f5vg2   
2      None                 370 2023-01-01 00:03:33    slakmehl    j2f9y5m   
3      None                 370 2023-01-01 00:03:33    slakmehl    j2f7njc   
4      None                 370 2023-01-01 00:03:33    slakmehl    j2fna2c   

                                        comment_body  number_of_upvotes  \

In [None]:
# Build the "text" column. For each row, use the first option that is available:
#   1. comment_body
#   2. post_title + " " + post_body   (both present)
#   3. post_title
#   4. ""                             (nothing present)
# Each piece is stripped of leading/trailing whitespace.
# The notna() checks only detect values that are still real NaN/None at this
# point; a missing value that was already stringified upstream counts as present.
comment_body = raw_data['comment_body']
post_title = raw_data['post_title']
post_body = raw_data['post_body']

title_text = post_title.astype(str).str.strip()
title_and_body = title_text + " " + post_body.astype(str).str.strip()

# Start from the lowest-priority value and overwrite with higher-priority ones,
# so the last mask applied (comment_body) wins. Vectorized, and also well
# defined for an empty frame.
text = pd.Series("", index=raw_data.index, dtype=object)
text = text.mask(post_title.notna(), title_text)
text = text.mask(post_title.notna() & post_body.notna(), title_and_body)
text = text.mask(comment_body.notna(), comment_body.astype(str).str.strip())
raw_data["text"] = text

# Display first few rows and the number of empty strings
print("Sample texts:")
print(raw_data["text"].head())
print("\nNumber of empty texts:", (raw_data["text"] == "").sum())

Sample texts:
0    ChatGPT Caused 'Code Red' at Google, Report Sa...
1    ChatGPT Caused 'Code Red' at Google, Report Sa...
2    ChatGPT Caused 'Code Red' at Google, Report Sa...
3    ChatGPT Caused 'Code Red' at Google, Report Sa...
4    ChatGPT Caused 'Code Red' at Google, Report Sa...
Name: text, dtype: object

Number of empty texts: 0


In [5]:
# Distinct posts and non-null comment ids in the loaded data
n_posts = raw_data["post_id"].nunique()
n_comments = raw_data["comment_id"].count()
print("\nNumber of Posts and Comments:", n_posts, n_comments, sep="\n")


Number of Posts and Comments:
7963
46349


In [6]:
# Work on an independent deep copy so the filtering steps leave raw_data untouched
filtered_data = raw_data.copy(deep=True)

### S1: Filter to the past 5 years

In [7]:
# Keep only rows newer than YEARS_TO_KEEP calendar years before the moment this
# cell runs. DateOffset(years=...) steps back whole calendar years, so leap days
# are accounted for (a fixed 5*365-day window is not exactly five years).
# NOTE: the cutoff is relative to the current time, so the result depends on
# when the notebook is executed.
YEARS_TO_KEEP = 5
cutoff_date = pd.Timestamp.now() - pd.DateOffset(years=YEARS_TO_KEEP)

is_recent = filtered_data["readable_datetime"] > cutoff_date
filtered_data = filtered_data[is_recent]

print("\nNumber of Posts and Comments after filtering:")
print(filtered_data["post_id"].nunique())
print(filtered_data["comment_id"].count())

# Print date time range in the data
print("\nDate time range:")
print(filtered_data["readable_datetime"].min())
print(filtered_data["readable_datetime"].max())


Number of Posts and Comments after filtering:
7963
46349

Date time range:
2023-01-01 00:03:33
2025-01-30 12:00:03


### S2: Text Length

Short Texts: Extremely short texts (e.g., those with only one or two words) might not provide enough context and could be noise.

Excessively Long Texts: Conversely, texts that far exceed the typical length for your domain might be off-topic or contain noise.

In [8]:
from tabulate import tabulate


# Shortest text, in whitespace-separated words, that will be kept
min_words = 3

# Number of whitespace-separated words in each text
word_counts = filtered_data['text'].str.split().str.len()

summary_table = tabulate([word_counts.describe()], headers='keys')
print("\nWord count statistics:")
print(summary_table)
print("\n")

# Upper length limit: 75th percentile plus 1.5 times the interquartile range,
# truncated to an integer
Q1, Q3 = (word_counts.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
max_words = int(Q3 + 1.5 * IQR)

print(f"Max words set to: {max_words}")


Word count statistics:
  count     mean      std    min    25%    50%    75%    max
-------  -------  -------  -----  -----  -----  -----  -----
  46349  164.151  314.498      3     32     70    169   6543


Max words set to: 374


In [9]:
# Keep only texts whose word count lies in [min_words, max_words] (inclusive).
# The counts are recomputed from the frame being filtered so the boolean mask is
# always index-aligned with it, regardless of the order in which cells were run.
current_word_counts = filtered_data['text'].str.split().str.len()
filtered_data = filtered_data[current_word_counts.between(min_words, max_words)]

print("\nNumber of Posts and Comments after filtering by word count:")
print(filtered_data["post_id"].nunique())
print(filtered_data["comment_id"].count())


Number of Posts and Comments after filtering by word count:
7305
41745


In [10]:
# Persist the filtered rows to filtered_data.csv, leaving the index out of the file
filtered_data.to_csv(path_or_buf='filtered_data.csv', index=False)