# Data Preprocessing

In [104]:
import pandas as pd
from datetime import datetime, timedelta

### Cleaning the data

In [105]:
data = pd.read_csv('new_combined_dataset.csv')

print("Number of posts:",len(data['post_id'].unique()))
print("Posts without comments:", len(data[data['comment_id'].isna()]))



# Ensure that the post_id is 7 characters long
data = data[data['post_id'].str.len() == 7]

#Remove all records where subreddit is null
data = data[data['subreddit'].notnull()]


data['post_title'] = data['post_title'].str.replace('\n', ' ')
data['post_body'] = data['post_body'].str.replace('\n', ' ')
data['comment_body'] = data['comment_body'].str.replace('\n', ' ')

# Reencode the data to utf-8
data['post_title'] = data['post_title'].str.encode('utf-8', 'ignore').str.decode('utf-8')
data['post_body'] = data['post_body'].str.encode('utf-8', 'ignore').str.decode('utf-8')
data['comment_body'] = data['comment_body'].str.encode('utf-8', 'ignore').str.decode('utf-8')

data['number_of_upvotes'] = data['number_of_upvotes'].fillna(0)

# Remove where comment_body is [deleted] or [removed]
data = data[data['comment_body'] != '[deleted]']
data = data[data['comment_body'] != '[removed]']

# Remove comment body when count is greater than 50
# Keep records where comment_body is null or appears <= 50 times
# Create a mask for records where comment_body is null or appears <= 50 times
comment_counts = data['comment_body'].value_counts()
mask = (data['comment_body'].isna()) | (data['comment_body'].map(lambda x: comment_counts.get(x, 0) <= 50))
data = data[mask]

# Remove data that matches the regex pattern, handling NaN values
data = data[~data['comment_body'].fillna('').str.contains(r'^Hey\s+/u/\w+.*?$', regex=True)]
data = data[~data['comment_body'].fillna('').str.contains(r'^.*?if you have any questions or concerns.*?$', regex=True)]
data = data[~data['comment_body'].fillna('').str.contains(r'\[ Removed by Reddit \]', regex=True)]
data = data[~data['comment_body'].fillna('').str.contains(r'^.*?\[.*?\].*?$', regex=True)]

# Store the data in a new CSV file
data.to_csv('cleaned_data.csv', index=False)


# Check number post (has post_id but no comment_id)
print("\nAfter cleaning the data")
print("Number of posts:",len(data['post_id'].unique()))
print("Posts without comments:", len(data[data['comment_id'].isna()]))


Number of posts: 9575
Posts without comments: 614

After cleaning the data
Number of posts: 8513
Posts without comments: 550


### Data Preprocessing

In [106]:
raw_data = pd.read_csv('cleaned_data.csv', 
					   engine='pyarrow',     # Use python engine instead of pyarrow
					   encoding='utf-8',    # Specify encoding
)

# Declare each field data type
raw_data['post_id'] = raw_data['post_id'].astype(str)
raw_data['comment_id'] = raw_data['comment_id'].astype(str)
raw_data['post_title'] = raw_data['post_title'].astype(str)
raw_data['post_body'] = raw_data['post_body'].astype(str)
raw_data['post_author'] = raw_data['post_author'].astype(str)
raw_data['comment_body'] = raw_data['comment_body'].astype(str)
raw_data['comment_author'] = raw_data['comment_author'].astype(str)
raw_data['query'] = raw_data['query'].astype(str)

raw_data['subreddit'] = raw_data['subreddit'].astype('category')
raw_data['query'] = raw_data['query'].astype('category')

# Fill NaN values with 0 before converting to int
raw_data['number_of_comments'] = raw_data['number_of_comments'].fillna(0).astype(int)
raw_data['number_of_upvotes'] = raw_data['number_of_upvotes'].fillna(0).astype(int)

raw_data['readable_datetime'] = pd.to_datetime(raw_data['readable_datetime'])

print(raw_data.head())

   post_id   subreddit                                        post_title  \
0  1002dom  technology  ChatGPT Caused 'Code Red' at Google, Report Says   
1  1002dom  technology  ChatGPT Caused 'Code Red' at Google, Report Says   
2  1002dom  technology  ChatGPT Caused 'Code Red' at Google, Report Says   
3  1002dom  technology  ChatGPT Caused 'Code Red' at Google, Report Says   
4  1002dom  technology  ChatGPT Caused 'Code Red' at Google, Report Says   

  post_body  number_of_comments   readable_datetime post_author comment_id  \
0      None                 370 2023-01-01 00:03:33    slakmehl    j2far1e   
1      None                 370 2023-01-01 00:03:33    slakmehl    j2f5vg2   
2      None                 370 2023-01-01 00:03:33    slakmehl    j2f9y5m   
3      None                 370 2023-01-01 00:03:33    slakmehl    j2f7njc   
4      None                 370 2023-01-01 00:03:33    slakmehl    j2fna2c   

                                        comment_body  number_of_upvotes  \

In [107]:
# Create combined text field and replace None/NaN with empty string
raw_data["text"] = raw_data.apply(
	lambda row: (
		str(row['comment_body']).strip() if pd.notna(row['comment_body'])
		else ""), 
	axis=1
).fillna("")

# Print count
print("Number of empty texts:", (raw_data["text"] == "").sum())
print("Number of NA texts:", raw_data["text"].isna().sum())
print("Total records:", len(raw_data))

# Drop rows where text is empty
raw_data = raw_data[raw_data["text"] != ""]

# Add a new row for all unique posts (get post_id, comments, and all from first record of the post, exclude any comment fields)
posts = raw_data.groupby("post_id").first().reset_index()
posts = posts.drop(columns=["comment_id", "comment_body", "comment_author","text"])

print("\nNumber of posts:", len(posts))

# Make post text the as post title and post body
posts["text"] = posts["post_title"] + " " + posts["post_body"]

# concat raw data and posts
raw_data = pd.concat([posts, raw_data], ignore_index=True)

# Remove any duplicate rows
raw_data = raw_data.drop_duplicates()

# Display first few rows and value counts of empty strings
print("\n After adding new row for all unique posts")
print("\nNumber of empty texts:", (raw_data["text"] == "").sum())
print("Number of NA texts:", raw_data["text"].isna().sum())
print("Total records:", len(raw_data))

Number of empty texts: 0
Number of NA texts: 0
Total records: 46901

Number of posts: 8513

 After adding new row for all unique posts

Number of empty texts: 0
Number of NA texts: 0
Total records: 55414


In [108]:
# No.of Posts and Comments
print("\nNumber of Posts and Comments:")
print(raw_data["post_id"].nunique())
print(raw_data["comment_id"].count())


Number of Posts and Comments:
8513
46901


In [109]:
filtered_data = raw_data.copy()

###  S1: Filter to past 5 year

In [110]:
cutoff_date = datetime.now() - timedelta(days=5*365)

filtered_data = filtered_data[filtered_data["readable_datetime"] > cutoff_date]

print("\nNumber of Posts and Comments after filtering:")
print(filtered_data["post_id"].nunique())
print(filtered_data["comment_id"].count())

# Print date time range in the data
print("\nDate time range:")
print(filtered_data["readable_datetime"].min())
print(filtered_data["readable_datetime"].max())


Number of Posts and Comments after filtering:
8513
46901

Date time range:
2023-01-01 00:03:33
2025-01-30 12:00:03


In [111]:
# Remove all missing records where text is empty
filtered_data = filtered_data[filtered_data["text"] != ""]
filtered_data = filtered_data[filtered_data["text"].notna()]
filtered_data = filtered_data[filtered_data["text"] != "nan"]
filtered_data = filtered_data[filtered_data["text"] != "None"]

### S2: Text Length

Short Texts: Extremely short texts (e.g., those with only one or two words) might not provide enough context and could be noise.

Excessively Long Texts: Conversely, texts that far exceed the typical length for your domain might be off-topic or contain noise.

In [112]:
from tabulate import tabulate


min_words = 3

# Calculate word counts for each text
word_counts = filtered_data['text'].str.split().str.len()

print("\nWord count statistics:")
print(tabulate([word_counts.describe()], headers='keys'))
print("\n")


# Set max_words as the upper quartile (75th percentile) plus 1.5 times IQR
Q3 = word_counts.quantile(0.75)
Q1 = word_counts.quantile(0.25)
IQR = Q3 - Q1
max_words = int(Q3 + 1.5 * IQR)

print(f"Max words set to: {max_words}")


Word count statistics:
  count     mean      std    min    25%    50%    75%    max
-------  -------  -------  -----  -----  -----  -----  -----
  54862  51.6563  135.786      1     10     21     49   5827


Max words set to: 107


In [None]:
print(filtered_data["post_id"].nunique())
print(filtered_data["comment_id"].isna().sum())

# First recalculate word counts since filtered_data has been modified since last count
word_counts = filtered_data['text'].str.split().str.len()

# Filter based on min and max words
filtered_data = filtered_data[word_counts.between(min_words, max_words)]

print("\nNumber of Posts and Comments after filtering by word count:")
print(filtered_data["post_id"].nunique())
print(filtered_data["comment_id"].count())

8513
8513

Number of Posts and Comments after filtering by word count:
8267
5978


In [114]:
# Store the filtered data in a new CSV file
filtered_data.to_csv('filtered_data.csv', index=False)