In [1]:
import numpy as np
import pandas as pd

from utils.text_tensor import load_text_dataset_from_json

import zstandard as zstd
import json

import io
import re

## Load All Datasets

In [4]:
# Ask for the dataset files interactively. The reply is split on whitespace,
# so an individual path cannot contain spaces.
raw_paths = input('Enter dataset paths, space separated: ')
dataset_paths = raw_paths.split()

In [5]:
# Echo the parsed list to confirm which files are about to be loaded.
dataset_paths

['datasets/reddit_datasets/askhistorians_2022-10-31_top-5-comments_json.zst',
 'datasets/reddit_datasets/askscience_2022-10-31_top-5-comments_json.zst',
 'datasets/reddit_datasets/explainlikeimfive_2022-10-31_top-5-comments_json.zst']

In [6]:
# Load every dataset file; each loader return value is kept as-is, in the
# same order as the paths were given.
results = [load_text_dataset_from_json(path) for path in dataset_paths]

In [7]:
# Flatten the per-file results into two parallel lists: element 0 of each
# result feeds `data`, element 1 feeds `labels`, preserving file order.
data = [sample for result in results for sample in result[0]]
labels = [label for result in results for label in result[1]]

In [8]:
# Convert both Python lists to NumPy arrays so their shapes can be inspected.
data, labels = np.array(data), np.array(labels)

In [9]:
# Sanity check: the two arrays must have the same length, since the labels
# are attached to the samples as a DataFrame column further down.
print(data.shape, labels.shape)

(8933,) (8933,)


In [10]:
# Wrap the samples in a DataFrame with a single column named 'Data'.
dataset = pd.DataFrame({'Data': data})

In [11]:
# Attach the label array as a second column named 'Labels'.
dataset = dataset.assign(Labels=labels)

In [12]:
# Class balance: the distinct label values and how many rows carry each one.
np.unique(dataset['Labels'], return_counts=True)

(array([0, 1]), array([7433, 1500]))

## Remove Unnecessary characters and symbols

Removing the '>' quote markers that Reddit comments use for quoted text (typically when replying to someone). GPT would not generate this symbol.

In [13]:
# Strip the '>' quote marker from the start of every line of a comment.
# Without re.MULTILINE, '^' only matches at the very beginning of the whole
# string, so quoted lines further down a comment would keep their marker.
# NOTE(review): if the source JSON stores the marker HTML-escaped ('&gt;'),
# this pattern will not match it — confirm against the raw data.
dataset['Data'] = dataset['Data'].str.replace(r'^>', '', regex=True, flags=re.MULTILINE)

Removing bot comments

In [14]:
# Discard every comment whose text contains the phrase "I am a bot".
is_bot_comment = dataset['Data'].str.contains("I am a bot")
dataset = dataset.loc[~is_bot_comment]

Removing profanity

In [21]:
from better_profanity import profanity

def contains_explicit(text):
    """Return the result of ``profanity.contains_profanity`` for ``text``.

    Thin wrapper so the check can be handed directly to ``Series.apply``.
    """
    flagged = profanity.contains_profanity(text)
    return flagged

# One boolean per row: True where the text is reported as explicit.
explicit_mask = dataset['Data'].apply(contains_explicit)

# Rows flagged as explicit are set aside and written to their own CSV file.
filtered_data = dataset[explicit_mask]
filtered_data.to_csv('explicit_content.csv', index=False)

# All remaining rows form the cleaned frame, which is saved separately.
# `dataset` itself is not modified by this step.
clean_df = dataset.drop(index=filtered_data.index)
clean_df.to_csv('cleaned_content.csv', index=False)

In [23]:
# Compare the size of the profanity-free frame with the full dataset.
print(clean_df.shape, dataset.shape)

(7519, 2) (8901, 2)


In [1]:
# Compare the number of rows flagged as explicit with the full dataset.
# Needs `filtered_data` from the profanity-filter cell in the current kernel;
# the saved output is a NameError from a run where that cell had not executed.
print(filtered_data.shape, dataset.shape)

NameError: name 'filtered_data' is not defined

## Store Dataset

In [34]:
# Persist the DataFrame as a plain CSV file (row index omitted).
# NOTE(review): this writes `dataset`, not the profanity-filtered `clean_df` —
# confirm that is intended.
csv_path = 'datasets/reddit_datasets/gpt_reddit_dataset.csv'
dataset.to_csv(csv_path, index=False)

In [19]:
# Build the Zstandard compressor (default settings) up front.
cmpr = zstd.ZstdCompressor()

# Serialise the DataFrame to CSV in an in-memory buffer instead of on disk.
df_bytes = io.BytesIO()
dataset.to_csv(df_bytes, index=False)

# Compress the raw CSV bytes held in the buffer.
compressed_bytes = cmpr.compress(df_bytes.getvalue())

In [20]:
# Write the compressed payload to disk in binary mode.
zst_path = 'datasets/reddit_datasets/gpt_reddit_dataset.zst'
with open(zst_path, 'wb') as zst_file:
    zst_file.write(compressed_bytes)

## Read the data

In [None]:
# Round-trip check: inflate the in-memory Zstandard payload back to CSV bytes.
dctx = zstd.ZstdDecompressor()
decompressed_bytes = dctx.decompress(compressed_bytes)

# Parse those bytes as CSV; note that this rebinds `dataset`.
csv_stream = io.BytesIO(decompressed_bytes)
dataset = pd.read_csv(csv_stream)

# Optional visual confirmation of the restored frame.
print(dataset)


## Store dataset without special characters

In [35]:
def clean_text(text):
    """Strip links and unusual symbols from ``text`` and tidy its whitespace.

    Three passes are applied in order:

    1. URL-like tokens starting with ``http://``, ``https://`` or ``www.``
       are deleted.
    2. Every character that is not a word character, whitespace, or one of
       ``$ . , ! ? " '`` is deleted.
    3. Each run of whitespace is collapsed to one space and the ends are
       trimmed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    without_links = re.sub(r'\b(?:https?://|www\.)\S+\b', '', text)
    allowed_only = re.sub(r'[^\w\s$.,!?"\']', '', without_links)
    single_spaced = re.sub(r'\s+', ' ', allowed_only)
    return single_spaced.strip()

# Run clean_text over every entry of the 'Data' column and pair the result
# with the unchanged 'Labels' column in a new DataFrame.
cleaned_data_column = dataset['Data'].apply(clean_text)
cleaned_dataset = pd.DataFrame({'Data': cleaned_data_column, 'Labels': dataset['Labels']})

In [21]:
# Build the Zstandard compressor (default settings) up front.
cmpr = zstd.ZstdCompressor()

# Serialise the cleaned DataFrame to CSV in an in-memory buffer.
df_bytes = io.BytesIO()
cleaned_dataset.to_csv(df_bytes, index=False)

# Compress the raw CSV bytes held in the buffer.
# NOTE(review): no visible cell writes this payload to disk afterwards — confirm.
compressed_bytes = cmpr.compress(df_bytes.getvalue())

In [36]:
# Save the cleaned dataset as a CSV file (row index omitted).
cleaned_csv_path = 'datasets/reddit_datasets/gpt_reddit_dataset_cleaned.csv'
cleaned_dataset.to_csv(cleaned_csv_path, index=False)

In [19]:
# Dump the cleaned dataset as a Markdown table for manual inspection.
# The encoding is pinned to UTF-8: clean_text keeps Unicode word characters,
# and a non-UTF-8 platform default encoding (e.g. cp1252 on Windows) can
# raise UnicodeEncodeError when writing them.
with open('dataset_output.txt', 'w', encoding='utf-8') as f:
    f.write(cleaned_dataset.to_markdown())