In [None]:
%pip install pandas
%pip install nltk
%pip install scikit-learn

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the train/test/validate splits, concatenate them and save the result.
# All three stages share one ``try`` so that a failure stops the cell at once:
# with separate ``try`` blocks a failed load would carry on to the
# concatenation and either report a misleading NameError or, when the cell is
# re-run in a live session, silently reuse DataFrames left over from an earlier
# execution. ``failure_message`` tracks which stage is running so the printed
# error still names the step that failed.
failure_message = "An error occurred during data loading"
try:
    df_train = pd.read_csv('/content/drive/MyDrive/TUS/Engineering_Project/data/source_files/all_train.tsv', sep='\t', encoding='latin1', low_memory=True)
    df_test = pd.read_csv('/content/drive/MyDrive/TUS/Engineering_Project/data/source_files/all_test_public.tsv', sep='\t', encoding='latin1', low_memory=True)
    df_validate = pd.read_csv('/content/drive/MyDrive/TUS/Engineering_Project/data/source_files/all_validate.tsv', sep='\t', encoding='latin1', low_memory=True)

    # Stack the three splits into one frame with a fresh 0..n-1 index.
    failure_message = "An error occurred during concatenation"
    df_submissions_clean = pd.concat([df_train, df_test, df_validate], ignore_index=True)

    # Save the concatenated DataFrame to a new CSV file
    failure_message = "An error occurred saving to csv"
    df_submissions_clean.to_csv('/content/drive/MyDrive/TUS/Engineering_Project/data/concatted_all_data.csv', index=False)
except Exception as e:
    print(f"{failure_message}: {e}")

In [None]:
import pandas as pd
import re
import glob

# Configuration class for cleaning parameters
class Config:
    """Parameters that drive the cleaning pipeline.

    Args:
        chunk_size: Number of submission rows read per chunk
            (passed as ``chunksize`` to ``pd.read_csv``).
        max_text_length: Length above which ``Cleaner.clean_text``
            truncates a cleaned text.

    Attributes:
        chunk_size: See ``Args``.
        max_text_length: See ``Args``.
        comment_invalid_phrases: Substrings that mark a comment as
            unusable; a comment containing any of them is skipped.
        placeholder: Letters-only token substituted for the ``|__|``
            delimiter while text is being cleaned.
    """

    def __init__(self, chunk_size: int = 10000, max_text_length: int = 500):
        self.chunk_size = chunk_size
        self.max_text_length = max_text_length
        # Every phrase is stored lower-cased: Cleaner.filter_comments tests
        # them against ``comment_body.lower()``, so a phrase containing an
        # upper-case letter could never match.
        self.comment_invalid_phrases = [
            'post comment without original photoshop',
            'please post comment photoshop reply',
            'unfortunately, your submission has been removed from',
            'google image search automated response',
            'is a curated space.**',
            'in order not to get your comment removed',
            'google cached version',
            'http://',
            'https://',
        ]
        # self.stop_words = set(stopwords.words('english'))
        self.placeholder = "placeholderspecialdelim"

# Cleaner class to encapsulate data cleaning
class Cleaner:
    """Text, comment and DataFrame cleaning helpers driven by a config object.

    The config must expose ``placeholder``, ``max_text_length`` and
    ``comment_invalid_phrases``.

    Args:
        config: Cleaning parameters.
        verbose: When True (the default) every step prints a ``[DEBUG]``
            line. Pass ``verbose=False`` to silence the per-row output,
            which is very large on big comment files.
    """

    # Delimiter that separates individual comments inside a joined string.
    _DELIMITER = '|__|'
    # Bodies that carry no usable content (missing / deleted / removed).
    _EMPTY_BODIES = frozenset({'NaN', 'deleted', 'removed', '', '[deleted]', '[removed]'})

    def __init__(self, config: "Config", *, verbose: bool = True):
        self.config = config
        self.verbose = verbose
        # self.lemmatizer = WordNetLemmatizer()

    def _debug(self, message):
        """Print ``message`` with the ``[DEBUG]`` prefix when verbose."""
        if self.verbose:
            print(f"[DEBUG] {message}")

    def clean_text(self, text):
        """Strip URLs, HTML tags and non-letter characters from ``text``.

        Args:
            text: The raw text. Anything that is not a ``str`` (NaN, None,
                numbers) is treated as missing.

        Returns:
            The lower-cased, cleaned text, or ``''`` for missing input.
            Texts longer than ``config.max_text_length`` are cut to that
            length and suffixed with ``'...|__|'``.
        """
        # Missing values arrive as float NaN, but None or numbers would crash
        # on ``.replace`` below, so every non-string is treated as missing.
        if not isinstance(text, str):
            self._debug(f"Encountered NaN or invalid text: {text}")
            return ''

        placeholder = self.config.placeholder

        # Swap the delimiter for a letters-only token so that the
        # special-character stripping below does not delete it.
        # NOTE(review): the token is never converted back to '|__|' here,
        # while the truncation branch appends a raw '|__|' - confirm whether
        # a later step is expected to restore the delimiter.
        text = text.replace(self._DELIMITER, placeholder)
        self._debug(f"After replacing special delimiter: {text}")

        # Remove unwanted characters
        text = re.sub(r'http\S+', '', text)  # Remove URLs
        text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
        # re.escape keeps a placeholder containing ']', '^' or '-' from
        # corrupting the character class.
        text = re.sub(r'[^a-zA-Z\s' + re.escape(placeholder) + ']', '', text)  # Remove special characters
        self._debug(f"After cleaning unwanted characters: {text}")

        # Convert text to lowercase
        text = text.lower()
        self._debug(f"Converted text to lowercase: {text}")

        # Truncate text if it exceeds the maximum length
        max_length = self.config.max_text_length
        if len(text) > max_length:
            self._debug(f"Truncating text to max length of {max_length}")
            text = text[:max_length] + '...' + self._DELIMITER

        return text

    def filter_comments(self, df_comments):
        """Group the usable comment bodies by the id of their parent.

        A comment is dropped when its body is missing, is a deleted/removed
        marker, or contains (case-insensitively) one of
        ``config.comment_invalid_phrases``. Comments without a parent id are
        dropped as well.

        Args:
            df_comments: DataFrame with ``parent_id`` and ``body`` columns.
                A missing column is treated as all-empty.

        Returns:
            A ``(comments_dict, comments_orig_dict)`` tuple.
            ``comments_dict`` is always empty and is only kept so existing
            callers can still unpack two values. ``comments_orig_dict`` maps
            each parent id (with any prefix such as ``t1_`` removed) to the
            list of its comment bodies in input order.
        """
        comments_dict = {}
        comments_orig_dict = {}

        row_count = len(df_comments)
        self._debug(f"Total comments before filtering: {row_count}")

        # Bodies are lower-cased before the phrase test, so the phrases have
        # to be lower-cased too; done once because they never change.
        invalid_phrases = [phrase.lower() for phrase in self.config.comment_invalid_phrases]

        # Iterating the two columns directly avoids building a Series per row.
        columns = df_comments.columns
        parent_ids = df_comments['parent_id'] if 'parent_id' in columns else [''] * row_count
        bodies = df_comments['body'] if 'body' in columns else [''] * row_count

        for parent_id, body in zip(parent_ids, bodies):
            if isinstance(parent_id, str):
                parent_id = parent_id.split('_', 1)[-1]  # Remove prefix like 't1_'
                self._debug(f"Processed parent_id: {parent_id}")
            elif pd.isna(parent_id):
                # A missing parent must not become a (NaN) dictionary key.
                parent_id = ''

            # Mixed-type columns can yield numbers; coerce so ``.lower()``
            # and the later string join cannot fail.
            comment_body = '' if pd.isna(body) else str(body)
            self._debug(f"Original comment body: {comment_body}")

            # Skip invalid comments
            lowered_body = comment_body.lower()
            if comment_body in self._EMPTY_BODIES or \
               any(phrase in lowered_body for phrase in invalid_phrases):
                self._debug(f"Skipping invalid comment: {comment_body}")
                continue

            if parent_id == '':
                continue

            # Build comment dictionaries
            siblings = comments_orig_dict.get(parent_id)
            if siblings is None:
                comments_orig_dict[parent_id] = [comment_body]
                self._debug(f"Created new entry for parent_id {parent_id}")
            else:
                siblings.append(comment_body)
                self._debug(f"Appended comment to parent_id {parent_id}")

        self._debug(f"Filtered {len(comments_orig_dict)} unique parent_ids with valid comments.")
        return comments_dict, comments_orig_dict

    def clean_dataframe(self, df):
        """Return the rows of ``df`` that are image-free posts from news subreddits.

        Steps, in order: drop ``Unnamed*`` columns, drop rows whose
        ``hasImage`` is true, drop rows whose subreddit contains ``photo``,
        keep only rows whose subreddit contains ``news``, then drop the
        ``created_utc``, ``domain`` and ``image_url`` columns when present.

        Args:
            df: Submissions frame; must contain ``hasImage`` and
                ``subreddit`` columns.

        Returns:
            A new DataFrame; ``df`` itself is left unmodified.

        Raises:
            KeyError: If ``hasImage`` or ``subreddit`` is missing.
        """
        self._debug(f"Starting dataframe cleaning. Initial columns: {df.columns}")

        # Remove unnamed columns (str() guards against non-string labels).
        unnamed_cols = [col for col in df.columns if str(col).startswith('Unnamed')]
        df = df.drop(columns=unnamed_cols)
        self._debug(f"Removed unnamed columns. Remaining columns: {df.columns}")

        # Filter rows based on 'hasImage' and 'subreddit'
        initial_row_count = len(df)
        df = df[~df['hasImage'].astype(str).str.lower().eq('true')]
        self._debug(f"Rows after removing hasImage=True: {len(df)} (removed {initial_row_count - len(df)})")

        # astype(str) turns a missing subreddit into 'nan', which matches
        # neither pattern; without it the NaN leaks into the boolean mask and
        # both ``~mask`` and ``df[mask]`` raise.
        df = df[~df['subreddit'].astype(str).str.lower().str.contains('photo', regex=False)]
        self._debug(f"Rows after removing subreddits containing 'photo': {len(df)}")
        df = df[df['subreddit'].astype(str).str.lower().str.contains('news', regex=False)]
        self._debug(f"Rows after filtering subreddits containing 'news': {len(df)}")

        # Drop unnecessary columns. The non-inplace drop hands back a fresh
        # frame instead of modifying a filtered slice in place.
        drop_columns = ['created_utc', 'domain', 'image_url']
        df = df.drop(columns=drop_columns, errors='ignore')
        self._debug(f"Dropped columns: {drop_columns}. Remaining columns: {df.columns}")

        return df

# Main function to process data in chunks
def process_data_in_chunks(config: "Config", cleaner: "Cleaner", submissions_path, comments_path, output_path):
    """Clean the submissions chunk by chunk and attach their comments.

    The comments file is loaded once and grouped by parent id. The
    submissions file is then streamed in chunks of ``config.chunk_size``
    rows; each chunk is cleaned, given a ``comments_orig`` column holding
    its comments joined with ``|__|``, stripped of rows without comments,
    and written to ``cleaned_chunk_<n>_news.csv``. Finally the chunk files
    written by this call are merged into ``merged_cleaned_data_v31_news.csv``.

    Args:
        config: Provides ``chunk_size``.
        cleaner: Provides ``filter_comments`` and ``clean_dataframe``.
        submissions_path: CSV file with the submissions.
        comments_path: Tab-separated file with the comments.
        output_path: Directory for the chunk files and the merged file;
            created when it does not exist.

    Returns:
        None. Any exception is caught and reported with an ``[ERROR]`` line.
    """
    import os  # local import: only this function needs it

    merged_name = "merged_cleaned_data_v31_news.csv"

    try:
        # Writing the first chunk would otherwise fail on a fresh directory.
        os.makedirs(output_path, exist_ok=True)

        # Load comments
        print("[DEBUG] Loading comments dataset...")
        df_comments = pd.read_csv(comments_path, sep='\t', encoding='utf-8', on_bad_lines='warn', low_memory=True)
        print(f"[DEBUG] Loaded {len(df_comments)} rows from comments dataset.")

        _, comments_orig_dict = cleaner.filter_comments(df_comments)
        del df_comments  # only the grouped dictionary is needed from here on

        def join_comments(submission_id):
            # dict.fromkeys removes duplicates but keeps first-seen order,
            # so the joined string is identical from run to run (a set
            # would order the comments arbitrarily).
            return '|__|'.join(dict.fromkeys(comments_orig_dict.get(submission_id, [])))

        # Load submissions in chunks
        # NOTE(review): this reads the file as latin1. If it is the
        # concatted_all_data.csv written earlier in this notebook with
        # DataFrame.to_csv's default encoding (UTF-8), non-ASCII text would
        # be mis-decoded here - confirm which encoding is intended.
        print("[DEBUG] Loading submissions dataset in chunks...")
        chunk_iterator = pd.read_csv(submissions_path, encoding='latin1', low_memory=True, on_bad_lines='skip', chunksize=config.chunk_size)

        written_files = []
        for chunk_number, chunk in enumerate(chunk_iterator, start=1):
            print(f"[DEBUG] Processing chunk {chunk_number} with {len(chunk)} rows...")
            chunk = cleaner.clean_dataframe(chunk)

            # Map comments to submissions. Series.map also works on a chunk
            # that was filtered down to zero rows, where a row-wise
            # DataFrame.apply returns a DataFrame and breaks the assignment.
            print("[DEBUG] Mapping comments to submissions...")
            if 'id' in chunk.columns:
                comments_column = chunk['id'].map(join_comments)
            else:
                comments_column = ''
            # assign() builds a new frame rather than writing into a
            # possibly filtered slice.
            chunk = chunk.assign(comments_orig=comments_column)
            print("[DEBUG] Finished mapping comments for chunk.")

            # Remove rows without comments
            initial_chunk_row_count = len(chunk)
            chunk = chunk[chunk['comments_orig'] != '']
            print(f"[DEBUG] Rows after filtering out empty comments: {len(chunk)} (removed {initial_chunk_row_count - len(chunk)})")

            # Save cleaned chunk
            chunk_file = f"{output_path}/cleaned_chunk_{chunk_number}_news.csv"
            chunk.to_csv(chunk_file, index=False)
            written_files.append(chunk_file)
            print(f"[DEBUG] Chunk {chunk_number} saved with {len(chunk)} rows.")

        # Combine the chunks written by THIS run. Globbing the directory
        # would also pick up stale chunk files left behind by earlier runs
        # and duplicate their rows in the merged output.
        if not written_files:
            print("[DEBUG] No chunks were produced; nothing to merge.")
            return

        print(f"[DEBUG] Combining {len(written_files)} cleaned chunk files...")
        combined_df = pd.concat([pd.read_csv(path) for path in written_files], ignore_index=True)
        combined_df.to_csv(f"{output_path}/{merged_name}", index=False)
        print(f"[DEBUG] All chunks merged and saved to {merged_name}.")

    except Exception as e:
        # Top-level boundary of the pipeline: report and return.
        print(f"[ERROR] An error occurred: {e}")

# Main execution
if __name__ == "__main__":
    # Input and output locations on the mounted Google Drive.
    submissions_path = '/content/drive/MyDrive/TUS/Engineering_Project/data/concatted_all_data.csv'
    comments_path = '/content/drive/MyDrive/TUS/Engineering_Project/data/source_files/all_comments.tsv'
    output_path = '/content/drive/MyDrive/TUS/Engineering_Project/data/cleaned_chunks'

    # Cleaning parameters and the cleaner that uses them.
    config = Config()
    cleaner = Cleaner(config)

    # Run the chunked cleaning pipeline end to end.
    process_data_in_chunks(
        config=config,
        cleaner=cleaner,
        submissions_path=submissions_path,
        comments_path=comments_path,
        output_path=output_path,
    )
