In [None]:
%pip install pandas
%pip install nltk
%pip install scikit-learn

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the three dataset splits, concatenate them and save the result.
#
# Each step only runs when the previous one succeeded. Previously a failed
# load was followed by misleading NameError messages from the concatenation
# and save steps, because the frames they need were never created.
try:
    df_train = pd.read_csv('/content/drive/MyDrive/TUS/Engineering_Project/data/source_files/all_train.tsv', sep='\t', encoding='latin1', low_memory=True)
    df_test = pd.read_csv('/content/drive/MyDrive/TUS/Engineering_Project/data/source_files/all_test_public.tsv', sep='\t', encoding='latin1', low_memory=True)
    df_validate = pd.read_csv('/content/drive/MyDrive/TUS/Engineering_Project/data/source_files/all_validate.tsv', sep='\t', encoding='latin1', low_memory=True)
except Exception as e:
    print(f"An error occurred during data loading: {e}")
else:
    try:
        df_submissions_clean = pd.concat([df_train, df_test, df_validate], ignore_index=True)
    except Exception as e:
        print(f"An error occurred during concatenation: {e}")
    else:
        # Save the concatenated DataFrame to a new CSV file.
        # The later cells read this file back with encoding='latin1', so it is
        # written with the same encoding; to_csv's default (UTF-8) would make
        # every non-ASCII character come back as several garbage characters.
        # Everything in the frame was decoded from latin1 above, so it is
        # always encodable as latin1 again.
        try:
            df_submissions_clean.to_csv(
                '/content/drive/MyDrive/TUS/Engineering_Project/data/concatted_all_data.csv',
                index=False,
                encoding='latin1',
            )
        except Exception as e:
            print(f"An error occurred saving to csv: {e}")

In [None]:
import pandas as pd
import re
import glob
import csv

# Configuration class for cleaning parameters
class Config:
    """Tunable parameters used by the cleaning code in this cell."""

    def __init__(self):
        # Passed as ``chunksize`` when the submissions CSV is read.
        self.chunk_size = 10000
        # Texts longer than this many characters are cut and suffixed with '...'.
        self.max_text_length = 500
        # Substrings that mark a comment as boilerplate / bot output.
        # Cleaner.filter_comments looks for them in the LOWER-CASED comment
        # body, so every phrase is normalised to lower case here; the two
        # mixed-case entries could otherwise never match anything.
        self.comment_invalid_phrases = [
            phrase.lower()
            for phrase in (
                'post comment without original photoshop',
                'please post comment photoshop reply',
                'unfortunately, your submission has been removed from',
                'google image search automated response',
                'is a curated space.**',
                'In order not to get your comment removed',
                'm a bot so if I was wrong',
            )
        ]
        # self.stop_words = set(stopwords.words('english'))
        # Token that Cleaner.clean_text substitutes for the '|__|' delimiter.
        self.placeholder = "placeholderspecialdelim"

# Cleaner class to encapsulate data cleaning
class Cleaner:
    """Text, comment and DataFrame cleaning helpers driven by a config object.

    Args:
        config: Object providing ``max_text_length``, ``placeholder`` and
            ``comment_invalid_phrases``.
        debug: When true (the default, which keeps the previous output) every
            processed text / comment is traced to stdout. Pass ``False`` to
            silence the per-item traces; summary lines are printed either way.
    """

    # Comment bodies that carry no content and are always discarded.
    _EMPTY_BODIES = frozenset({'NaN', 'deleted', 'removed', '', '[deleted]', '[removed]'})
    # One-pass replacement of tab, comma, newline and carriage return by a space.
    _SPACE_TABLE = str.maketrans({'\t': ' ', ',': ' ', '\n': ' ', '\r': ' '})

    def __init__(self, config: "Config", debug: bool = True):
        self.config = config
        self.debug = debug
        # self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        """Strip URLs, tags and non-letters from *text*, lower-case and truncate it.

        Args:
            text: The raw text. Any non-string value (the float NaN pandas
                uses for missing cells, ``None``, ...) yields ''.

        Returns:
            str: The cleaned text. Texts longer than ``config.max_text_length``
            are cut to that length and suffixed with '...'.
        """
        if not isinstance(text, str):  # Handle NaN / None / other non-text
            if self.debug:
                print(f"[DEBUG] Encountered NaN or invalid text: {text}")
            return ''

        # Replace the special delimiter so the character filter below keeps it.
        # NOTE(review): the placeholder is never converted back to '|__|'
        # before returning - confirm whether callers expect that.
        text = text.replace('|__|', self.config.placeholder)
        if self.debug:
            print(f"[DEBUG] After replacing special delimiter: {text}")

        # Remove unwanted characters
        text = re.sub(r'http\S+', '', text)  # Remove URLs
        text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
        # The placeholder's characters are whitelisted; re.escape keeps the
        # character class valid even if the placeholder contains ']' or '-'.
        text = re.sub(r'[^a-zA-Z\s' + re.escape(self.config.placeholder) + ']', '', text)
        if self.debug:
            print(f"[DEBUG] After cleaning unwanted characters: {text}")

        text = self.use_space_instead_of_chars(text)

        # Convert text to lowercase
        text = text.lower()
        if self.debug:
            print(f"[DEBUG] Converted text to lowercase: {text}")

        # Truncate text if it exceeds the maximum length
        if len(text) > self.config.max_text_length:
            if self.debug:
                print(f"[DEBUG] Truncating text to max length of {self.config.max_text_length}")
            text = text[:self.config.max_text_length] + '...'

        return text

    def filter_csv_compliant(self, text):
        """Return *text* flattened, truncated and serialised as one CSV field.

        Tabs, commas, newlines and carriage returns become spaces, the text is
        cut to ``config.max_text_length`` characters (plus '...') and the
        result is passed through ``csv.writer`` with ``QUOTE_MINIMAL``. A field
        that still needs quoting (for example because it contains a double
        quote) is therefore returned wrapped in quotes with inner quotes
        doubled; surrounding whitespace is stripped.

        Args:
            text (str): The input text string to be filtered.

        Returns:
            str: The CSV-compliant text string.

        Raises:
            ValueError: If *text* is not a string.
        """
        import io  # local import: this cell only imports csv, not io

        if not isinstance(text, str):
            raise ValueError("Input must be a string")

        text = self.use_space_instead_of_chars(text)

        # Truncate BEFORE serialising. The truncation used to run after the
        # row had already been written, so it never affected the result.
        if len(text) > self.config.max_text_length:
            if self.debug:
                print(f"[DEBUG] Truncating text to max length of {self.config.max_text_length}")
            text = text[:self.config.max_text_length] + '...'

        # Escape special characters for CSV
        buffer = io.StringIO()
        csv.writer(buffer, quoting=csv.QUOTE_MINIMAL).writerow([text])

        # strip() drops the writer's line terminator (and outer whitespace).
        return buffer.getvalue().strip()

    def use_space_instead_of_chars(self, text):
        """Replace every tab, comma, newline and carriage return in *text* with a space."""
        return text.translate(self._SPACE_TABLE)

    @staticmethod
    def _column_values(df, name):
        """Return the values of column *name* as a list, or '' per row if it is absent."""
        if name in df.columns:
            return df[name].tolist()
        return [''] * len(df)

    def filter_comments(self, df_comments):
        """Group usable comment bodies by the id of the item they reply to.

        A comment is dropped when its body is missing, is one of the
        deleted/removed markers, or contains (case-insensitively) one of
        ``config.comment_invalid_phrases``, or when it has no parent id.

        Args:
            df_comments: DataFrame read from the comments file; the
                ``parent_id`` and ``body`` columns are used.

        Returns:
            tuple: ``(comments_dict, comments_orig_dict)``. ``comments_dict``
            is always empty and only kept so callers can keep unpacking two
            values. ``comments_orig_dict`` maps the parent id without its
            ``t1_``/``t3_`` style prefix to the list of CSV-compliant comment
            bodies, in input order.
        """
        comments_dict = {}
        comments_orig_dict = {}
        trace = self.debug
        # Bodies are lower-cased before matching, so the phrases must be too.
        invalid_phrases = [phrase.lower() for phrase in self.config.comment_invalid_phrases]

        print(f"[DEBUG] Total comments before filtering: {len(df_comments)}")

        parent_ids = self._column_values(df_comments, 'parent_id')
        bodies = self._column_values(df_comments, 'body')
        for parent_id, comment_body in zip(parent_ids, bodies):
            if isinstance(parent_id, str):
                parent_id = parent_id.split('_', 1)[-1]  # Remove prefix like 't1_'
                if trace:
                    print(f"[DEBUG] Processed parent_id: {parent_id}")
            elif pd.isna(parent_id):
                # A missing parent id must not become a NaN dictionary key.
                parent_id = ''

            if not isinstance(comment_body, str):
                # Missing bodies count as empty; other scalars are kept as text.
                comment_body = '' if pd.isna(comment_body) else str(comment_body)
            if trace:
                print(f"[DEBUG] Original comment body: {comment_body}")

            # Skip invalid comments
            lowered = comment_body.lower()
            if comment_body in self._EMPTY_BODIES or any(phrase in lowered for phrase in invalid_phrases):
                if trace:
                    print(f"[DEBUG] Skipping invalid comment: {comment_body}")
                continue
            if parent_id == '':
                continue

            # Clean comment body
            comment_body = self.filter_csv_compliant(comment_body)

            # Build comment dictionary
            bucket = comments_orig_dict.get(parent_id)
            if bucket is None:
                comments_orig_dict[parent_id] = [comment_body]
                if trace:
                    print(f"[DEBUG] Created new entry for parent_id {parent_id}")
            else:
                bucket.append(comment_body)
                if trace:
                    print(f"[DEBUG] Appended comment to parent_id {parent_id}")

        print(f"[DEBUG] Filtered {len(comments_orig_dict)} unique parent_ids with valid comments.")
        return comments_dict, comments_orig_dict

    def clean_dataframe(self, df):
        """Return a filtered copy of a submissions DataFrame.

        Drops every column whose name starts with 'Unnamed', removes rows
        whose ``hasImage`` value reads 'true' (case-insensitive) or whose
        ``subreddit`` contains 'photo' (case-insensitive), then drops the
        ``created_utc``, ``domain`` and ``image_url`` columns when present.
        Rows with a missing ``subreddit`` are kept. The input frame is not
        modified.

        Args:
            df: DataFrame with at least ``hasImage`` and ``subreddit`` columns.

        Returns:
            pandas.DataFrame: The filtered frame.
        """
        print(f"[DEBUG] Starting dataframe cleaning. Initial columns: {df.columns}")

        # Remove unnamed columns (str() guards against non-string labels).
        unnamed_cols = [col for col in df.columns if str(col).startswith('Unnamed')]
        df = df.drop(columns=unnamed_cols)
        print(f"[DEBUG] Removed unnamed columns. Remaining columns: {df.columns}")

        # Filter rows based on 'hasImage' and 'subreddit'
        initial_row_count = len(df)
        has_image = df['hasImage'].astype(str).str.lower().eq('true')
        df = df[~has_image]
        print(f"[DEBUG] Rows after removing hasImage=True: {len(df)} (removed {initial_row_count - len(df)})")
        # astype(str) + na=False: a missing subreddit used to produce NaN in
        # the mask, and negating that mask raised a TypeError.
        is_photo = df['subreddit'].astype(str).str.lower().str.contains('photo', na=False)
        df = df[~is_photo]
        print(f"[DEBUG] Rows after removing subreddits containing 'photo': {len(df)}")
        # df = df[df['subreddit'].str.lower().str.contains('news')]
        # print(f"[DEBUG] Rows after filtering subreddits containing 'news': {len(df)}")

        # Drop unnecessary columns (non-inplace: df is a filtered slice here).
        drop_columns = ['created_utc', 'domain', 'image_url']
        df = df.drop(columns=drop_columns, errors='ignore')
        print(f"[DEBUG] Dropped columns: {drop_columns}. Remaining columns: {df.columns}")

        return df

# Main function to process data in chunks
def process_data_in_chunks(config: "Config", cleaner: "Cleaner", submissions_path, comments_path, output_path,
                           use_static_data=False):
    """Attach comments to submissions chunk by chunk and write the merged result.

    The comments file is loaded completely and grouped by parent id with
    ``cleaner.filter_comments``. The submissions CSV is then read in chunks of
    ``config.chunk_size`` rows; each chunk is passed through
    ``cleaner.clean_dataframe``, gets a ``comments_orig`` column holding the
    distinct comments of that submission joined by '|__|' (first-seen order),
    loses the rows without comments and is written to
    ``cleaned_chunk_<n>_no-photo.csv`` in *output_path*. Finally the chunk
    files written by THIS call are concatenated, in chunk order, into
    ``merged_cleaned_data_v33_no_photo_v6.csv``.

    Args:
        config: Object providing ``chunk_size``.
        cleaner: Object providing ``filter_comments`` and ``clean_dataframe``.
        submissions_path: Path of the submissions CSV (read as latin1).
        comments_path: Path of the tab-separated comments file (read as UTF-8).
        output_path: Directory for the chunk files and the merged file; it is
            created when missing.
        use_static_data: When true, a small built-in sample replaces both
            input files (the paths are then ignored). Defaults to ``False``.

    Returns:
        None. Any exception is reported on stdout instead of being raised.
    """
    import os  # local import: this cell's import list does not include os

    try:
        os.makedirs(output_path, exist_ok=True)

        # Load the datasets
        if use_static_data:
            df_submissions_clean, df_comments = _static_test_frames()
            chunk_iterator = [df_submissions_clean]  # Wrap the DataFrame in a list for iteration
        else:
            # Load comments
            print("[DEBUG] Loading comments dataset...")
            df_comments = pd.read_csv(comments_path, sep='\t', encoding='utf-8', on_bad_lines='warn', low_memory=True)
            # Load submissions in chunks
            print("[DEBUG] Loading submissions dataset in chunks...")
            chunk_iterator = pd.read_csv(submissions_path, encoding='latin1', low_memory=True, on_bad_lines='skip', chunksize=config.chunk_size)

        _, comments_orig_dict = cleaner.filter_comments(df_comments)

        def join_comments(submission_id):
            # dict.fromkeys removes duplicates like set() did, but keeps the
            # first-seen order so the output is identical from run to run.
            return '|__|'.join(dict.fromkeys(comments_orig_dict.get(submission_id, [])))

        written_files = []
        for chunk_count, chunk in enumerate(chunk_iterator, start=1):
            print(f"[DEBUG] Processing chunk {chunk_count} with {len(chunk)} rows...")
            chunk = cleaner.clean_dataframe(chunk)

            # Map comments to submissions
            print("[DEBUG] Mapping comments to submissions...")
            if 'id' in chunk.columns:
                joined = chunk['id'].map(join_comments)
            else:
                # Without an id column nothing can be matched.
                joined = pd.Series('', index=chunk.index, dtype=object)
            # assign() returns a new frame, so no column is set on a slice.
            chunk = chunk.assign(comments_orig=joined)
            print("[DEBUG] Finished mapping comments for chunk.")

            # Remove rows without comments
            initial_chunk_row_count = len(chunk)
            chunk = chunk[chunk['comments_orig'] != '']
            print(f"[DEBUG] Rows after filtering out empty comments: {len(chunk)} (removed {initial_chunk_row_count - len(chunk)})")

            # Save cleaned chunk
            chunk_path = os.path.join(output_path, f"cleaned_chunk_{chunk_count}_no-photo.csv")
            chunk.to_csv(chunk_path, index=False)
            written_files.append(chunk_path)
            print(f"[DEBUG] Chunk {chunk_count} saved with {len(chunk)} rows.")

        # Combine the chunks written above into a single file. Only files from
        # this run are used: globbing the directory also picked up chunk files
        # left behind by earlier runs and merged them in arbitrary order.
        if not written_files:
            print("[DEBUG] No chunks were produced; nothing to merge.")
            return
        print(f"[DEBUG] Combining {len(written_files)} cleaned chunk files...")
        combined_df = pd.concat([pd.read_csv(path) for path in written_files], ignore_index=True)
        merged_path = os.path.join(output_path, "merged_cleaned_data_v33_no_photo_v6.csv")
        combined_df.to_csv(merged_path, index=False)
        print(f"[DEBUG] All chunks merged and saved to {merged_path}.")

    except Exception as e:
        # Top-level boundary of the notebook cell: report instead of raising.
        print(f"[ERROR] An error occurred: {e}")


def _static_test_frames():
    """Build the small built-in sample as ``(submissions_df, comments_df)``."""
    static_comments_data = {
        'id': [3665229, 8035415, *range(8261647, 8261685), *range(8241541, 8241546), *range(8231680, 8231683)],
        'author': ['kutuup1989', 'cpzjcsn', 'RabbiVolesSolo', 'now_stop_that', '', 'Gwunt', 'Kuritos', 'msbutah', 'HomerJunior', '', '', 'IratePieRater', 'MakesPensDance', 'showmeyourlove', 'Tetragrade', 'corvustock', 'CrispyPudding', 'lifeisjest', 'somestupidname1', 'GoghGirl', 'GoghGirl']
                  + ['paulrulez742'] * 17
                  + ['IratePieRater', 'paulrulez742', '', 'Mobbasta', '', 'DoofusMagnus', 'ELF4000', 'stimpakish', 'phd_dude', ''],
        'body': ["I may have spoken too soon in my previous comment.\n\nRelevant:\n\nKaren Raines Keller\n\nResponses from Blakely Elementary School to the Lego article...", "?", ";)", ";)", ";)", ";)", ";)", ";)", "( ͡~ ͜ʖ ͡°)", "[deleted]", "[deleted]", ";)", ";)", ";)", ":^)", ";)", "+‿-", "Some really weird shit pops up in r/all ...", ";)", "^(^-^) ^", "(;"]
                + [";)"] * 17
                + ["^(^—^) ^", "(;", "\"One of us. One of us.\"", "._.", "•____•", ".▂.", ".-.", ":[]", ":[ ]", "[deleted]"],
        'isTopLevel': [True] * 21 + [False] * 19 + [True] * 6 + [False] * 2,
        'parent_id': ['t3_3tjd87', 't3_3148se'] + ['t3_3iqltn'] * 19
                     + ['t1_cuiqotf', 't1_cuitm7b', 't1_cuiu4f7', 't1_cuj34lk', 't1_cuj349r', 't1_cuj64dt', 't1_cuj7uzl', 't1_cujjn82', 't1_cujl3rx', 't1_cujarhm', 't1_cujcwje', 't1_cujebjb', 't1_cuivqrs', 't1_cujmqmf', 't1_cujoke2', 't1_cuj7h2o', 't1_cujgz0t', 't1_cujt1t1', 't1_cujj9h0']
                     + ['t3_53pohv', 't3_53ov0q', 't3_53ov0q', 't3_53ov0q', 't3_53ov0q', 't3_5rpn5l', 't1_dd954mt', 't1_dd962ss'],
        'submission_id': ['3tjd87', '3148se'] + ['3iqltn'] * 38 + ['53pohv'] + ['53ov0q'] * 4 + ['5rpn5l'] * 3,
        'ups': [7.0, 3.0, 16.0, 11.0, 9.0, 7.0, 6.0, 7.0, 8.0, 3.0, 3.0, 4.0, 4.0, 6.0, 5.0, 2.0, 2.0, 4.0, 2.0, 1.0, 1.0,
                6.0, 7.0, 9.0, 7.0, 5.0, 1.0, 5.0, 1.0, 2.0, 2.0, 3.0, 1.0, 4.0, 2.0, 2.0, 4.0, 1.0, 2.0, 2.0,
                1.0, 15.0, 6.0, 5.0, 1.0, 1.0, 1.0, 1.0],
    }

    static_submissions_data = {
        'Unnamed: 0': [2402, 10646, 149720, 744801, 167700, 456042],
        'Unnamed: 0.1': [2402, 10646, 149720, 744801, 167700, 456042],
        'Unnamed: 0.1.1': [None, None, None, None, None, None],
        'author': ['scapler', 'rpilek', 'paulrulez742', 'tinywinner', 'Wizard-ette', 'patho5'],
        'clean_title': ['blakely teacher restricts legoplay to her girl students in the pursuit of gender equity', 'australian wwi poster', 'sure sit on me and spin', 'this horde of ikea drawer stoppers', 'oo', ':[]'],
        'created_utc': [1448000720.0, 1427925770.0, 1440776896.0, 1474406090.0, 1474396501.0, 1486071264.0],
        'domain': ['bainbridgereview.com', 'i.imgur.com', 'imgur.com', 'i.reddituploads.com', 'imgur.com', 'i.reddituploads.com'],
        'hasImage': [False, True, True, True, True, True],
        'id': ['3tjd87', '3148se', '3iqltn', '53pohv', '53ov0q', '5rpn5l'],
        'image_url': [
            '',
            'https://external-preview.redd.it/4xOStvKJZnwjGoP5gqEnms7etyLq6pRFp7h0JeWKcsU.jpg?width=320&crop=smart&auto=webp&s=cfde9ddf38c4916dacfb5c424d4de3507799c32e',
            'https://external-preview.redd.it/Lc7LWsIEOudLxy-n-TENIY2BshKJBpQbg0Z7NfcKuy4.jpg?width=320&crop=smart&auto=webp&s=9249dd6b86aeb2aafd6f2de07582d3874026d7c5',
            'https://external-preview.redd.it/Z6P_PMpruvYHpjE7ZA72IJxva-MYzxQRSojcQ7MY5xQ.jpg?width=320&crop=smart&auto=webp&s=4f4d318cc07d58ccee4424f225c0f700e57ba917',
            'https://external-preview.redd.it/_nMpG6S5mOrZH53Z190jRm5XACqubYFblL3rw_0v0yc.jpg?width=320&crop=smart&auto=webp&s=8ae0082fcafd2397225b7c9ee3fc8a6e7f987368',
            'https://external-preview.redd.it/GWZg6zeKP0Opjzb1xSixCMyadWkjs2xVOV-B6izCfvQ.jpg?width=320&crop=smart&auto=webp&s=0a424b912b60aafeaeda7ab88d0001d8ce1f3fd9',
        ],
        'linked_submission_id': [None, None, None, None, None, None],
        'num_comments': [604.0, 2.0, 39.0, 1.0, 4.0, 3.0],
        'score': [1327, 36, 720, 15, 162, 11],
        'subreddit': ['nottheonion', 'propagandaposters', 'pareidolia', 'pareidolia', 'pareidolia', 'pareidolia'],
        'title': ['Blakely teacher restricts Lego-play to her girl students in the pursuit of gender equity', 'Australian WWI Poster. 1914-1918.', 'Sure, sit on me and spin', 'This horde of IKEA drawer stoppers.', 'o_o', ':[]'],
        'upvote_ratio': [0.89, 0.95, 0.96, 0.74, 0.97, 0.92],
        '2_way_label': [1, 0, 0, 0, 0, 0],
        '3_way_label': [0, 1, 2, 2, 2, 2],
        '6_way_label': [0, 5, 2, 2, 2, 2],
    }

    return pd.DataFrame(static_submissions_data), pd.DataFrame(static_comments_data)

# Main execution
if __name__ == "__main__":
    # Locations of the inputs and of the output directory on the mounted Drive.
    submissions_path = '/content/drive/MyDrive/TUS/Engineering_Project/data/concatted_all_data.csv'
    comments_path = '/content/drive/MyDrive/TUS/Engineering_Project/data/source_files/all_comments.tsv'
    output_path = '/content/drive/MyDrive/TUS/Engineering_Project/data/cleaned_chunks'

    # Cleaning parameters and the cleaner that uses them.
    config = Config()
    cleaner = Cleaner(config)

    # Run the chunked cleaning pipeline.
    process_data_in_chunks(
        config,
        cleaner,
        submissions_path,
        comments_path,
        output_path,
    )


In [None]:
# # Export all FAKE subreddit titles
# import pandas as pd
# import logging

# # Set up logging
# logging.basicConfig(filename='query_results.log', level=logging.INFO, 
#                     format='%(asctime)s - %(levelname)s - %(message)s')

# class DataInMemory:
#     def __init__(self, submissions_path):
#         self.submissions_path = submissions_path
#         self.df_submissions = None

#     def load_data(self):
#         try:
#             # Load the submissions dataset into memory
#             print("[DEBUG] Loading submissions dataset...")
#             self.df_submissions = pd.read_csv(self.submissions_path, encoding='latin1', low_memory=True)
#             print(f"[DEBUG] Submissions dataset loaded with {len(self.df_submissions)} rows.")
#         except Exception as e:
#             print(f"[ERROR] An error occurred while loading submissions data: {e}")

#     def filter_and_export_submissions(self, output_file):
#         try:
#             print("[DEBUG] Filtering submissions with upvote_ratio > 0.90...")

#             # Filter rows where 'upvote_ratio' > 0.90
#             filtered_df = self.df_submissions[self.df_submissions['upvote_ratio'] > 0.90]
#             print(f"[DEBUG] Filtered {len(filtered_df)} submissions with upvote_ratio > 0.90.")

#             # Export to CSV
#             filtered_df.to_csv(output_file, index=False)
#             print(f"[DEBUG] Filtered submissions exported to {output_file}.")
            
#             # Log results
#             logging.info(f"Filtered {len(filtered_df)} submissions with upvote_ratio > 0.90 and exported to {output_file}.")
#         except Exception as e:
#             print(f"[ERROR] An error occurred during filtering or exporting: {e}")
#             logging.error(f"An error occurred during filtering or exporting: {e}")

# # Main execution
# if __name__ == "__main__":
#     # Define paths
#     submissions_path = '/content/drive/MyDrive/TUS/Engineering_Project/data/concatted_all_data.csv'
#     output_file = '/content/drive/MyDrive/TUS/Engineering_Project/data/filtered_submissions_fake_ones.csv'

#     # Initialize and load datasets into memory
#     data_manager = DataInMemory(submissions_path)
#     data_manager.load_data()

#     # Filter and export submissions with upvote_ratio > 0.90
#     data_manager.filter_and_export_submissions(output_file)

In [None]:
# # No PHOTO submissions - original comments - just compatible with Excel - no stopwords or NLTK cleaning

import pandas as pd
import re
import glob
import csv

# Configuration class for cleaning parameters
class Config:
    """Tunable parameters used by the cleaning code in this cell."""

    def __init__(self):
        # Passed as ``chunksize`` when the submissions CSV is read.
        self.chunk_size = 10000
        # Texts longer than this many characters are cut by Cleaner.clean_text.
        self.max_text_length = 500
        # Substrings that mark a comment as boilerplate / bot output.
        # Cleaner.filter_comments looks for them in the LOWER-CASED comment
        # body, so every phrase is normalised to lower case here; the two
        # mixed-case entries could otherwise never match anything.
        self.comment_invalid_phrases = [
            phrase.lower()
            for phrase in (
                'post comment without original photoshop',
                'please post comment photoshop reply',
                'unfortunately, your submission has been removed from',
                'google image search automated response',
                'is a curated space.**',
                'In order not to get your comment removed',
                'm a bot so if I was wrong',
            )
        ]
        # self.stop_words = set(stopwords.words('english'))
        # Token that Cleaner.clean_text substitutes for the '|__|' delimiter.
        self.placeholder = "placeholderspecialdelim"

# Cleaner class to encapsulate data cleaning
class Cleaner:
    """Text, comment and DataFrame cleaning helpers driven by a config object.

    Args:
        config: Object providing ``max_text_length``, ``placeholder`` and
            ``comment_invalid_phrases``.
        debug: When true (the default, which keeps the previous output) every
            processed text / comment is traced to stdout. Pass ``False`` to
            silence the per-item traces; summary lines are printed either way.
    """

    # Comment bodies that carry no content and are always discarded.
    _EMPTY_BODIES = frozenset({'NaN', 'deleted', 'removed', '', '[deleted]', '[removed]'})

    def __init__(self, config: "Config", debug: bool = True):
        self.config = config
        self.debug = debug
        # self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        """Strip URLs, tags and non-letters from *text*, lower-case and truncate it.

        Args:
            text: The raw text. Any non-string value (the float NaN pandas
                uses for missing cells, ``None``, ...) yields ''.

        Returns:
            str: The cleaned text. Texts longer than ``config.max_text_length``
            are cut to that length and suffixed with '...|__|'.
        """
        if not isinstance(text, str):  # Handle NaN / None / other non-text
            if self.debug:
                print(f"[DEBUG] Encountered NaN or invalid text: {text}")
            return ''

        # Replace the special delimiter so the character filter below keeps it.
        # NOTE(review): the placeholder is never converted back to '|__|'
        # before returning - confirm whether callers expect that.
        text = text.replace('|__|', self.config.placeholder)
        if self.debug:
            print(f"[DEBUG] After replacing special delimiter: {text}")

        # Remove unwanted characters
        text = re.sub(r'http\S+', '', text)  # Remove URLs
        text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
        # The placeholder's characters are whitelisted; re.escape keeps the
        # character class valid even if the placeholder contains ']' or '-'.
        text = re.sub(r'[^a-zA-Z\s' + re.escape(self.config.placeholder) + ']', '', text)
        if self.debug:
            print(f"[DEBUG] After cleaning unwanted characters: {text}")

        # Convert text to lowercase
        text = text.lower()
        if self.debug:
            print(f"[DEBUG] Converted text to lowercase: {text}")

        # Truncate text if it exceeds the maximum length
        if len(text) > self.config.max_text_length:
            if self.debug:
                print(f"[DEBUG] Truncating text to max length of {self.config.max_text_length}")
            text = text[:self.config.max_text_length] + '...' + '|__|'

        return text

    def filter_csv_compliant(self, text):
        """Return *text* serialised as a single CSV field.

        The text is passed through ``csv.writer`` with ``QUOTE_MINIMAL``: a
        value containing a comma, a double quote or a line break is returned
        wrapped in double quotes with inner quotes doubled; anything else is
        returned as is. Surrounding whitespace, including the writer's line
        terminator, is stripped.

        Args:
            text (str): The input text string to be filtered.

        Returns:
            str: The CSV-compliant text string.

        Raises:
            ValueError: If *text* is not a string.
        """
        import io  # local import: this cell only imports csv, not io

        if not isinstance(text, str):
            raise ValueError("Input must be a string")

        # Escape special characters for CSV. io.StringIO is used directly
        # instead of the undocumented csv.StringIO re-export.
        buffer = io.StringIO()
        csv.writer(buffer, quoting=csv.QUOTE_MINIMAL).writerow([text])

        return buffer.getvalue().strip()

    @staticmethod
    def _column_values(df, name):
        """Return the values of column *name* as a list, or '' per row if it is absent."""
        if name in df.columns:
            return df[name].tolist()
        return [''] * len(df)

    def filter_comments(self, df_comments):
        """Group usable comment bodies by the id of the item they reply to.

        A comment is dropped when its body is missing, is one of the
        deleted/removed markers, or contains (case-insensitively) one of
        ``config.comment_invalid_phrases``, or when it has no parent id.

        Args:
            df_comments: DataFrame read from the comments file; the
                ``parent_id`` and ``body`` columns are used.

        Returns:
            tuple: ``(comments_dict, comments_orig_dict)``. ``comments_dict``
            is always empty and only kept so callers can keep unpacking two
            values. ``comments_orig_dict`` maps the parent id without its
            ``t1_``/``t3_`` style prefix to the list of CSV-compliant comment
            bodies, in input order.
        """
        comments_dict = {}
        comments_orig_dict = {}
        trace = self.debug
        # Bodies are lower-cased before matching, so the phrases must be too.
        invalid_phrases = [phrase.lower() for phrase in self.config.comment_invalid_phrases]

        print(f"[DEBUG] Total comments before filtering: {len(df_comments)}")

        parent_ids = self._column_values(df_comments, 'parent_id')
        bodies = self._column_values(df_comments, 'body')
        for parent_id, comment_body in zip(parent_ids, bodies):
            if isinstance(parent_id, str):
                parent_id = parent_id.split('_', 1)[-1]  # Remove prefix like 't1_'
                if trace:
                    print(f"[DEBUG] Processed parent_id: {parent_id}")
            elif pd.isna(parent_id):
                # A missing parent id must not become a NaN dictionary key.
                parent_id = ''

            if not isinstance(comment_body, str):
                # Missing bodies count as empty; other scalars are kept as text.
                comment_body = '' if pd.isna(comment_body) else str(comment_body)
            if trace:
                print(f"[DEBUG] Original comment body: {comment_body}")

            # Skip invalid comments
            lowered = comment_body.lower()
            if comment_body in self._EMPTY_BODIES or any(phrase in lowered for phrase in invalid_phrases):
                if trace:
                    print(f"[DEBUG] Skipping invalid comment: {comment_body}")
                continue
            if parent_id == '':
                continue

            # Clean comment body
            comment_body = self.filter_csv_compliant(comment_body)

            # Build comment dictionary
            bucket = comments_orig_dict.get(parent_id)
            if bucket is None:
                comments_orig_dict[parent_id] = [comment_body]
                if trace:
                    print(f"[DEBUG] Created new entry for parent_id {parent_id}")
            else:
                bucket.append(comment_body)
                if trace:
                    print(f"[DEBUG] Appended comment to parent_id {parent_id}")

        print(f"[DEBUG] Filtered {len(comments_orig_dict)} unique parent_ids with valid comments.")
        return comments_dict, comments_orig_dict

    def clean_dataframe(self, df):
        """Return a filtered copy of a submissions DataFrame.

        Drops every column whose name starts with 'Unnamed', removes rows
        whose ``hasImage`` value reads 'true' (case-insensitive) or whose
        ``subreddit`` contains 'photo' (case-insensitive), then drops the
        ``created_utc``, ``domain`` and ``image_url`` columns when present.
        Rows with a missing ``subreddit`` are kept. The input frame is not
        modified.

        Args:
            df: DataFrame with at least ``hasImage`` and ``subreddit`` columns.

        Returns:
            pandas.DataFrame: The filtered frame.
        """
        print(f"[DEBUG] Starting dataframe cleaning. Initial columns: {df.columns}")

        # Remove unnamed columns (str() guards against non-string labels).
        unnamed_cols = [col for col in df.columns if str(col).startswith('Unnamed')]
        df = df.drop(columns=unnamed_cols)
        print(f"[DEBUG] Removed unnamed columns. Remaining columns: {df.columns}")

        # Filter rows based on 'hasImage' and 'subreddit'
        initial_row_count = len(df)
        has_image = df['hasImage'].astype(str).str.lower().eq('true')
        df = df[~has_image]
        print(f"[DEBUG] Rows after removing hasImage=True: {len(df)} (removed {initial_row_count - len(df)})")
        # astype(str) + na=False: a missing subreddit used to produce NaN in
        # the mask, and negating that mask raised a TypeError.
        is_photo = df['subreddit'].astype(str).str.lower().str.contains('photo', na=False)
        df = df[~is_photo]
        print(f"[DEBUG] Rows after removing subreddits containing 'photo': {len(df)}")
        # df = df[df['subreddit'].str.lower().str.contains('news')]
        # print(f"[DEBUG] Rows after filtering subreddits containing 'news': {len(df)}")

        # Drop unnecessary columns (non-inplace: df is a filtered slice here).
        drop_columns = ['created_utc', 'domain', 'image_url']
        df = df.drop(columns=drop_columns, errors='ignore')
        print(f"[DEBUG] Dropped columns: {drop_columns}. Remaining columns: {df.columns}")

        return df

# Main function to process data in chunks
def process_data_in_chunks(config: "Config", cleaner: "Cleaner", submissions_path, comments_path, output_path):
    """Attach comments to submissions chunk by chunk and write the merged result.

    The comments file is loaded completely and grouped by parent id with
    ``cleaner.filter_comments``. The submissions CSV is then read in chunks of
    ``config.chunk_size`` rows; each chunk is passed through
    ``cleaner.clean_dataframe``, gets a ``comments_orig`` column holding the
    distinct comments of that submission joined by '|__|' (first-seen order),
    loses the rows without comments and is written to
    ``cleaned_chunk_<n>_no-photo.csv`` in *output_path*. Finally the chunk
    files written by THIS call are concatenated, in chunk order, into
    ``merged_cleaned_data_v33_no_photo.csv``.

    Args:
        config: Object providing ``chunk_size``.
        cleaner: Object providing ``filter_comments`` and ``clean_dataframe``.
        submissions_path: Path of the submissions CSV (read as latin1).
        comments_path: Path of the tab-separated comments file (read as UTF-8).
        output_path: Directory for the chunk files and the merged file; it is
            created when missing.

    Returns:
        None. Any exception is reported on stdout instead of being raised.
    """
    import os  # local import: this cell's import list does not include os

    try:
        os.makedirs(output_path, exist_ok=True)

        # Load comments
        print("[DEBUG] Loading comments dataset...")
        df_comments = pd.read_csv(comments_path, sep='\t', encoding='utf-8', on_bad_lines='warn', low_memory=True)
        print(f"[DEBUG] Loaded {len(df_comments)} rows from comments dataset.")

        _, comments_orig_dict = cleaner.filter_comments(df_comments)

        def join_comments(submission_id):
            # dict.fromkeys removes duplicates like set() did, but keeps the
            # first-seen order so the output is identical from run to run.
            return '|__|'.join(dict.fromkeys(comments_orig_dict.get(submission_id, [])))

        # Load submissions in chunks
        print("[DEBUG] Loading submissions dataset in chunks...")
        chunk_iterator = pd.read_csv(submissions_path, encoding='latin1', low_memory=True, on_bad_lines='skip', chunksize=config.chunk_size)

        written_files = []
        for chunk_count, chunk in enumerate(chunk_iterator, start=1):
            print(f"[DEBUG] Processing chunk {chunk_count} with {len(chunk)} rows...")
            chunk = cleaner.clean_dataframe(chunk)

            # Map comments to submissions
            print("[DEBUG] Mapping comments to submissions...")
            if 'id' in chunk.columns:
                joined = chunk['id'].map(join_comments)
            else:
                # Without an id column nothing can be matched.
                joined = pd.Series('', index=chunk.index, dtype=object)
            # assign() returns a new frame, so no column is set on a slice.
            chunk = chunk.assign(comments_orig=joined)
            print("[DEBUG] Finished mapping comments for chunk.")

            # Remove rows without comments
            initial_chunk_row_count = len(chunk)
            chunk = chunk[chunk['comments_orig'] != '']
            print(f"[DEBUG] Rows after filtering out empty comments: {len(chunk)} (removed {initial_chunk_row_count - len(chunk)})")

            # Save cleaned chunk
            chunk_path = os.path.join(output_path, f"cleaned_chunk_{chunk_count}_no-photo.csv")
            chunk.to_csv(chunk_path, index=False)
            written_files.append(chunk_path)
            print(f"[DEBUG] Chunk {chunk_count} saved with {len(chunk)} rows.")

        # Combine the chunks written above into a single file. Only files from
        # this run are used: globbing the directory also picked up chunk files
        # left behind by earlier runs and merged them in arbitrary order.
        if not written_files:
            print("[DEBUG] No chunks were produced; nothing to merge.")
            return
        print(f"[DEBUG] Combining {len(written_files)} cleaned chunk files...")
        combined_df = pd.concat([pd.read_csv(path) for path in written_files], ignore_index=True)
        merged_path = os.path.join(output_path, "merged_cleaned_data_v33_no_photo.csv")
        combined_df.to_csv(merged_path, index=False)
        print(f"[DEBUG] All chunks merged and saved to {merged_path}.")

    except Exception as e:
        # Top-level boundary of the notebook cell: report instead of raising.
        print(f"[ERROR] An error occurred: {e}")

# Main execution
if __name__ == "__main__":
    # Locations of the inputs and of the output directory on the mounted Drive.
    submissions_path = '/content/drive/MyDrive/TUS/Engineering_Project/data/concatted_all_data.csv'
    comments_path = '/content/drive/MyDrive/TUS/Engineering_Project/data/source_files/all_comments.tsv'
    output_path = '/content/drive/MyDrive/TUS/Engineering_Project/data/cleaned_chunks'

    # Cleaning parameters and the cleaner that uses them.
    config = Config()
    cleaner = Cleaner(config)

    # Run the chunked cleaning pipeline.
    process_data_in_chunks(
        config,
        cleaner,
        submissions_path,
        comments_path,
        output_path,
    )


In [None]:
# # Get data from 3 files by id(for comments mapped to submission_id or parent_id)


# import pandas as pd
# import logging

# # Set up logging
# logging.basicConfig(filename='query_results.log', level=logging.INFO, 
#                     format='%(asctime)s - %(levelname)s - %(message)s')

# class DataInMemory:
#     def __init__(self, submissions_path, comments_path, filtered_submissions_path):
#         self.submissions_path = submissions_path
#         self.comments_path = comments_path
#         self.filtered_submissions_path = filtered_submissions_path
#         self.df_submissions = None
#         self.df_comments = None
#         self.df_filtered_submissions = None

#     def load_data(self):
#         try:
#             # Load datasets into memory
#             print("[DEBUG] Loading submissions dataset...")
#             self.df_submissions = pd.read_csv(self.submissions_path, encoding='latin1', low_memory=True)
#             print(f"[DEBUG] Submissions dataset loaded with {len(self.df_submissions)} rows.")
            
#             print("[DEBUG] Loading comments dataset...")
#             self.df_comments = pd.read_csv(self.comments_path, sep='\t', encoding='utf-8', low_memory=True)
#             print(f"[DEBUG] Comments dataset loaded with {len(self.df_comments)} rows.")
            
#             print("[DEBUG] Loading filtered submissions dataset...")
#             self.df_filtered_submissions = pd.read_csv(self.filtered_submissions_path, encoding='latin1', low_memory=True)
#             print(f"[DEBUG] Filtered submissions dataset loaded with {len(self.df_filtered_submissions)} rows.")
#         except Exception as e:
#             print(f"[ERROR] An error occurred while loading data: {e}")

#     def get_data_by_id(self, data_id):
#         try:
#             # Debug log for the query
#             print(f"[DEBUG] Searching for ID: {data_id}")
            
#             # Search in submissions dataset
#             submission_row = self.df_submissions[self.df_submissions['id'] == data_id]
#             if not submission_row.empty:
#                 print(f"[DEBUG] Found ID in submissions dataset.")
#             else:
#                 print(f"[DEBUG] ID not found in submissions dataset.")

#             # Search in comments dataset
#             comments_row = self.df_comments[
#                 (self.df_comments['parent_id'].str.contains(data_id, na=False)) | 
#                 (self.df_comments['submission_id'] == data_id)
#             ]
#             if not comments_row.empty:
#                 print(f"[DEBUG] Found ID in comments dataset.")
#             else:
#                 print(f"[DEBUG] ID not found in comments dataset.")

#             # Search in filtered submissions dataset
#             filtered_row = self.df_filtered_submissions[self.df_filtered_submissions['id'] == data_id]
#             if not filtered_row.empty:
#                 print(f"[DEBUG] Found ID in filtered submissions dataset.")
#             else:
#                 print(f"[DEBUG] ID not found in filtered submissions dataset.")

#             # Log the results
#             logging.info(f"Query ID: {data_id}")
#             logging.info(f"Submissions data: {submission_row.to_dict(orient='records')}")
#             logging.info(f"Comments data: {comments_row.to_dict(orient='records')}")
#             logging.info(f"Filtered submissions data: {filtered_row.to_dict(orient='records')}")

#             # Return all the data
#             return {
#                 "submissions": submission_row,
#                 "comments": comments_row,
#                 "filtered_submissions": filtered_row
#             }
#         except Exception as e:
#             print(f"[ERROR] An error occurred while retrieving data for ID {data_id}: {e}")
#             logging.error(f"An error occurred for ID {data_id}: {e}")
#             return None

# # Main execution
# if __name__ == "__main__":
#     # Define paths
#     submissions_path = '/content/drive/MyDrive/TUS/Engineering_Project/data/concatted_all_data.csv'
#     comments_path = '/content/drive/MyDrive/TUS/Engineering_Project/data/source_files/all_comments.tsv'
#     filtered_submissions_path = '/content/drive/MyDrive/TUS/Engineering_Project/data/cleaned_chunks/merged_cleaned_data_v30_news.csv'

#     # Initialize and load datasets into memory
#     data_manager = DataInMemory(submissions_path, comments_path, filtered_submissions_path)
#     data_manager.load_data()

#     # Query data by entering different IDs
#     while True:
#         data_id = input("Enter the ID to search (or type 'exit' to quit): ").strip()
#         if data_id.lower() == 'exit':
#             print("Exiting the program. Goodbye!")
#             break

#         result = data_manager.get_data_by_id(data_id)
#         if result:
#             print("\n[RESULT] Submissions data:")
#             print(result["submissions"])
#             print("\n[RESULT] Comments data:")
#             print(result["comments"])
#             print("\n[RESULT] Filtered submissions data:")
#             print(result["filtered_submissions"])
#         else:
#             print("[RESULT] No data found for the provided ID.")


