In [1]:
import pandas as pd
import regex as re
import os

In [2]:
# Load the pickled question set that ships next to this notebook.
quizmaster_pkl = './400QA_Understanding_QuizMaster_Codebase.pkl'
df = pd.read_pickle(quizmaster_pkl)

In [3]:
import pickle  # not used by this cell (pd.read_pickle does the loading); kept for later cells


def drop_keyword_questions(frame, keywords):
    """Return the rows of ``frame`` whose question text mentions no keyword.

    Matching is a case-insensitive, literal substring test against the
    ``question_text`` column. Keywords are lower-cased before comparison, so
    mixed-case entries such as 'Singleton' still match. Surrounding spaces in
    a keyword (e.g. ' java ') are significant and preserved. Blank or
    whitespace-only keywords are ignored, because an empty needle would match
    every row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``question_text`` column.
    keywords : iterable of str
        Substrings that disqualify a question.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with all original columns; ``frame`` is not modified.
    """
    # astype(str) stringifies missing/non-string values so .str.lower() cannot fail on them.
    lowered = frame['question_text'].astype(str).str.lower()
    hit = pd.Series(False, index=frame.index)
    for keyword in keywords:
        if not keyword.strip():
            continue
        # regex=False: treat the keyword literally, never as a pattern.
        hit = hit | lowered.str.contains(keyword.lower(), regex=False).astype(bool)
    return frame[~hit]


# Collect every pickle file below the current directory (forward-slash paths).
list_pkl = []
for root, dirs, files in os.walk('.'):
    for file in files:
        if file.endswith('.pkl'):
            list_pkl.append(os.path.join(root, file).replace(os.sep, '/'))

filter_out = ['appender', 'log4j', 'Singleton', 'SLF4J', ' java ']

# Filter out questions containing any of the above keywords.
# Results are kept in memory, keyed by file path; nothing is written back to
# disk here and the previously loaded `df` is left untouched.
# NOTE(review): unpickling can execute arbitrary code -- only run this on trusted files.
filtered_frames = {}
for pkl in list_pkl:
    frame = pd.read_pickle(pkl)
    # Skip pickles that are not question tables instead of raising KeyError/AttributeError.
    if not isinstance(frame, pd.DataFrame) or 'question_text' not in frame.columns:
        continue
    filtered_frames[pkl] = drop_keyword_questions(frame, filter_out)

In [4]:
# Discard keywords that are empty or whitespace-only; surviving entries keep their padding.
filter_out = list(filter(lambda keyword: keyword.strip(), filter_out))

In [5]:
import pandas as pd
import os


def build_keyword_mask(questions, keywords):
    """Return a boolean Series flagging the questions that contain any keyword.

    Matching is a case-insensitive, literal substring test: every keyword is
    lower-cased and compared with ``regex=False``, so characters such as
    ``+``, ``(`` or ``|`` in a keyword cannot change the match. Surrounding
    spaces in a keyword (e.g. ' java ') are significant and preserved.
    Blank or whitespace-only keywords are skipped, because an empty needle
    matches every row and would cause the whole file to be emptied.

    Parameters
    ----------
    questions : pandas.Series
        Question texts; values are stringified with ``astype(str)`` first.
    keywords : iterable of str
        Substrings to look for.

    Returns
    -------
    pandas.Series
        Boolean mask on ``questions.index``; all False when no usable
        keyword is supplied.
    """
    lowered = questions.astype(str).str.lower()
    mask = pd.Series(False, index=questions.index)
    for keyword in keywords:
        if not keyword.strip():
            continue
        hits = lowered.str.contains(keyword.lower(), regex=False, na=False)
        mask = mask | hits.astype(bool)
    return mask


list_pkl = []
# Make sure '.' is the correct starting directory, or specify another path
start_dir = '.'
for root, dirs, files in os.walk(start_dir):
    for file in files:
        if file.endswith('.pkl'):
            # Use forward slashes for consistency
            list_pkl.append(os.path.join(root, file).replace(os.sep, '/'))

# Keywords are matched case-insensitively as literal substrings.
filter_out = ['appender', 'log4j', 'singleton', 'slf4j', ' java ']
# No longer used for matching (see build_keyword_mask); the name is retained
# in case a later cell still reads it.
filter_out_pattern = '|'.join(filter_out)

print(f"Found {len(list_pkl)} pickle files to process.")
print(f"Filtering out questions containing: {filter_out}")
print("--- IMPORTANT: Original files will be overwritten! ---")
print("-" * 30)

for pkl_path in list_pkl:
    print(f"Processing: {pkl_path}")
    try:
        # NOTE(review): unpickling can execute arbitrary code -- only run on trusted files.
        df = pd.read_pickle(pkl_path)

        # Files without the expected column are reported and left untouched.
        if 'question_text' not in df.columns:
            print(f"  Warning: 'question_text' column not found in {pkl_path}. Skipping.")
            continue

        # --- Identify rows to filter out ---
        mask_contains_keyword = build_keyword_mask(df['question_text'], filter_out)

        # --- Print the questions being filtered out ---
        df_to_remove = df[mask_contains_keyword]
        removed_count = len(df_to_remove)

        if removed_count > 0:
            print(f"  --- Questions Filtered Out ({removed_count}) ---")
            # Iterate the column directly; only the original question text is shown.
            for question in df_to_remove['question_text']:
                print(f"    - {question}")
            print(f"  --- End of Filtered Out Questions ---")
        else:
            print("  No questions matched the filter keywords.")

        # --- Filter the DataFrame ---
        # Keep rows where the mask is False (i.e., keyword NOT found)
        df_filtered = df[~mask_contains_keyword]

        original_count = len(df)
        filtered_count = len(df_filtered)

        print(f"  Summary: Original rows: {original_count}, Filtered rows: {filtered_count}, Rows removed: {removed_count}")

        # --- Overwrite the original file ---
        df_filtered.to_pickle(pkl_path)
        print(f"  Overwrote {pkl_path} with filtered data.")
        print("-" * 30)  # Separator between files

    except FileNotFoundError:
        print(f"  Error: File not found {pkl_path}")
        print("-" * 30)
    except Exception as e:
        # Best-effort batch run: report the failure and move on to the next file.
        print(f"  Error processing file {pkl_path}: {e}")
        print("-" * 30)

print("Processing complete. All specified files have been filtered and overwritten.")

Found 3 pickle files to process.
Filtering out questions containing: ['appender', 'log4j', 'singleton', 'slf4j', ' java ']
--- IMPORTANT: Original files will be overwritten! ---
------------------------------
Processing: ./400QA_Understanding_QuizMaster_Codebase.pkl
  No questions matched the filter keywords.
  Summary: Original rows: 395, Filtered rows: 395, Rows removed: 0
  Overwrote ./400QA_Understanding_QuizMaster_Codebase.pkl with filtered data.
------------------------------
Processing: ./70QA_QUIZ_BANK_02.pkl
  No questions matched the filter keywords.
  Summary: Original rows: 70, Filtered rows: 70, Rows removed: 0
  Overwrote ./70QA_QUIZ_BANK_02.pkl with filtered data.
------------------------------
Processing: ./83QA_GCP_ESSENTIALS_QUIZ_BANK.pkl
  No questions matched the filter keywords.
  Summary: Original rows: 83, Filtered rows: 83, Rows removed: 0
  Overwrote ./83QA_GCP_ESSENTIALS_QUIZ_BANK.pkl with filtered data.
------------------------------
Processing complete. All 