In [38]:
from pathlib import Path

import pandas as pd
import spacy
from tqdm import tqdm
# Module-level spaCy pipeline handle loaded from the "en_core_web_sm" package.
# NOTE(review): TextPreprocessor.__init__ loads its own copy of the same
# pipeline, and none of the visible cells reference this global — confirm
# whether it is still needed.
nlp = spacy.load("en_core_web_sm")

In [46]:
#define the TextPreprocessor class
class TextPreprocessor:
    """
    Lemmatizes raw text with spaCy and optionally masks configured keywords.

    The two operations are independent: `mask_pollution` is a plain string
    substitution, and `preprocess_with_pipe` lemmatizes and filters tokens.
    `preprocess_with_pipe` does not call `mask_pollution`; callers that want
    masking must apply it themselves.
    """

    def __init__(self, pollution_keywords=None):
        """
        Initializes the text preprocessor with optional pollution keywords.
        Args:
            pollution_keywords (list of str): List of words to mask as pollution.
                None (or any empty value) means "mask nothing".
        """
        self.nlp = spacy.load("en_core_web_sm")
        self.pollution_keywords = pollution_keywords if pollution_keywords else []

    @staticmethod
    def _coerce_text(value):
        """
        Turns one input entry into a string that can be handed to spaCy.
        Args:
            value: A text string, None, a float NaN, or any other object.
        Returns:
            str: `value` unchanged if it is already a string, "" for None or
            float NaN, and `str(value)` for anything else.
        """
        if isinstance(value, str):
            return value
        # `value != value` is only true for NaN; this avoids importing math.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    def mask_pollution(self, text):
        """
        Masks explicit markers in the text based on pollution keywords.
        Args:
            text (str): Input text.
        Returns:
            str: Text with every occurrence of each keyword replaced by [MASK].
        """
        for keyword in self.pollution_keywords:
            # str.replace("", x) inserts x between every character, so an
            # empty keyword would corrupt the whole text; skip it.
            if not keyword:
                continue
            text = text.replace(keyword, "[MASK]")
        return text

    def preprocess_with_pipe(self, texts, batch_size=10, progress_every=50):
        """
        Preprocesses a list of texts using SpaCy's nlp.pipe for batch processing.

        Each text is reduced to the space-joined lemmas of its tokens, with
        stop words and punctuation tokens dropped.
        Args:
            texts (iterable of str): Texts to preprocess. Missing entries
                (None / NaN) are treated as empty strings instead of raising.
            batch_size (int): Number of texts to process in each batch.
            progress_every (int): Print a progress line after every this many
                rows. Pass 0 or None to disable progress output.
        Returns:
            list of str: Preprocessed texts, one per input, in input order.
        """
        # Lazily sanitise inputs so nlp.pipe only ever receives strings.
        safe_texts = (self._coerce_text(entry) for entry in texts)

        processed_texts = []
        docs = self.nlp.pipe(safe_texts, batch_size=batch_size)
        for row_count, doc in enumerate(docs, start=1):
            lemmas = [
                token.lemma_
                for token in doc
                if not (token.is_stop or token.is_punct)
            ]
            processed_texts.append(" ".join(lemmas))

            # Periodic progress report (interval controlled by progress_every).
            if progress_every and row_count % progress_every == 0:
                print(f"Processed {row_count} rows...")

        return processed_texts


In [47]:
# Load the raw dataset; the 'post' column holds the text to preprocess.
file_path = "lai-data/political_leaning.csv"
df = pd.read_csv(file_path)

preprocessor = TextPreprocessor(pollution_keywords=[])

# Empty CSV cells are read as NaN (a float), which spaCy cannot process.
# Replace them with empty strings so one missing post does not abort the run.
posts = df['post'].fillna("").astype(str).tolist()

batch_size = 64  # batch size for efficiency
df['preprocessed_post'] = preprocessor.preprocess_with_pipe(posts, batch_size=batch_size)

preprocessed_output_path = "lai-datasets/preprocessed_political_leaning.csv"
# DataFrame.to_csv does not create missing parent directories; create the
# output directory up front so the preprocessing result is not lost to an
# OSError at the very last step.
Path(preprocessed_output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(preprocessed_output_path, index=False)
print(f"Preprocessed data saved to {preprocessed_output_path}")


Processed 50 rows...
Processed 100 rows...
Processed 150 rows...
Processed 200 rows...
Processed 250 rows...
Processed 300 rows...
Processed 350 rows...
Processed 400 rows...
Processed 450 rows...
Processed 500 rows...
Processed 550 rows...
Processed 600 rows...
Processed 650 rows...
Processed 700 rows...
Processed 750 rows...
Processed 800 rows...
Processed 850 rows...
Processed 900 rows...
Processed 950 rows...
Processed 1000 rows...
Processed 1050 rows...
Processed 1100 rows...
Processed 1150 rows...
Processed 1200 rows...
Processed 1250 rows...
Processed 1300 rows...
Processed 1350 rows...
Processed 1400 rows...
Processed 1450 rows...
Processed 1500 rows...
Processed 1550 rows...
Processed 1600 rows...
Processed 1650 rows...
Processed 1700 rows...
Processed 1750 rows...
Processed 1800 rows...
Processed 1850 rows...
Processed 1900 rows...
Processed 1950 rows...
Processed 2000 rows...
Processed 2050 rows...
Processed 2100 rows...
Processed 2150 rows...
Processed 2200 rows...
Processe

In [48]:
# Preview the leading rows: each original post next to its preprocessed form
print(df.loc[:, ['post', 'preprocessed_post']].head(n=5))


                                                post  \
0  You can "buy" the show and stream it through t...   
1  me want to play Q*bert Holy shit, based Alex J...   
2  Shouldn't rely on any external services or per...   
3  PR to a specific person. Usually that just mea...   
4  This article's intention is clear that they wa...   

                                   preprocessed_post  
0  buy stream include Lethal Weapon 6](url episod...  
1  want play Q*bert Holy shit base Alex Jones bre...  
2  rely external service persistent datum test un...  
3  pr specific person usually mean round robin ap...  
4  article intention clear want imply causal rela...  
