In [2]:
import pandas as pd
import re
import logging
from pathlib import Path
import sys
import time

class TextPreprocessor:
    """Normalise a free-text value and mask sensitive tokens.

    The full pipeline is exposed through :meth:`clean_text`: lowercase and
    trim the input, replace sensitive substrings with placeholders, then
    strip punctuation other than ``. , ! ? -`` and collapse whitespace.
    """

    # Ordered (placeholder, compiled pattern) pairs, applied top to bottom.
    # Order matters: the prefixed account/transaction rules must run before
    # the bare 10-digit phone rule, otherwise "a/c 1234567890" is tagged as a
    # phone number and the "a/c" prefix is left behind in the text.
    _SENSITIVE_PATTERNS = (
        ('EMAIL', re.compile(r'\S+@\S+\.\S+', re.IGNORECASE)),
        ('ACCOUNT', re.compile(r'\b(?:acc|a/c)\s*#?\s*\d+\b', re.IGNORECASE)),
        ('TRANSACTION',
         re.compile(r'\b(?:tr|transaction|ref)\s*#?\s*\d+\b', re.IGNORECASE)),
        ('PHONE', re.compile(r'\b\d{10}\b', re.IGNORECASE)),
        ('URL', re.compile(r'http\S+|www\.\S+', re.IGNORECASE)),
    )

    # Anything that is not a word character, whitespace or one of . , ! ? -
    _DISALLOWED_CHARS = re.compile(r'[^\w\s.,!?-]')

    def __init__(self):
        pass

    def clean_text(self, text):
        """Return the cleaned form of ``text``.

        Args:
            text: The raw cell value. Anything that is not a ``str``
                (``None``, NaN, numbers, list-like values, ...) is treated
                as missing.

        Returns:
            The cleaned string, or ``''`` when the input is missing or is
            empty after trimming.

        Note:
            Placeholders are inserted as ``[PHONE]``, ``[EMAIL]`` and so on,
            but the special-character pass that follows replaces the square
            brackets with spaces, so the returned text carries the bare
            upper-case tokens (``PHONE``, ``EMAIL``, ...).
        """
        # A plain isinstance test already rejects NaN/None (neither is a str).
        # Calling pd.isna() on a list-like value returns an array whose truth
        # value is ambiguous and raises ValueError, so it is avoided here.
        if not isinstance(text, str):
            return ''

        text = text.lower().strip()
        if not text:
            return ''

        text = self._remove_sensitive_info(text)
        # _remove_special_chars already returns whitespace-normalised text.
        return self._remove_special_chars(text)

    def _remove_sensitive_info(self, text):
        """Replace sensitive substrings of ``text`` with ``[LABEL]`` markers.

        Matching is case-insensitive. Labels are EMAIL, ACCOUNT, TRANSACTION,
        PHONE and URL, applied in that order.
        """
        for label, pattern in self._SENSITIVE_PATTERNS:
            text = pattern.sub(f'[{label}]', text)
        return text

    def _remove_special_chars(self, text):
        """Replace disallowed characters with spaces and collapse whitespace.

        Word characters, whitespace and ``. , ! ? -`` are kept. The result
        has single spaces between tokens and no leading/trailing whitespace.
        """
        text = self._DISALLOWED_CHARS.sub(' ', text)
        return ' '.join(text.split())

class CyberCrimePreprocessor:
    """Dataset-level preprocessing: trim columns and clean the text field."""

    # Name of the free-text column that gets cleaned.
    # NOTE(review): spelling kept exactly as in the original code; presumably
    # it mirrors the header of the input CSVs - confirm before "fixing" it.
    TEXT_COLUMN = 'crimeaditionalinfo'

    def __init__(self):
        self.text_preprocessor = TextPreprocessor()

    def preprocess_dataset(self, df):
        """Return a cleaned copy of ``df``; the input frame is not modified.

        Args:
            df: Input ``pandas.DataFrame``.

        Returns:
            A new DataFrame holding only the first three columns of ``df``.
            If one of them is ``crimeaditionalinfo``, its missing values are
            replaced by ``''`` and every value is passed through
            ``self.text_preprocessor.clean_text``.
        """
        # Slice first, then copy: only the retained columns are duplicated,
        # and the frame mutated below is an explicit copy rather than the
        # result of a column selection (which can raise
        # SettingWithCopyWarning on column assignment).
        df = df.iloc[:, :3].copy()

        column = self.TEXT_COLUMN
        if column in df.columns:
            df[column] = (
                df[column]
                .fillna('')
                .apply(self.text_preprocessor.clean_text)
            )

        return df

if __name__ == "__main__":
    # The timer starts before anything else, so the duration reported at the
    # end covers logging setup, CSV loading, cleaning and saving.
    start_time = time.time()

    # Log to both a file in the working directory and standard output.
    log_handlers = [
        logging.FileHandler('preprocessing.log'),
        logging.StreamHandler(sys.stdout)
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=log_handlers
    )

    # Extra strings that read_csv should treat as missing values.
    missing_markers = ['nan', 'NaN', 'NAN', '']

    try:
        # Read the raw train/test splits.
        train = pd.read_csv('train.csv', encoding='utf-8', na_values=missing_markers)
        test = pd.read_csv('test.csv', encoding='utf-8', na_values=missing_markers)

        logging.info("Starting preprocessing pipeline...")
        preprocessor = CyberCrimePreprocessor()

        # Clean each split with the same preprocessor instance.
        train_clean = preprocessor.preprocess_dataset(train)
        test_clean = preprocessor.preprocess_dataset(test)

        # Write the cleaned frames next to the inputs, without the index.
        train_clean.to_csv('train_clean.csv', index=False, encoding='utf-8')
        test_clean.to_csv('test_clean.csv', index=False, encoding='utf-8')

        logging.info(f"Preprocessing completed in {time.time() - start_time:.2f} seconds")

    except Exception as e:
        # Top-level boundary: record the failure with its traceback, then
        # exit with a non-zero status.
        logging.exception(f"Error in preprocessing pipeline: {str(e)}")
        sys.exit(1)

2024-11-22 11:11:49,944 - INFO - Starting preprocessing pipeline...
2024-11-22 11:11:56,749 - INFO - Preprocessing completed in 7.26 seconds
