In [9]:
import pandas as pd
import json
from pathlib import Path
import numpy as np
from typing import List, Dict, Tuple
import logging
from datetime import datetime
import os
from sklearn.model_selection import train_test_split

class DataPreprocessor:
    """Load question/answer CSV files and derive a training configuration.

    The data directory is expected to contain ``main_train.csv``,
    ``socratic_train.csv``, ``main_test.csv`` and ``socratic_test.csv``, each
    with at least a ``question`` and an ``answer`` column.
    """

    # Columns every input CSV must provide to be accepted.
    _REQUIRED_COLUMNS = ('question', 'answer')

    def __init__(self, data_dir: str = "./Data"):
        """Configure logging and check that the data directory exists.

        Args:
            data_dir: Directory holding the input CSV files.

        Raises:
            FileNotFoundError: If ``data_dir`` does not exist.
        """
        self.data_dir = Path(data_dir)
        # Logging is configured first so later failures can be recorded.
        self.setup_logging()

        if not self.data_dir.exists():
            raise FileNotFoundError(f"Data directory not found: {self.data_dir}")

    def setup_logging(self):
        """Send root-logger output to a timestamped file under ``logs/``.

        The ``logs`` directory is created relative to the current working
        directory when missing.
        """
        log_dir = Path("logs")
        log_dir.mkdir(exist_ok=True)

        # NOTE(review): per the logging docs, basicConfig does nothing when the
        # root logger already has handlers (e.g. a second instance in the same
        # process) - confirm this is acceptable for the callers.
        logging.basicConfig(
            filename=log_dir / f"math_qa_processor_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log",
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )

    def verify_files_exist(self, file_names: List[str]) -> bool:
        """Check that every named file is present in the data directory.

        Every missing file is logged (not just the first one) so that a single
        run reveals the complete list of absent inputs.

        Args:
            file_names: File names relative to ``self.data_dir``.

        Returns:
            True when all files exist, False otherwise.
        """
        missing_paths = [
            self.data_dir / file_name
            for file_name in file_names
            if not (self.data_dir / file_name).exists()
        ]
        for file_path in missing_paths:
            logging.error(f"Required file not found: {file_path}")
        return not missing_paths

    def _load_split(self, file_names: List[str], split_label: str) -> List[pd.DataFrame]:
        """Read and clean the CSV files of one split, skipping unusable files.

        Args:
            file_names: File names relative to ``self.data_dir``.
            split_label: Word used in the success log message
                ("training" or "test").

        Returns:
            One DataFrame per successfully processed file, in input order.
        """
        frames = []
        for file_name in file_names:
            # Deliberate best effort: a file that cannot be read or cleaned is
            # logged and skipped so that the remaining files are still used.
            try:
                df = pd.read_csv(self.data_dir / file_name)
                logging.info(f"Successfully loaded {split_label} file: {file_name}")

                if not all(col in df.columns for col in self._REQUIRED_COLUMNS):
                    logging.error(f"Missing required columns in {file_name}")
                    continue

                # Remove surrounding whitespace from both text columns.
                df['question'] = df['question'].str.strip()
                df['answer'] = df['answer'].str.strip()
                frames.append(df)

            except Exception as e:
                logging.error(f"Error processing {file_name}: {str(e)}")

        if frames and len(frames) < len(file_names):
            # Make partially loaded splits visible instead of silently shrinking.
            logging.warning(
                f"Only {len(frames)} of {len(file_names)} {split_label} files were usable"
            )
        return frames

    def load_and_process_data(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Load and process data, returning separate training and testing DataFrames.

        Returns:
            ``(train_data, test_data)``, each the row-wise concatenation of the
            successfully processed files of that split with a fresh index.

        Raises:
            FileNotFoundError: If any of the four expected files is missing.
            ValueError: If no training file or no test file could be processed.
        """
        train_files = ['main_train.csv', 'socratic_train.csv']
        test_files = ['main_test.csv', 'socratic_test.csv']

        # Verify all files exist before reading anything.
        if not self.verify_files_exist(train_files + test_files):
            raise FileNotFoundError("One or more required files are missing")

        train_dfs = self._load_split(train_files, "training")
        test_dfs = self._load_split(test_files, "test")

        if not train_dfs or not test_dfs:
            raise ValueError("No valid data files were processed")

        train_data = pd.concat(train_dfs, ignore_index=True)
        test_data = pd.concat(test_dfs, ignore_index=True)

        return train_data, test_data

    def create_training_config(self, train_data: pd.DataFrame, test_data: pd.DataFrame) -> Dict:
        """Create training configuration with statistics from both datasets.

        Question lengths are character counts. Missing questions are ignored
        when computing the length statistics, and a split without any usable
        question (empty or all-missing) no longer makes the computation fail;
        if no question exists at all, both length statistics are reported as 0.

        Args:
            train_data: Training rows; must have a ``question`` column.
            test_data: Test rows; must have a ``question`` column.

        Returns:
            A JSON-serialisable dict with fixed hyperparameters, the dataset
            statistics under ``data_statistics`` and the output file paths.
        """
        # Pool the lengths of both splits first: comparing per-split maxima
        # with the builtin max() yields NaN as soon as the first split has no
        # valid length, and int(NaN) raises ValueError.
        question_lengths = pd.concat(
            [train_data['question'].str.len(), test_data['question'].str.len()],
            ignore_index=True,
        ).dropna()
        has_lengths = not question_lengths.empty

        all_questions = pd.concat([train_data['question'], test_data['question']])

        data_stats = {
            "train_samples": int(len(train_data)),
            "test_samples": int(len(test_data)),
            "total_samples": int(len(train_data) + len(test_data)),
            "max_question_length": int(question_lengths.max()) if has_lengths else 0,
            "avg_question_length": float(question_lengths.mean()) if has_lengths else 0.0,
            "unique_questions": int(all_questions.nunique())
        }

        config = {
            "model_name": "bert-base-uncased",
            "batch_size": 32,
            "learning_rate": float(2e-5),
            "num_epochs": 3,
            # NOTE(review): this caps a character count at 128; presumably the
            # consumer treats it as a token limit - confirm.
            "max_sequence_length": int(min(128, data_stats['max_question_length'])),
            "data_statistics": data_stats,
            "files": {
                "train": str(Path("processed_data/train.csv")),
                "test": str(Path("processed_data/test.csv"))
            }
        }

        return config

def main():
    """Run the preprocessing pipeline end to end.

    Builds a ``DataPreprocessor``, loads the cleaned train/test data, writes
    ``train.csv``, ``test.csv`` and ``training_config.json`` into the
    ``processed_data`` directory and prints the sample counts. Any failure is
    logged, printed and then re-raised to the caller.
    """
    try:
        processor = DataPreprocessor()
        train_df, test_df = processor.load_and_process_data()

        # Everything produced by this run is written to a single folder.
        out_dir = Path("processed_data")
        out_dir.mkdir(exist_ok=True)

        # Persist both splits without the DataFrame index column.
        for frame, target_name in ((train_df, "train.csv"), (test_df, "test.csv")):
            frame.to_csv(out_dir / target_name, index=False)

        # Derive the configuration from the same frames that were just saved.
        training_config = processor.create_training_config(train_df, test_df)
        with open(out_dir / "training_config.json", 'w') as config_file:
            json.dump(training_config, config_file, indent=2)

        logging.info("Data processing completed successfully")
        print("Data processing completed successfully!")
        print(f"Processed {len(train_df)} training samples and {len(test_df)} test samples")

    except Exception as err:
        # Report the failure on both channels, then let it propagate.
        logging.error(f"Error in main process: {str(err)}")
        print(f"Error: {str(err)}")
        raise

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Data processing completed successfully!
Processed 14946 training samples and 2638 test samples
