In [None]:
import os
import shutil
import pandas as pd
from imblearn.combine import SMOTETomek
from imblearn.ensemble import EasyEnsembleClassifier, RUSBoostClassifier
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.ensemble import RandomForestClassifier
import time
from multiprocessing import Pool
import concurrent.futures

# Base directory
base_dir = r"C:\Machine Learning\Phase 2"

# Define paths to deduplicated datasets
dedup_approaches = ["Direct_Removal", "Instance_Weighting", "Train_Test_Aware"]
outlier_techniques = ["Direct_Removal", "Isolation_Forest", "Log1p_Winsorization", "Winsorization", "Z-Score_Trimming"]
scaling_techniques = ["MinMaxScaler", "PowerTransformer", "QuantileTransformer", "RobustScaler", "StandardScaler"]
balancing_techniques = ["SMOTETomek", "ClassWeights", "EasyEnsemble", "RUSBoost", "ThresholdTuning"]

# Create a cache directory for checkpoints
cache_dir = os.path.join(base_dir, "cache")
os.makedirs(cache_dir, exist_ok=True)

class ThresholdTuningClassifier(BaseEstimator, ClassifierMixin):
    """A simple wrapper class for threshold tuning"""
    def __init__(self, base_estimator=None, threshold=0.5):
        self.threshold = threshold
        self.base_estimator = base_estimator or RandomForestClassifier(random_state=42)
        self.classes_ = np.array([0, 1])
        
    def fit(self, X, y):
        X, y = check_X_y(X, y)
        self.base_estimator.fit(X, y)
        self.X_ = X
        self.y_ = y
        return self
        
    def predict(self, X):
        check_is_fitted(self)
        X = check_array(X)
        probs = self.base_estimator.predict_proba(X)[:, 1]
        return np.where(probs > self.threshold, 1, 0)

def apply_balancing_technique(args):
    """Apply a single balancing technique to a specific dataset"""
    dedup_approach, balancing_technique, x_train, y_train, target_dir, x_train_filename = args
    
    start_time = time.time()
    balanced_y_file = f"phase2_{dedup_approach}_y_train_{balancing_technique}.csv"
    
    # Check if work is already done for this specific combination
    checkpoint_file = os.path.join(cache_dir, f"checkpoint_{dedup_approach}_{balancing_technique}_{os.path.basename(target_dir)}.txt")
    if os.path.exists(checkpoint_file):
        print(f"Skipping {balancing_technique} for {target_dir}, already processed")
        return
    
    # Apply specific balancing technique
    try:
        if balancing_technique == "SMOTETomek":
            smote_tomek = SMOTETomek(random_state=42, n_jobs=-1)  # Use all available cores
            _, y_resampled = smote_tomek.fit_resample(x_train, y_train.iloc[:, 0])
            balanced_y = pd.DataFrame(y_resampled)
        
        elif balancing_technique == "ClassWeights":
            # For class weights, we don't actually change y_train
            # but create a weights column that can be used during training
            class_counts = y_train.iloc[:, 0].value_counts()
            n_samples = len(y_train)
            weights = {cls: n_samples / (len(class_counts) * count) for cls, count in class_counts.items()}
            y_weights = y_train.iloc[:, 0].map(weights)
            balanced_y = pd.concat([y_train, pd.DataFrame({'weight': y_weights})], axis=1)
        
        elif balancing_technique == "EasyEnsemble":
            ensemble = EasyEnsembleClassifier(random_state=42, n_estimators=10)
            ensemble.fit(x_train, y_train.iloc[:, 0])
            # For demonstration, just use the original y_train
            balanced_y = y_train
        
        elif balancing_technique == "RUSBoost":
            rusboost = RUSBoostClassifier(random_state=42, n_estimators=50)
            rusboost.fit(x_train, y_train.iloc[:, 0])
            # For demonstration, just use the original y_train
            balanced_y = y_train
        
        elif balancing_technique == "ThresholdTuning":
            # For threshold tuning, we don't actually change y_train
            threshold_tuner = ThresholdTuningClassifier(threshold=0.3)
            threshold_tuner.fit(x_train, y_train.iloc[:, 0])
            balanced_y = y_train
        
        # Save directly to the target directory
        output_path = os.path.join(target_dir, balanced_y_file)
        balanced_y.to_csv(output_path, index=False)
        
        # Create checkpoint file to mark as complete
        with open(checkpoint_file, 'w') as f:
            f.write(f"completed at {time.strftime('%Y-%m-%d %H:%M:%S')}")
        
        elapsed = time.time() - start_time
        print(f"{balancing_technique} for {os.path.basename(target_dir)} took {elapsed:.2f} seconds")
        
        return output_path
    except Exception as e:
        print(f"Error processing {balancing_technique} for {target_dir}: {str(e)}")
        return None

def process_dedup_approach(dedup_approach):
    """Process all balancing techniques for a single deduplication approach"""
    try:
        # First, load the original y_train
        y_train_path = os.path.join(
            base_dir, 
            "Data", 
            "deduplicated_datasets", 
            dedup_approach, 
            f"phase2_{dedup_approach}_y_train.csv"
        )
        
        if not os.path.exists(y_train_path):
            print(f"Warning: {y_train_path} does not exist. Skipping...")
            return
        
        y_train = pd.read_csv(y_train_path)
        
        # Create a list to collect all tasks
        all_tasks = []
        
        # Process each outlier technique
        for outlier_technique in outlier_techniques:
            # Process feature added datasets
            feature_added_path = os.path.join(
                base_dir, 
                "Data", 
                "deduplicated_datasets", 
                dedup_approach, 
                "outlier_handled_datasets", 
                outlier_technique, 
                "feature_added_datasets"
            )
            
            if os.path.exists(feature_added_path):
                # Use the X_train file from the feature_added_datasets folder
                x_train_feature_added = f"phase2_{dedup_approach}_X_train_{outlier_technique}_feature_added.csv"
                x_train_feature_path = os.path.join(feature_added_path, x_train_feature_added)
                
                if os.path.exists(x_train_feature_path):
                    x_train_feature = pd.read_csv(x_train_feature_path)
                    
                    # Add balancing tasks for feature added datasets
                    for balancing_technique in balancing_techniques:
                        all_tasks.append((
                            dedup_approach, 
                            balancing_technique, 
                            x_train_feature, 
                            y_train, 
                            feature_added_path,
                            x_train_feature_added
                        ))
            
            # Process each scaling technique
            scaled_datasets_path = os.path.join(
                base_dir, 
                "Data", 
                "deduplicated_datasets", 
                dedup_approach, 
                "outlier_handled_datasets", 
                outlier_technique, 
                "scaled_datasets"
            )
            
            for scaling_technique in scaling_techniques:
                scale_folder = os.path.join(scaled_datasets_path, scaling_technique)
                
                if os.path.exists(scale_folder):
                    # Use the X_train file specific to this scaling technique
                    x_train_scaled = f"phase2_{dedup_approach}_X_train_{outlier_technique}_{scaling_technique}.csv"
                    x_train_scaled_path = os.path.join(scale_folder, x_train_scaled)
                    
                    if os.path.exists(x_train_scaled_path):
                        x_train_scaled_data = pd.read_csv(x_train_scaled_path)
                        
                        # Add balancing tasks for scaled datasets
                        for balancing_technique in balancing_techniques:
                            all_tasks.append((
                                dedup_approach, 
                                balancing_technique, 
                                x_train_scaled_data, 
                                y_train, 
                                scale_folder,
                                x_train_scaled
                            ))
        
        # Process all tasks in parallel
        with concurrent.futures.ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
            results = list(executor.map(apply_balancing_technique, all_tasks))
        
        print(f"Completed processing for {dedup_approach}")
        
    except Exception as e:
        print(f"Error processing dedup approach {dedup_approach}: {str(e)}")

def apply_balancing_techniques_parallel():
    """Apply balancing techniques to y_train files with corresponding X_train files in parallel"""
    print("Starting balancing techniques application in parallel...")
    
    # Create overall checkpoint file to track progress
    overall_checkpoint = os.path.join(cache_dir, "overall_progress.txt")
    
    # Check if there's a previous run that was interrupted
    completed_approaches = set()
    if os.path.exists(overall_checkpoint):
        with open(overall_checkpoint, 'r') as f:
            completed_approaches = set(line.strip() for line in f.readlines())
    
    # Process each deduplication approach in parallel
    with Pool(processes=min(os.cpu_count(), len(dedup_approaches))) as pool:
        remaining_approaches = [da for da in dedup_approaches if da not in completed_approaches]
        pool.map(process_dedup_approach, remaining_approaches)
    
    # Update overall checkpoint file
    with open(overall_checkpoint, 'w') as f:
        for approach in dedup_approaches:
            f.write(f"{approach}\n")
    
    print("All balancing techniques applied!")

def main():
    start_time = time.time()
    
    # Apply balancing techniques in parallel
    apply_balancing_techniques_parallel()
    
    total_time = time.time() - start_time
    print(f"\nBalancing techniques applied and files distributed! Total time: {total_time:.2f} seconds")

if __name__ == "__main__":
    main()

Starting balancing techniques application in parallel...


In [1]:
import os
import platform
import psutil
import time
import pandas as pd
import numpy as np
from imblearn.combine import SMOTETomek
from sklearn.ensemble import RandomForestClassifier
import multiprocessing
import tempfile
import shutil

def get_system_info():
    """Get detailed system information"""
    print("=" * 50)
    print("SYSTEM INFORMATION")
    print("=" * 50)
    
    # CPU information
    print(f"CPU: {platform.processor()}")
    print(f"CPU Cores: {multiprocessing.cpu_count()} physical cores")
    print(f"CPU Logical Processors: {psutil.cpu_count(logical=True)} logical processors")
    print(f"CPU Frequency: {psutil.cpu_freq().current:.2f} MHz")
    
    # Memory information
    mem = psutil.virtual_memory()
    print(f"Total Memory: {mem.total / (1024**3):.2f} GB")
    print(f"Available Memory: {mem.available / (1024**3):.2f} GB")
    
    # Disk information
    disk = psutil.disk_usage('/')
    print(f"Disk Total: {disk.total / (1024**3):.2f} GB")
    print(f"Disk Free: {disk.free / (1024**3):.2f} GB")
    
    # Check if running on SSD or HDD
    print("\nDisk I/O Speed Test:")
    test_disk_speed()
    
    print("=" * 50)

def test_disk_speed(file_size_mb=100):
    """Test disk read/write speed"""
    # Create a temporary directory
    temp_dir = tempfile.mkdtemp()
    test_file = os.path.join(temp_dir, "test_file.bin")
    
    try:
        # Write test
        data = b"0" * (file_size_mb * 1024 * 1024)  # Create file_size_mb of data
        start_time = time.time()
        with open(test_file, 'wb') as f:
            f.write(data)
        write_time = time.time() - start_time
        write_speed = file_size_mb / write_time
        
        # Read test
        start_time = time.time()
        with open(test_file, 'rb') as f:
            data = f.read()
        read_time = time.time() - start_time
        read_speed = file_size_mb / read_time
        
        print(f"Write Speed: {write_speed:.2f} MB/s")
        print(f"Read Speed: {read_speed:.2f} MB/s")
        
        # Classify disk type based on read speed
        if read_speed > 200:
            print("Storage Type: Likely SSD (Fast)")
        elif read_speed > 80:
            print("Storage Type: Likely Fast HDD or Hybrid")
        else:
            print("Storage Type: Likely HDD (Slower)")
            
    finally:
        # Clean up
        shutil.rmtree(temp_dir)

def benchmark_smote_tomek(x_size, y_size, n_features=40):
    """Benchmark SMOTETomek performance on sample data"""
    print("\n" + "=" * 50)
    print("BALANCING PERFORMANCE BENCHMARK")
    print("=" * 50)
    
    # Create synthetic data with similar dimensions
    print(f"Creating synthetic data with shape X: [{x_size}, {n_features}], y: [{y_size}, 1]")
    X = np.random.random((x_size, n_features))
    y = np.random.choice([0, 1], size=y_size, p=[0.7, 0.3])  # Create imbalanced data
    
    # Sample subset for quick benchmark
    sample_size = min(10000, x_size)
    X_sample = X[:sample_size]
    y_sample = y[:sample_size]
    
    # Test SMOTETomek on sample
    print(f"Running SMOTETomek on {sample_size} samples...")
    start_time = time.time()
    smote_tomek = SMOTETomek(random_state=42, n_jobs=-1)
    X_res, y_res = smote_tomek.fit_resample(X_sample, y_sample)
    sample_time = time.time() - start_time
    
    # Estimate full runtime
    estimated_full_time = sample_time * (x_size / sample_size)**2  # SMOTETomek scales roughly quadratically
    
    print(f"Sample SMOTETomek completed in {sample_time:.2f} seconds")
    print(f"Estimated SMOTETomek time for full dataset: {estimated_full_time:.2f} seconds ({estimated_full_time/60:.2f} minutes)")
    
    # Test file I/O with pandas
    print("\nTesting pandas DataFrame I/O speed...")
    df = pd.DataFrame(X_sample)
    
    # Test write speed
    start_time = time.time()
    with tempfile.NamedTemporaryFile(suffix='.csv', delete=False) as tmp:
        tmp_name = tmp.name
    df.to_csv(tmp_name, index=False)
    write_time = time.time() - start_time
    
    # Test read speed
    start_time = time.time()
    pd.read_csv(tmp_name)
    read_time = time.time() - start_time
    
    # Clean up
    os.remove(tmp_name)
    
    print(f"Pandas CSV write time for {sample_size} rows: {write_time:.2f} seconds")
    print(f"Pandas CSV read time for {sample_size} rows: {read_time:.2f} seconds")
    
    # Estimate total data processing time
    total_operations = 3 * 5 * 5 * 5  # dedup_approaches * outlier_techniques * scaling_techniques * balancing_techniques
    estimated_io_time = (write_time + read_time) * (x_size / sample_size) * total_operations
    
    print(f"Estimated I/O time for all operations: {estimated_io_time:.2f} seconds ({estimated_io_time/60:.2f} minutes)")
    
    # Total estimated runtime
    print("\nESTIMATED TOTAL RUNTIME:")
    serial_runtime = estimated_full_time * total_operations / 5 + estimated_io_time  # Divide by 5 because not all balancing techniques are as slow as SMOTETomek
    parallel_runtime = serial_runtime / psutil.cpu_count(logical=True) * 1.2  # Add 20% overhead for parallel coordination
    
    print(f"Without parallelization: {serial_runtime/3600:.2f} hours")
    print(f"With parallelization ({psutil.cpu_count(logical=True)} cores): {parallel_runtime/3600:.2f} hours")
    print("=" * 50)

def main():
    # Get system information
    get_system_info()
    
    # Run benchmark with your data dimensions
    benchmark_smote_tomek(626043, 671698, 40)

if __name__ == "__main__":
    main()

SYSTEM INFORMATION
CPU: Intel64 Family 6 Model 154 Stepping 4, GenuineIntel
CPU Cores: 12 physical cores
CPU Logical Processors: 12 logical processors
CPU Frequency: 1300.00 MHz
Total Memory: 7.68 GB
Available Memory: 0.32 GB
Disk Total: 476.34 GB
Disk Free: 255.17 GB

Disk I/O Speed Test:
Write Speed: 321.63 MB/s
Read Speed: 305.06 MB/s
Storage Type: Likely SSD (Fast)

BALANCING PERFORMANCE BENCHMARK
Creating synthetic data with shape X: [626043, 40], y: [671698, 1]
Running SMOTETomek on 10000 samples...
Sample SMOTETomek completed in 0.91 seconds
Estimated SMOTETomek time for full dataset: 3555.30 seconds (59.26 minutes)

Testing pandas DataFrame I/O speed...
Pandas CSV write time for 10000 rows: 0.88 seconds
Pandas CSV read time for 10000 rows: 0.19 seconds
Estimated I/O time for all operations: 25142.35 seconds (419.04 minutes)

ESTIMATED TOTAL RUNTIME:
Without parallelization: 81.05 hours
With parallelization (12 cores): 8.11 hours


In [2]:
pip install imbalanced-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Downloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn
Successfully installed imbalanced-learn-0.13.0 sklearn-compat-0.1.3
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: C:\Users\HP\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
