In [2]:
import pandas as pd
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import itertools
import joblib
import os

# Root of the project tree; every per-size path below lives under it.
_OCC_ROOT = '/home/ctai42@tntech.edu/OCC'

# Training-set size labels, in the order they are processed.
_SIZE_KEYS = ('1k', '2k', '3k', '4k', '5k', '10k', '15k', '20k')

# Maps each size label to its training CSV, the directory that receives the
# fitted models, and the CSV that receives that size's result table.
SIZES = {
    size: {
        'train_path': f'{_OCC_ROOT}/Model_training_and_inference_code/new_occ_training_data_{size}_mc_15.csv',
        'models_dir': f'{_OCC_ROOT}/Chao_new_pretrained_mc_15_{size}',
        'results_file': f'{_OCC_ROOT}/Chao_results_mc15/occ_hyperparameter_results_{size}.csv',
    }
    for size in _SIZE_KEYS
}

# Single test CSV shared by every training-set size.
TEST_FILE = "/home/ctai42@tntech.edu/OCC/Model_training_and_inference_code/new_occ_testing_data_mc_15.csv"
# Directory that receives one fitted scaler per training-set size.
SCALER_BASE_PATH = "/home/ctai42@tntech.edu/OCC/Pre_Trained_Model/scaler"

# Hyperparameter grid: the two lists below are the OneClassSVM `nu` and
# `gamma` candidates.
nu_values = [0.01, 0.05, 0.1, 0.2]
gamma_values = ['scale', 'auto', 0.01, 0.1, 1.0]

# Columns that are stripped before the remaining columns are used as features.
_NON_FEATURE_COLUMNS = ["image_num", "actual_class", "noise_level"]

# Load the test set once at module level so it is read from disk a single time.
test_data = pd.read_csv(TEST_FILE)
test_features = test_data.drop(_NON_FEATURE_COLUMNS, axis=1)

def train_size_model(size_key: str, paths: dict) -> pd.DataFrame:
    """Train and score a grid of One-Class SVMs for one training-set size.

    The training CSV is loaded, standardized with a freshly fitted
    ``StandardScaler``, and used to fit one RBF ``OneClassSVM`` per
    ``(nu, gamma)`` pair from the module-level ``nu_values`` and
    ``gamma_values``. Each model predicts on the module-level
    ``test_features``, transformed with the same scaler.

    Side effects:
        * the scaler is saved to
          ``SCALER_BASE_PATH/scaler_mc15_<size_key>.joblib``;
        * every fitted model is saved under ``paths['models_dir']``;
        * the result table is written to ``paths['results_file']``.
        Missing directories are created as needed.

    Args:
        size_key: Label of the training-set size (e.g. ``'1k'``); used in
            log output, the scaler file name and the ``size`` column.
        paths: Mapping with the keys ``'train_path'``, ``'models_dir'`` and
            ``'results_file'``.

    Returns:
        A DataFrame with one row per ``(nu, gamma)`` pair and the columns
        ``size``, ``nu``, ``gamma``, ``accuracy``, ``pred_flagged``,
        ``pred_normal`` and ``model_file``.

    NOTE(review): ``accuracy`` is the fraction of test rows predicted as
    outliers (-1). It equals classification accuracy only if every test row
    is a true anomaly; ``actual_class`` is dropped and never consulted --
    confirm that this matches the test set's composition.
    """
    print(f"\nProcessing {size_key} dataset...")

    # Load training data and strip the non-feature columns.
    train_data = pd.read_csv(paths['train_path'])
    train_features = train_data.drop(columns=["image_num", "actual_class", "noise_level"])

    # Fit the scaler on the training features only, then reuse it for the
    # test features so both are standardized with the same statistics.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_features)
    X_test = scaler.transform(test_features)

    # Persist the scaler alongside the models.
    scaler_file = os.path.join(SCALER_BASE_PATH, f"scaler_mc15_{size_key}.joblib")
    os.makedirs(os.path.dirname(scaler_file), exist_ok=True)
    joblib.dump(scaler, scaler_file)

    os.makedirs(paths['models_dir'], exist_ok=True)

    # Train one model per hyperparameter combination.
    results = []
    for nu, gamma in itertools.product(nu_values, gamma_values):
        print(f"Training OCC SVM with nu={nu}, gamma={gamma}")
        oc_svm = OneClassSVM(kernel="rbf", nu=nu, gamma=gamma)
        oc_svm.fit(X_train)

        # Save the fitted model to models_dir.
        model_file = os.path.join(paths['models_dir'], f"oc_svm_nu{nu}_gamma{gamma}.joblib")
        joblib.dump(oc_svm, model_file)

        # predict() yields -1 for outliers and +1 for inliers. Count the
        # outliers with one vectorized comparison; everything else is
        # "normal". int() keeps the stored values plain Python ints.
        y_pred = oc_svm.predict(X_test)
        n_samples = len(y_pred)
        true_flagged = int((y_pred == -1).sum())
        false_normal = n_samples - true_flagged
        accuracy = true_flagged / n_samples

        print(f"Accuracy: {accuracy}, True Flagged: {true_flagged}, False Normal: {false_normal}")

        results.append({
            "size": size_key,
            "nu": nu,
            "gamma": gamma,
            "accuracy": accuracy,
            "pred_flagged": true_flagged,
            "pred_normal": false_normal,
            "model_file": model_file
        })

    results_df = pd.DataFrame(results)
    os.makedirs(os.path.dirname(paths['results_file']), exist_ok=True)
    results_df.to_csv(paths['results_file'], index=False)

    return results_df

# Train every configured size, keeping one result table per size.
all_results = [train_size_model(key, cfg) for key, cfg in SIZES.items()]

# Stack the per-size tables and write them out as a single CSV.
combined_results_file = "/home/ctai42@tntech.edu/OCC/Chao_results_mc15/combined_hyperparameter_results.csv"
combined_results = pd.concat(all_results)
combined_results.to_csv(combined_results_file, index=False)


Processing 1k dataset...
Training OCC SVM with nu=0.01, gamma=scale
Accuracy: 0.112, True Flagged: 112, False Normal: 888
Training OCC SVM with nu=0.01, gamma=auto
Accuracy: 0.112, True Flagged: 112, False Normal: 888
Training OCC SVM with nu=0.01, gamma=0.01
Accuracy: 0.03, True Flagged: 30, False Normal: 970
Training OCC SVM with nu=0.01, gamma=0.1
Accuracy: 0.394, True Flagged: 394, False Normal: 606
Training OCC SVM with nu=0.01, gamma=1.0
Accuracy: 0.888, True Flagged: 888, False Normal: 112
Training OCC SVM with nu=0.05, gamma=scale
Accuracy: 0.127, True Flagged: 127, False Normal: 873
Training OCC SVM with nu=0.05, gamma=auto
Accuracy: 0.127, True Flagged: 127, False Normal: 873
Training OCC SVM with nu=0.05, gamma=0.01
Accuracy: 0.092, True Flagged: 92, False Normal: 908
Training OCC SVM with nu=0.05, gamma=0.1
Accuracy: 0.396, True Flagged: 396, False Normal: 604
Training OCC SVM with nu=0.05, gamma=1.0
Accuracy: 0.889, True Flagged: 889, False Normal: 111
Training OCC SVM wi