# Soft Margin trong bài toán phân loại

In [2]:
import warnings
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

warnings.filterwarnings('ignore')
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import  KernelPCA
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from data_preprocessor import DataPreprocessor
import warnings
warnings.filterwarnings('ignore')

In [37]:
class softmargin_SVM:
    def __init__(self, data_path, target_column, n_components=2, test_size=0.2, random_state=42):
        self.data_path = data_path
        self.target_column = target_column
        self.n_components = n_components
        self.test_size = test_size
        self.random_state = random_state

        preprocessor = DataPreprocessor(self.data_path, self.target_column)
        self.X, self.y, self.df = preprocessor.preprocess()

    def _scale_and_impute(self, X):
        scaler = StandardScaler()
        imputer = SimpleImputer(strategy='mean')
        return imputer.fit_transform(scaler.fit_transform(X))

    def _evaluate_classifiers(self, X_train, X_test, y_train, y_test, classifiers):
        results = {}
        for clf in classifiers:
            clf_name = clf.__class__.__name__
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            results[clf_name] = {
                "accuracy": accuracy_score(y_test, y_pred),
                "classification_report": classification_report(y_test, y_pred),
                "matrix": confusion_matrix(y_test, y_pred)
            }
        return results

    def results_classifier_original(self, classifiers):
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size,
                                                            random_state=self.random_state)
        X_train_scaled, X_test_scaled = self._scale_and_impute(X_train), self._scale_and_impute(X_test)
        return self._evaluate_classifiers(X_train_scaled, X_test_scaled, y_train, y_test, classifiers)

    def apply_pca(self, classifiers):
        # Áp dụng PCA để giảm chiều
        from pca import PCA_Processor, print_results  
        pca_processor = PCA_Processor(
            data_path=self.data_path,
            target_column=self.target_column,
            n_components=self.n_components,
            test_size=self.test_size,
            random_state=self.random_state
        )
        pca_processor.n_components = 5

        results_pca_before = pca_processor.apply_pca_before_split(classifiers)
        print("\n\n===== Kết quả với dữ liệu sau khi áp dụng giảm chiều PCA =====")
        print_results(results_pca_before)

    def apply_umap(self, classifiers):
        # Áp dụng UMAP để giảm chiều
        from umap_function_2 import UMAP_Processor, print_results
        umap_processor = UMAP_Processor(
            data_path=self.data_path,
            target_column=self.target_column,
            n_components=self.n_components,
            test_size=self.test_size,
            random_state=self.random_state
        )
        results_after_umap = umap_processor.apply_optimal_n_components(classifiers)

        print("\n\n===== Kết quả với dữ liệu sau khi áp dụng giảm chiều UMAP =====")
        print_results(results_after_umap)

def print_results(results):
    for clf_name, metrics in results.items():
        print(f"Accuracy {clf_name:<25}{metrics['accuracy'] : <10.4f}")
        print(f"\nClassification Report for {clf_name}:")
        print(metrics['classification_report'])
        print(f"\nConfusion Matrix for {clf_name}:")
        print(metrics['matrix'])


In [None]:
classifiers = [
    SVC(kernel='linear', C=50),  
]

softmargin_svm = softmargin_SVM(
    data_path='../data/framingham.csv',  
    target_column='TenYearCHD',  
    n_components=2,  
    test_size=0.2,  
    random_state=42  
)

In [3]:
print("\n\n===== Kết quả với dữ liệu gốc =====")
results_original = softmargin_svm.results_classifier_original(classifiers)
print_results(results_original)



===== Kết quả với dữ liệu gốc =====
Accuracy SVC                      0.6491    

Classification Report for SVC:
              precision    recall  f1-score   support

           0       0.67      0.61      0.64       735
           1       0.63      0.69      0.66       704

    accuracy                           0.65      1439
   macro avg       0.65      0.65      0.65      1439
weighted avg       0.65      0.65      0.65      1439


Confusion Matrix for SVC:
[[451 284]
 [221 483]]


In [4]:
softmargin_svm.apply_pca(classifiers)



===== Kết quả với dữ liệu sau khi áp dụng giảm chiều PCA =====
Accuracy SVC            0.6567    

Classification Report for SVC:
              precision    recall  f1-score   support

           0       0.66      0.67      0.67       735
           1       0.65      0.64      0.65       704

    accuracy                           0.66      1439
   macro avg       0.66      0.66      0.66      1439
weighted avg       0.66      0.66      0.66      1439


Confusion Matrix for SVC:
[[492 243]
 [251 453]]


In [None]:
softmargin_svm.apply_umap(classifiers)

# SVM trong bài toán Hồi quy 

In [22]:
class RegressionFromSVM:
    def __init__(self, data_path, target_column, n_components=5, test_size=0.2, random_state=42):
        self.data_path = data_path
        self.target_column = target_column
        self.n_components = n_components
        self.test_size = test_size
        self.random_state = random_state

        preprocessor = DataPreprocessor(self.data_path, self.target_column)
        self.X, self.y = preprocessor.preprocess()

    def _scale_and_impute(self, X):
        scaler = StandardScaler()
        return scaler.fit_transform(X)

    def _get_decision_values(self):
        svm_model = SVC(kernel='linear', probability=True)
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size,
                                                            random_state=self.random_state)
        X_train_scaled = self._scale_and_impute(X_train)
        svm_model.fit(X_train_scaled, y_train)

        X_test_scaled = self._scale_and_impute(X_test)
        decision_values = svm_model.decision_function(X_test_scaled)
         
        return decision_values, y_test

    def evaluate_regression_on_decision_values(self, regressors):
        decision_values, y_test = self._get_decision_values()

        X_train, X_test, y_train, y_test = train_test_split(decision_values.reshape(-1, 1), y_test, 
                                                            test_size=self.test_size, random_state=self.random_state)

        results = []
        for reg in regressors:
            reg_name = reg.__class__.__name__
            reg.fit(X_train, y_train)
            y_pred = reg.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            results.append([reg_name, mse, r2])

        return pd.DataFrame(results, columns=["Model", "MSE", "R^2 Score"])

    def evaluate_regression_on_reduced_data(self, regressors):
        # Lấy giá trị hàm quyết định từ SVM
        decision_values, y_test = self._get_decision_values()

        # Giảm chiều dữ liệu bằng PCA
        pca = KernelPCA(n_components=5)
        decision_values_pca = pca.fit_transform(decision_values.reshape(-1, 1))

        # Chia dữ liệu đã giảm chiều thành tập huấn luyện và kiểm tra
        X_train, X_test, y_train, y_test = train_test_split(decision_values_pca, y_test, 
                                                            test_size=self.test_size, random_state=self.random_state)

        results = []
        for reg in regressors:
            reg_name = reg.__class__.__name__
            reg.fit(X_train, y_train)
            y_pred = reg.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            results.append([reg_name, mse, r2])

        return pd.DataFrame(results, columns=["Model", "MSE", "R^2 Score"])

if __name__ == "__main__":
    data_path = '../data/framingham.csv'
    target_column = 'TenYearCHD'
    regressors = [LinearRegression(), KNeighborsRegressor()]

    regression_model = RegressionFromSVM(data_path, target_column)
    
    results_decision_values = regression_model.evaluate_regression_on_decision_values(regressors)
    print("Kết quả hồi quy trên dữ liệu gốc:")
    print(results_decision_values)

    results_reduced_data = regression_model.evaluate_regression_on_reduced_data(regressors)
    print("\nKết quả hồi quy trên dữ liệu đã giảm chiều:")
    print (results_reduced_data)

Kết quả hồi quy trên dữ liệu gốc:
                 Model       MSE  R^2 Score
0     LinearRegression  0.214361   0.142158
1  KNeighborsRegressor  0.151466   0.393854

Kết quả hồi quy trên dữ liệu đã giảm chiều:
                 Model       MSE  R^2 Score
0     LinearRegression  0.204250   0.182960
1  KNeighborsRegressor  0.235139   0.059399
