In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from scipy.stats import uniform

In [2]:
# Load the dataset from the local CSV file.
# (Disabled remote alternative left by the author:
#  url = 'https://raw.githubusercontent.com/ZhiyunXu20/INT104_Coursework2/main/dataframe_PCA.csv')
dataframe = pd.read_csv('dataframe_PCA.csv')

# 'Label' is the classification target; the features are every remaining
# column except 'Patient index'.
y = dataframe.loc[:, 'Label']
X = dataframe.drop(['Label', 'Patient index'], axis=1)

# Hyper-parameter grid to search for each SVM kernel. The kernel list is
# derived from the grid so the two can never drift apart.
kernel_params = dict(
    linear=dict(C=[0.1, 1, 10, 100]),
    poly=dict(C=[0.1, 1, 10, 100], degree=[2, 3, 4], coef0=[0, 1]),
    rbf=dict(C=[0.1, 1, 10, 100], gamma=['scale', 'auto']),
    sigmoid=dict(C=[0.1, 1, 10, 100], coef0=[0, 1]),
)
kernels = list(kernel_params)

# Single hold-out configuration for this run; a wider sweep over
# test sizes [0.1, 0.2, 0.3] and seeds [21, 42, 84] is left disabled here.
random_state = 21
test_size = 0.1

# Accumulator for one result row per (kernel, search method).
results_SVM = []
print(f"Test size: {test_size}, Random state: {random_state}")

# Split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)



Test size: 0.1, Random state: 21


In [None]:
# Tune an SVC for every kernel with both an exhaustive grid search and a
# randomized search (5-fold CV, accuracy), then score each search's best
# estimator on the held-out test set and save all rows to 'results_SVM.csv'.
# Relies on kernels, kernel_params, test_size, random_state and the
# X_train / X_test / y_train / y_test split defined in an earlier cell.

# Start from an empty list so that re-running this cell does not append
# duplicate rows to the results table.
results_SVM = []

for kernel in kernels:
    print(f"  Kernel: {kernel}")
    svm = SVC(kernel=kernel)

    # Exhaustive search over the kernel-specific grid.
    grid_search = GridSearchCV(svm, kernel_params[kernel], cv=5, scoring='accuracy', n_jobs=4)
    grid_search.fit(X_train, y_train)

    # Report the best grid point and its mean cross-validation accuracy.
    print("  GridSearchCV Best parameters found:", grid_search.best_params_)
    print("  GridSearchCV Best score found:", grid_search.best_score_)

    # Randomized search: C is drawn uniformly from [0, 10]. gamma is drawn
    # uniformly from [0, 1] only for kernels that use it; SVC ignores gamma
    # for the linear kernel, so sampling it there would only record a
    # meaningless value under 'Best Parameters'.
    distribution = dict(C=uniform(loc=0, scale=10))
    if kernel != 'linear':
        distribution['gamma'] = uniform(loc=0, scale=1)
    # Seed the sampler so the drawn candidates, and therefore the reported
    # scores, are reproducible from one run to the next.
    randomized_search = RandomizedSearchCV(
        svm, distribution, n_iter=10, cv=5, scoring='accuracy', n_jobs=4,
        random_state=random_state,
    )
    randomized_search.fit(X_train, y_train)

    # Report the best sampled candidate and its mean cross-validation accuracy.
    print("  RandomizedSearchCV Best parameters found:", randomized_search.best_params_)
    print("  RandomizedSearchCV Best score found:", randomized_search.best_score_)

    # Evaluate the grid-search winner (already refit on the full training set)
    # on the held-out test set.
    best_svm_grid = grid_search.best_estimator_
    y_pred_grid = best_svm_grid.predict(X_test)
    accuracy_grid = accuracy_score(y_test, y_pred_grid)
    f1_grid = f1_score(y_test, y_pred_grid, average='weighted')

    result_SVM_grid = {
        'Test Size': test_size,
        'Random State': random_state,
        'Kernel': kernel,
        'Search Method': 'GridSearchCV',
        'Best Parameters': grid_search.best_params_,
        'Cross-Validation Score': grid_search.best_score_,
        'Accuracy': accuracy_grid,
        'F1 Score': f1_grid
    }
    results_SVM.append(result_SVM_grid)

    # Evaluate the randomized-search winner on the same held-out test set.
    best_svm_rand = randomized_search.best_estimator_
    y_pred_rand = best_svm_rand.predict(X_test)
    accuracy_rand = accuracy_score(y_test, y_pred_rand)
    f1_rand = f1_score(y_test, y_pred_rand, average='weighted')

    result_SVM_rand = {
        'Test Size': test_size,
        'Random State': random_state,
        'Kernel': kernel,
        'Search Method': 'RandomizedSearchCV',
        'Best Parameters': randomized_search.best_params_,
        'Cross-Validation Score': randomized_search.best_score_,
        'Accuracy': accuracy_rand,
        'F1 Score': f1_rand
    }
    results_SVM.append(result_SVM_rand)

# Collect all rows into a DataFrame and persist them as CSV.
results_SVM_df = pd.DataFrame(results_SVM)
results_SVM_df.to_csv('results_SVM.csv', index=False)

In [11]:
# Sweep over several test-set sizes and split seeds. For every split, tune an
# SVC per kernel with a grid search and a randomized search (5-fold CV,
# accuracy), score each winner on the held-out test set, and finally write
# one row per (split, kernel, search method) to 'results_SVM.csv'.

# Load the dataset from the local CSV file.
# (Disabled remote alternative left by the author:
#  url = 'https://raw.githubusercontent.com/ZhiyunXu20/INT104_Coursework2/main/dataframe_PCA.csv')
dataframe = pd.read_csv('dataframe_PCA.csv')

# 'Label' is the classification target; the features are every remaining
# column except 'Patient index'.
X = dataframe.drop(columns=['Label', 'Patient index'])
y = dataframe['Label']

# SVM kernels to compare and the hyper-parameter grid for each of them.
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
kernel_params = {
    'linear': {'C': [0.1, 1, 10, 100]},
    'poly': {'C': [0.1, 1, 10, 100], 'degree': [2, 3, 4], 'coef0': [0, 1]},
    'rbf': {'C': [0.1, 1, 10, 100], 'gamma': ['scale', 'auto']},
    'sigmoid': {'C': [0.1, 1, 10, 100], 'coef0': [0, 1]}
}

test_sizes = [0.1, 0.2, 0.3]
random_states = [21, 42, 84]

# Reset on every run so re-executing the cell does not duplicate rows.
results_SVM = []

# One train/test split per (test_size, random_state) combination.
for test_size in test_sizes:
    for random_state in random_states:
        print(f"Test size: {test_size}, Random state: {random_state}")

        # Split into training and test sets.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

        # Run both searches for every kernel on this split.
        for kernel in kernels:
            print(f"  Kernel: {kernel}")
            svm = SVC(kernel=kernel)

            # Exhaustive search over the kernel-specific grid.
            grid_search = GridSearchCV(svm, kernel_params[kernel], cv=5, scoring='accuracy', n_jobs=4)
            grid_search.fit(X_train, y_train)

            # Report the best grid point and its mean cross-validation accuracy.
            print("  GridSearchCV Best parameters found:", grid_search.best_params_)
            print("  GridSearchCV Best score found:", grid_search.best_score_)

            # Randomized search: C is drawn uniformly from [0, 10]. gamma is
            # drawn uniformly from [0, 1] only for kernels that use it; SVC
            # ignores gamma for the linear kernel, so sampling it there would
            # only record a meaningless value under 'Best Parameters'.
            distribution = dict(C=uniform(loc=0, scale=10))
            if kernel != 'linear':
                distribution['gamma'] = uniform(loc=0, scale=1)
            # Seed the sampler with the split seed so the drawn candidates,
            # and therefore the reported scores, are reproducible.
            randomized_search = RandomizedSearchCV(
                svm, distribution, n_iter=10, cv=5, scoring='accuracy', n_jobs=4,
                random_state=random_state,
            )
            randomized_search.fit(X_train, y_train)

            # Report the best sampled candidate and its mean CV accuracy.
            print("  RandomizedSearchCV Best parameters found:", randomized_search.best_params_)
            print("  RandomizedSearchCV Best score found:", randomized_search.best_score_)

            # Evaluate the grid-search winner (already refit on the full
            # training set) on the held-out test set.
            best_svm_grid = grid_search.best_estimator_
            y_pred_grid = best_svm_grid.predict(X_test)
            accuracy_grid = accuracy_score(y_test, y_pred_grid)
            f1_grid = f1_score(y_test, y_pred_grid, average='weighted')

            result_SVM_grid = {
                'Test Size': test_size,
                'Random State': random_state,
                'Kernel': kernel,
                'Search Method': 'GridSearchCV',
                'Best Parameters': grid_search.best_params_,
                'Cross-Validation Score': grid_search.best_score_,
                'Accuracy': accuracy_grid,
                'F1 Score': f1_grid
            }
            results_SVM.append(result_SVM_grid)

            # Evaluate the randomized-search winner on the same test set.
            best_svm_rand = randomized_search.best_estimator_
            y_pred_rand = best_svm_rand.predict(X_test)
            accuracy_rand = accuracy_score(y_test, y_pred_rand)
            f1_rand = f1_score(y_test, y_pred_rand, average='weighted')

            result_SVM_rand = {
                'Test Size': test_size,
                'Random State': random_state,
                'Kernel': kernel,
                'Search Method': 'RandomizedSearchCV',
                'Best Parameters': randomized_search.best_params_,
                'Cross-Validation Score': randomized_search.best_score_,
                'Accuracy': accuracy_rand,
                'F1 Score': f1_rand
            }
            results_SVM.append(result_SVM_rand)

# Collect all rows into a DataFrame and persist them as CSV.
results_SVM_df = pd.DataFrame(results_SVM)
results_SVM_df.to_csv('results_SVM.csv', index=False)


Test size: 0.1, Random state: 21
  Kernel: linear


KeyboardInterrupt: 

In [13]:
# Reload the data and hold out 5% of the rows for evaluation.
dataframe = pd.read_csv('dataframe_PCA.csv')
y = dataframe.loc[:, 'Label']
X = dataframe.drop(['Patient index', 'Label'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.05
)

# Fit an RBF-kernel SVM with C=10 on the training portion.
svm_clf = SVC(C=10, kernel='rbf', random_state=42)
svm_clf.fit(X_train, y_train)

# Predict the held-out rows and print the per-class classification report.
svm_predictions = svm_clf.predict(X_test)
print("SVM Classifier Results:")
print(classification_report(y_test, svm_predictions))


SVM Classifier Results:
              precision    recall  f1-score   support

           0       0.70      0.74      0.72       137
           1       0.71      0.66      0.69       130

    accuracy                           0.70       267
   macro avg       0.70      0.70      0.70       267
weighted avg       0.70      0.70      0.70       267

