In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

# 读取数据
train = pd.read_csv('train.csv', on_bad_lines='skip')
test = pd.read_csv('test.csv', on_bad_lines='skip')

# 移除 id 列
id = test['id']
train.drop(columns=['id'], inplace=True)
test.drop(columns=['id'], inplace=True)

# 修改 CALC 列
test.loc[test['CALC']=='Always', 'CALC'] = 'Frequently'

# 标签编码目标变量
le = LabelEncoder()
train['NObeyesdad'] = le.fit_transform(train['NObeyesdad'])

# 分离特征和目标变量
X = train.drop(columns=['NObeyesdad'])
y = train['NObeyesdad']

# 划分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 定义数值列
num_cols = [col for col in X.columns if X[col].dtype == 'float']

# 定义预处理步骤
preprocessor = ColumnTransformer(transformers=[
    ('tnf1', OneHotEncoder(sparse_output=False, drop='first', dtype='int'), ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC', 'MTRANS']),
    ('tnf2', OrdinalEncoder(dtype='int'), ['CAEC', 'CALC']),
    ('tnf3', StandardScaler(), num_cols)
], remainder='passthrough')

# 应用预处理
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)
X_test_preprocessed = preprocessor.transform(test)

In [6]:
'''import numpy as np
import random
import matplotlib.pyplot as plt
import re
import time

class SVMModel(object):
    """
    SVM model
    """
    def __init__(self, max_iter=10000, kernel_type='linear', C=1.0, epsilon=0.00001,gamma=0.01):
        self.max_iter = max_iter
        self.kernel_type = kernel_type
        self.kernel_func_list = {
            'linear': self._kernel_linear,
            'quadratic': self._kernel_quadratic,
            'rbf': self._kernel_rbf
        }
        self.kernel_func = self.kernel_func_list[kernel_type]
        self.C = C
        self.epsilon = epsilon
        self.gamma = gamma
        self.alpha = None
        self.X_train = None
        self.Y_train = None
        self.b = 0

    def fit(self, X_train, Y_train):
        """
        Training model
        :param X_train: shape = num_train, dim_feature
        :param Y_train: shape = num_train, 1
        :return: loss_history
        """
        n, d = X_train.shape[0], X_train.shape[1]
        self.alpha = np.zeros(n)
        self.X_train = X_train
        self.Y_train = Y_train
        # Iteration
        for i in range(self.max_iter):
            diff = self._iteration(X_train, Y_train)
            if i % 100 == 0:
                print('Iter %r / %r, Diff %r' % (i, self.max_iter, diff))
            if diff < self.epsilon:
                break

    #def predict_raw(self, X):
        #return np.dot(self.w.T, X.T) + self.b

    #def predict(self, X):
        #return np.sign(np.dot(self.w.T, X.T) + self.b).astype(int)

    def predict_raw(self, X):
        result = 0
        for j in range(self.X_train.shape[0]):
            result += self.alpha[j] * self.Y_train[j] * self.kernel_func(self.X_train[j], X)
        result += self.b
        return result

    def predict(self, X):
        return np.sign(self.predict_raw(X)).astype(int)

    def _iteration(self, X_train, Y_train):
        alpha = self.alpha
        alpha_prev = np.copy(alpha)
        n = alpha.shape[0]
        for j in range(n):
            # Find i not equal to j randomly
            i = j
            for _ in range(1000):
                if i != j:
                    break
                i = random.randint(0, n - 1)
            # 添加调试信息
            print(f'i: {i}, j: {j}, n: {n}')
            x_i, x_j, y_i, y_j = X_train[i, :], X_train[j, :], Y_train[i], Y_train[j]
            # Define the similarity of instances. K11 + K22 - 2K12
            k_ij = self.kernel_func(x_i, x_i) + self.kernel_func(x_j, x_j) - 2 * self.kernel_func(x_i, x_j)
            if k_ij == 0:
                continue
            a_i, a_j = alpha[i], alpha[j]
            # Calculate the boundary of alpha
            L, H = self._cal_L_H(self.C, a_j, a_i, y_j, y_i)
            # Calculate model parameters
            self.w = np.dot(X_train.T, np.multiply(alpha, Y_train))
            self.b = np.mean(Y_train - np.dot(self.w.T, X_train.T))
            # Iterate alpha_j and alpha_i according to 'Delta W(a_j)'
            E_i = self.predict(x_i) - y_i
            E_j = self.predict(x_j) - y_j
            alpha[j] = a_j + (y_j * (E_i - E_j) * 1.0) / k_ij
            alpha[j] = min(H, max(L, alpha[j]))
            alpha[i] = a_i + y_i * y_j * (a_j - alpha[j])
        diff = np.linalg.norm(alpha - alpha_prev)
        return diff


    def _kernel_linear(self, x1, x2):
        return np.dot(x1, x2.T)

    def _kernel_quadratic(self, x1, x2):
        return np.dot(x1, x2.T) ** 2

    def _kernel_rbf(self, x1, x2):
        return np.exp(-self.gamma * np.linalg.norm(x1 - x2) ** 2)

    def _cal_L_H(self, C, a_j, a_i, y_j, y_i):
        if y_i != y_j:
            L = max(0, a_j - a_i)
            H = min(C, C - a_i + a_j)
        else:
            L = max(0, a_i + a_j - C)
            H = min(C, a_i + a_j)
        return L, H'''


'import numpy as np\nimport random\nimport matplotlib.pyplot as plt\nimport re\nimport time\n\nclass SVMModel(object):\n    """\n    SVM model\n    """\n    def __init__(self, max_iter=10000, kernel_type=\'linear\', C=1.0, epsilon=0.00001,gamma=0.01):\n        self.max_iter = max_iter\n        self.kernel_type = kernel_type\n        self.kernel_func_list = {\n            \'linear\': self._kernel_linear,\n            \'quadratic\': self._kernel_quadratic,\n            \'rbf\': self._kernel_rbf\n        }\n        self.kernel_func = self.kernel_func_list[kernel_type]\n        self.C = C\n        self.epsilon = epsilon\n        self.gamma = gamma\n        self.alpha = None\n        self.X_train = None\n        self.Y_train = None\n        self.b = 0\n\n    def fit(self, X_train, Y_train):\n        """\n        Training model\n        :param X_train: shape = num_train, dim_feature\n        :param Y_train: shape = num_train, 1\n        :return: loss_history\n        """\n        n, d = X_tr

In [7]:
import numpy as np
import random
import matplotlib.pyplot as plt
import re
import time

class SVMModel(object):
    """
    SVM model using SMO algorithm
    """
    def __init__(self, max_iter=10000, kernel_type='linear', C=1.0, epsilon=0.00001, gamma=0.01):
        self.max_iter = max_iter
        self.kernel_type = kernel_type
        self.kernel_func_list = {
            'linear': self._kernel_linear,
            'quadratic': self._kernel_quadratic,
            'rbf': self._kernel_rbf
        }
        self.kernel_func = self.kernel_func_list[kernel_type]
        self.C = C
        self.epsilon = epsilon
        self.gamma = gamma
        self.alpha = None
        self.X_train = None
        self.Y_train = None
        self.b = 0
        self.E = None

    def fit(self, X_train, Y_train):
        """
        Training model using SMO algorithm
        :param X_train: shape = num_train, dim_feature
        :param Y_train: shape = num_train, 1
        :return: None
        """
        n, d = X_train.shape
        self.alpha = np.zeros(n)
        self.X_train = X_train
        self.Y_train = Y_train
        self.E = np.zeros(n)
        for i in range(n):
            self.E[i] = self._predict(i) - Y_train[i]
        
        # Iteration
        for it in range(self.max_iter):
            num_changed_alphas = 0
            for i in range(n):
                if self._check_kkt(i):
                    j = self._select_j(i, n)
                    if j is not None:
                        num_changed_alphas += self._update_alpha(i, j)
            if num_changed_alphas == 0:
                print(f'Iteration {it+1}/{self.max_iter}: No alpha changed')
                break
            else:
                print(f'Iteration {it+1}/{self.max_iter}: {num_changed_alphas} alphas changed')

    def predict_raw(self, X):
        result = 0
        for j in range(self.X_train.shape[0]):
            result += self.alpha[j] * self.Y_train[j] * self.kernel_func(self.X_train[j], X)
        result += self.b
        return result

    def predict(self, X):
        return np.sign(self.predict_raw(X)).astype(int)

    def _predict(self, i):
        """Calculate the prediction value for a single training example"""
        return sum([self.alpha[j] * self.Y_train[j] * self.kernel_func(self.X_train[j], self.X_train[i]) for j in range(self.X_train.shape[0])]) + self.b

    def _check_kkt(self, i):
        """Check if the KKT conditions are violated for alpha[i]"""
        yG = self.Y_train[i] * self.E[i]
        if (self.alpha[i] > 0 and self.alpha[i] < self.C and abs(yG) < self.epsilon) or \
           (self.alpha[i] == 0 and yG >= -self.epsilon) or \
           (self.alpha[i] == self.C and yG <= self.epsilon):
            return False
        return True

    def _select_j(self, i, n):
        """Select a second alpha to optimize"""
        j = i
        while j == i:
            j = random.randint(0, n - 1)
        return j

    def _update_alpha(self, i, j):
        """Update alpha[i] and alpha[j]"""
        if self.Y_train[i] != self.Y_train[j]:
            L = max(0, self.alpha[j] - self.alpha[i])
            H = min(self.C, self.C + self.alpha[j] - self.alpha[i])
        else:
            L = max(0, self.alpha[j] + self.alpha[i] - self.C)
            H = min(self.C, self.alpha[j] + self.alpha[i])
        
        if L == H:
            return 0
        
        eta = 2.0 * self.kernel_func(self.X_train[i], self.X_train[j]) - \
              self.kernel_func(self.X_train[i], self.X_train[i]) - \
              self.kernel_func(self.X_train[j], self.X_train[j])
        if eta >= 0:
            return 0
        
        alpha_j_old = self.alpha[j].copy()
        alpha_i_old = self.alpha[i].copy()
        
        self.alpha[j] -= self.Y_train[j] * (self.E[i] - self.E[j]) / eta
        self.alpha[j] = self._clip_alpha(self.alpha[j], L, H)
        
        if abs(self.alpha[j] - alpha_j_old) < 1e-5:
            return 0
        
        self.alpha[i] += self.Y_train[j] * self.Y_train[i] * (alpha_j_old - self.alpha[j])
        
        b1 = self.b - self.E[i] - self.Y_train[i] * (self.alpha[i] - alpha_i_old) * self.kernel_func(self.X_train[i], self.X_train[i]) - \
             self.Y_train[j] * (self.alpha[j] - alpha_j_old) * self.kernel_func(self.X_train[i], self.X_train[j])
        b2 = self.b - self.E[j] - self.Y_train[i] * (self.alpha[i] - alpha_i_old) * self.kernel_func(self.X_train[i], self.X_train[j]) - \
             self.Y_train[j] * (self.alpha[j] - alpha_j_old) * self.kernel_func(self.X_train[j], self.X_train[j])
        
        if 0 < self.alpha[i] < self.C:
            self.b = b1
        elif 0 < self.alpha[j] < self.C:
            self.b = b2
        else:
            self.b = (b1 + b2) / 2.0
        
        self.E[i] = self._predict(i) - self.Y_train[i]
        self.E[j] = self._predict(j) - self.Y_train[j]
        
        return 1

    def _clip_alpha(self, aj, L, H):
        """Clip alpha[j] to be within the bounds [L, H]"""
        if aj > H:
            return H
        elif aj < L:
            return L
        else:
            return aj

    def _kernel_linear(self, x1, x2):
        return np.dot(x1, x2.T)

    def _kernel_quadratic(self, x1, x2):
        return np.dot(x1, x2.T) ** 2

    def _kernel_rbf(self, x1, x2):
        return np.exp(-self.gamma * np.linalg.norm(x1 - x2) ** 2)

In [8]:
# 检查数据形状和类型
print("X_train_preprocessed shape:", X_train_preprocessed.shape)
print("y_train shape:", y_train.shape)
print("X_train_preprocessed type:", type(X_train_preprocessed))
print("y_train type:", type(y_train))

# 确保 y_train 是 numpy.ndarray
if isinstance(y_train, pd.Series):
    y_train = y_train.values

X_train_preprocessed shape: (16606, 19)
y_train shape: (16606,)
X_train_preprocessed type: <class 'numpy.ndarray'>
y_train type: <class 'pandas.core.series.Series'>


In [9]:
# 创建 SVM 模型
svm = SVMModel(max_iter=50, 
               kernel_type='rbf', 
               C=100, 
               epsilon=0.01, 
               gamma=0.01)

# 训练模型
svm.fit(X_train_preprocessed, y_train)


Iteration 1/50: 6851 alphas changed


KeyboardInterrupt: 

In [None]:
# 在验证集上进行预测
y_val_pred = svm.predict(X_val_preprocessed)

# 计算验证集上的准确率和混淆矩阵
accuracy = accuracy_score(y_val, y_val_pred)
conf_matrix = confusion_matrix(y_val, y_val_pred)

print(f'Validation Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')

# 在测试集上进行预测
y_test_pred = svm.predict(X_test_preprocessed)

# 反标签编码预测结果
y_val_pred_label = le.inverse_transform(y_val_pred)
y_test_pred_label = le.inverse_transform(y_test_pred)

# 保存测试集的预测结果到CSV文件中
submission = pd.DataFrame({'id': id, 'NObeyesdad': y_test_pred_label})
submission.to_csv('submission_svm3.csv', index=False)

Validation Accuracy: 0.8564547206165704
Confusion Matrix:
[[464  56   0   0   0   3   1]
 [ 51 502   2   0   0  60  11]
 [  2   1 458  18   1  25  38]
 [  1   0  20 631   0   0   5]
 [  0   0   1   2 801   0   0]
 [  1  59  25   1   0 340  58]
 [  0  13  57  11   1  72 360]]
