```python
sklearn.svm.SVC(C=1.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False,tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None)

```

In [26]:
import csv
from sklearn import svm
import os
import numpy as np
# Show the current working directory: the CSV files loaded later in this
# notebook are addressed with paths relative to it ('./data/...').
print(os.getcwd())

d:\My\MyCode\MLLearn\chapter13


In [92]:
def load_data(path):
    """Load a purely numeric CSV file into a float NumPy array.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file. Every non-empty row must contain the same
        number of fields, and every field must be parseable as a float.

    Returns
    -------
    np.ndarray
        Array of dtype float with one row per non-empty CSV row
        (shape ``[n_rows, n_columns]``).

    Raises
    ------
    ValueError
        If a field cannot be converted to float or rows have differing lengths.
    """
    # The csv module requires newline='' so it can handle line endings itself.
    with open(path, 'r', newline='') as f:
        # csv.reader yields [] for blank lines (e.g. a trailing empty line);
        # skip those so they do not produce a ragged, unconvertible array.
        rows = [row for row in csv.reader(f) if row]
    return np.array(rows, dtype=float)

def trans_zero(datas):
    """Replace every 0 entry with -1, in place, in each array of ``datas``.

    Parameters
    ----------
    datas : iterable of np.ndarray
        Arrays to modify. Each array is changed in place; nothing is returned.
    """
    for idx in range(len(datas)):
        arr = datas[idx]
        zero_mask = (arr == 0)
        # Boolean-mask assignment writes into the caller's array directly.
        arr[zero_mask] = -1

# Labeled training set: feature matrix and its labels.
# .T.squeeze() transposes the loaded label array and drops length-1 axes
# (presumably the label files hold a single column -- confirm against the data).
label_x = load_data('./data/label_X.csv')
label_y = load_data('./data/label_y.csv').T.squeeze()
# Held-out test set used to score the models in later cells.
test_x = load_data('./data/test_X.csv')
test_y = load_data('./data/test_y.csv').T.squeeze()
# Unlabeled pool. Its labels are loaded too, but the cells below only use them
# to score the trained TSVM, never to train it.
unlabel_x = load_data('./data/unlabel_X.csv')
unlabel_y = load_data('./data/unlabel_y.csv').T.squeeze()
# Recode label 0 as -1 in place in all three label arrays; the TSVM code below
# separates classes by the sign of the label.
trans_zero([label_y,test_y,unlabel_y])

In [93]:
def cal_accuracy(predict, real):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    predict : sequence
        Predicted labels.
    real : sequence
        Ground-truth labels, same length as ``predict``.

    Returns
    -------
    float
        Number of positions where ``predict[i] == real[i]`` divided by the
        number of predictions.

    Raises
    ------
    ValueError
        If the two sequences differ in length (zip would otherwise silently
        truncate and report a wrong accuracy) or if they are empty.
    """
    total = len(predict)
    if total != len(real):
        raise ValueError(
            f'predict and real must have the same length, got {total} and {len(real)}'
        )
    if total == 0:
        raise ValueError('cannot compute accuracy of an empty prediction sequence')
    hits = sum(1 for p, r in zip(predict, real) if p == r)
    return hits / total

In [101]:
# Baseline: a linear SVM fitted on the labeled samples only.
svm_model = svm.SVC(kernel='linear',C=1.0)
svm_model.fit(label_x,label_y)
train_score = svm_model.score(label_x,label_y)
# Performance of the SVM that uses labeled data only.
# The recorded run reached about 94.7% accuracy on the test set.
# NOTE(review): `predict` is not used again in the cells visible here.
predict = svm_model.predict(test_x)
predict_score = svm_model.score(test_x,test_y)
print(f'Train Score: {train_score}\nPredict Score: {predict_score}')

Train Score: 1.0
Predict Score: 0.9469026548672567


In [105]:
class TSVM:
    """Transductive SVM for binary classification with labels in {-1, +1}.

    An SVC is first fitted on the labeled data and used to pseudo-label the
    unlabeled data. The model is then refitted on both sets with per-sample
    weights (``Cl`` for labeled, ``Cu`` for unlabeled samples). Pairs of
    unlabeled samples with opposite pseudo-labels and large slack have their
    labels swapped, and the unlabeled weight is doubled until it reaches ``Cl``.
    """

    def __init__(self, kernel='linear', Cl=0.5, Cu=0.001):
        """
        Parameters
        ----------
        kernel: kernel name passed to ``svm.SVC``
        Cl: weight applied to labeled samples
        Cu: initial weight applied to unlabeled (pseudo-labeled) samples
        """
        self.kernel = kernel
        # Start with Cl >> Cu so that the labeled samples dominate at first.
        self.Cl = Cl
        self.Cu = Cu
        self.svm_model = svm.SVC(kernel=self.kernel, C=1.0)

    def train(self, x1, y1, x2):
        '''
        Train TSVM by x1, y1, x2
        Parameters
        ----------
        x1: Input data with labels
                np.array, shape:[n1, m], n1: numbers of samples with labels, m: numbers of features
        y1: labels of x1, encoded as -1 / +1
                np.array, shape:[n1, ], n1: numbers of samples with labels
        x2: Input data without labels
                np.array, shape:[n2, m], n2: numbers of samples without labels, m: numbers of features

        Notes
        -----
        ``self.Cu`` is treated as the *initial* unlabeled weight and is not
        modified, so ``train`` can be called repeatedly with the same schedule.
        If ``self.Cu >= self.Cl`` the transductive loop does not run and the
        model stays fitted on the labeled data only.
        '''
        # Initialise with the labeled samples only.
        self.svm_model.fit(x1, y1)
        if len(x2) == 0:
            # Nothing to pseudo-label: the supervised fit is the final model.
            return

        # Pseudo-labels for the unlabeled samples.
        y2 = self.svm_model.predict(x2)
        x2_id = np.arange(len(x2))

        x3 = np.vstack([x1, x2])
        print(x3.shape)
        y3 = np.hstack([y1, y2])
        print(y3.shape)

        n_labeled = len(y1)
        # Anneal a local copy so the configured starting value survives training.
        cu = self.Cu

        # Per-sample weights. Rescale C per sample.
        # Higher weights force the classifier to put more emphasis on these points.
        sample_weight = np.ones(len(y3))
        sample_weight[:n_labeled] = self.Cl
        sample_weight[n_labeled:] = cu

        while cu < self.Cl:
            # Refit using the labeled data plus the current pseudo-labels.
            self.svm_model.fit(x3, y3, sample_weight=sample_weight)

            # Find two unlabeled samples with opposite pseudo-labels that are
            # both likely to be wrong, and swap their labels.
            while True:
                # linear: w^Tx + b
                y2_decision_function = self.svm_model.decision_function(x2)

                # Slack of every unlabeled sample under its pseudo-label.
                epsilon = 1 - y2 * y2_decision_function
                positive_mask = y2 > 0
                negative_mask = y2 < 0
                if not positive_mask.any() or not negative_mask.any():
                    # Only one class among the pseudo-labels: no pair to swap
                    # (np.argmax would raise on the empty set).
                    break

                positive_max_id = x2_id[positive_mask][np.argmax(epsilon[positive_mask])]
                negative_max_id = x2_id[negative_mask][np.argmax(epsilon[negative_mask])]
                a, b = epsilon[positive_max_id], epsilon[negative_max_id]
                if a > 0 and b > 0 and a + b > 2.0:
                    y2[positive_max_id] = -1 * y2[positive_max_id]
                    y2[negative_max_id] = -1 * y2[negative_max_id]
                    y3 = np.hstack([y1, y2])
                    self.svm_model.fit(x3, y3, sample_weight=sample_weight)
                else:
                    break
            # Increase the influence of the unlabeled samples.
            cu = min(self.Cl, 2 * cu)
            sample_weight[n_labeled:] = cu

    def score(self, x, y):
        """Return the underlying SVC's ``score`` on ``x`` / ``y``."""
        return self.svm_model.score(x, y)

# Train the TSVM on the labeled set plus the unlabeled pool (labels withheld).
tsvm = TSVM()
tsvm.train(label_x,label_y,unlabel_x)
# Accuracy on the held-out test set.
print(tsvm.score(test_x,test_y))
# Accuracy on the unlabeled pool, measured against its withheld true labels.
print(tsvm.score(unlabel_x,unlabel_y))

(456, 30)
(456,)
0.9557522123893806
0.935672514619883
