In [None]:
# encoding=utf8

# 特征工程-线性特征变换-奇异值分解SVD


In [3]:
import numpy as np
import matplotlib.pyplot as plt



In [1]:
class PCA:
    """PCA implemented through singular value decomposition (SVD).

    1. ``train_x`` must hold one sample per row and one feature per column.
    2. Either the columns (features) or the rows (samples) can be compressed.
    3. Centering the data before the decomposition is optional.
    """

    def __init__(self, dimension, centered=True, compression="cols"):
        """
        dimension:      number of components kept after the reduction
        centered:       whether to subtract the per-column mean first
        compression:    "cols" to compress columns, "rows" to compress rows
        """
        self.dimension = dimension
        self.centered = centered
        self.compression = compression

    def _centered(self, train_x):
        """Subtract the column-wise mean from every row."""
        return train_x - np.mean(train_x, axis=0)

    def _svd(self, train_x):
        """Return the ``(u, sigma, vt)`` triple produced by ``np.linalg.svd``."""
        return np.linalg.svd(train_x)

    def transform(self, train_x):
        """Reduce ``train_x`` along the configured axis.

        train_x:    data to reduce, one sample per row (m rows, n columns)

        Returns
            compression == "cols":  ``train_x @ V[:, :dimension]``,
                                    shape (m, dimension)
            compression == "rows":  ``U[:, :dimension].T @ train_x``,
                                    shape (dimension, n)
        where ``train_x`` has already been centered when ``centered`` is
        truthy, and U / V are its left / right singular vectors.

        Raises
            ValueError: ``compression`` is neither "cols" nor "rows".
        """
        # Reject a bad mode before paying for the decomposition.
        if self.compression not in ("cols", "rows"):
            raise ValueError("parameter error.")

        # Accept nested lists as well as arrays; the slicing below needs an array.
        train_x = np.asarray(train_x)

        # 1. Optional centering
        if self.centered:
            train_x = self._centered(train_x)

        # 2. Singular value decomposition; np.linalg.svd returns V transposed.
        u, sigma, vt = self._svd(train_x)

        # 3. Projection onto the leading singular vectors
        if self.compression == "cols":
            return np.dot(train_x, vt[:self.dimension].T)
        # Row compression projects the sample axis onto the leading left
        # singular vectors. Multiplying U[:, :d] by the first d raw samples
        # (the previous formula) left the row count at m and compressed nothing.
        return np.dot(u[:, :self.dimension].T, train_x)

In [2]:
    with open("../../DataSets/pybk020master/iris.txt", "r") as f:  
        iris = []  
        for line in f.readlines():  
            temp = line.strip().split(",")  
            if temp[4] == "Iris-setosa":  
                temp[4] = 0  
            elif temp[4] == "Iris-versicolor":  
                temp[4] = 1  
            elif temp[4] == "Iris-virginica":  
                temp[4] = 2  
            else:  
                raise(Exception("data error."))  
            iris.append(temp)  
    iris = np.array(iris, np.float)  
    return iris  
      


FileNotFoundError: [Errno 2] No such file or directory: '../SVD/data/Iris.txt'

In [None]:
def draw_result(new_trainX, iris):
    """Scatter-plot the first two columns of the reduced data, one color per class.

    new_trainX:     reduced data; its rows are selected with masks built from iris
    iris:           original data; column 4 holds the class code (0, 1 or 2)
    """
    # (class code, marker color, legend label) for each species
    species_styles = (
        (0, "red", "Iris-setosa"),
        (1, "orange", "Iris-versicolor"),
        (2, "blue", "Iris-virginica"),
    )

    plt.figure()
    class_codes = iris[:, 4]
    for code, shade, species in species_styles:
        points = new_trainX[class_codes == code]
        plt.scatter(points[:, 0], points[:, 1], color=shade, label=species)
    plt.legend()
    plt.show()
      



In [None]:

    # 导入数据  
    iris = load_data()  
      
    # 降维  
    clf = PCA(2, centered, compression)  
    new_iris = clf.transform(iris[:, 0:4])  
      
    # 降维结果可视化  
    draw_result(new_iris, iris)  