## 第十四章—利用SVD简化数据

**优点：简化数据，去除噪声，提高算法的结果**  

**缺点：数据的转换可能难以理解**  

**适用数据类型：数值型数据**

In [1]:
import numpy as np
# Decompose a small 2x2 matrix whose rows are proportional and show the
# three SVD factors: U, the singular values, and V transposed.
U, Sigma, VT = np.linalg.svd(np.array([[1, 1], [7, 7]]))
print(U, '\n', Sigma, '\n', VT)

[[-0.14142136 -0.98994949]
 [-0.98994949  0.14142136]] 
 [10.  0.] 
 [[-0.70710678 -0.70710678]
 [-0.70710678  0.70710678]]


In [2]:
def loadExData():
    """Return the 7x5 example rating matrix as a new list of row lists.

    The first three rows have non-zero entries only in the last two columns;
    the remaining four rows have non-zero entries only in the first three.
    """
    lastTwoRows = [[0, 0, 0, value, value] for value in (2, 3, 1)]
    firstThreeRows = [[value, value, value, 0, 0] for value in (1, 2, 5, 1)]
    return lastTwoRows + firstThreeRows

In [3]:
# Factor the example matrix and print its singular values.
Data = loadExData()
U, Sigma, VT = np.linalg.svd(np.array(Data))
print(Sigma)

[9.64365076e+00 5.29150262e+00 7.40623935e-16 4.05103551e-16
 2.21838243e-32]


In [4]:
# Rebuild an approximation of the data from the three leading singular values.
# np.asmatrix replaces np.mat, which was removed in NumPy 2.0; keeping the
# diagonal as a matrix makes `*` below a matrix product rather than elementwise.
Sig3 = np.asmatrix(np.diag(Sigma[:3]))
print(U[:,:3] * Sig3 * VT[:3,:])

[[ 5.03302006e-17  1.95279569e-15  1.70575023e-15  2.00000000e+00
   2.00000000e+00]
 [-7.69233911e-16  3.14619452e-16  4.54614459e-16  3.00000000e+00
   3.00000000e+00]
 [-2.02143152e-16  6.40186235e-17  1.38124528e-16  1.00000000e+00
   1.00000000e+00]
 [ 1.00000000e+00  1.00000000e+00  1.00000000e+00 -1.52065993e-33
  -1.21652794e-33]
 [ 2.00000000e+00  2.00000000e+00  2.00000000e+00 -3.04131986e-33
  -2.43305589e-33]
 [ 5.00000000e+00  5.00000000e+00  5.00000000e+00  1.82479192e-33
   1.45983353e-33]
 [ 1.00000000e+00  1.00000000e+00  1.00000000e+00 -1.52065993e-33
  -1.21652794e-33]]


In [5]:
# Similarity measures
def ecludSim(inA, inB):
    """Return a Euclidean-distance based similarity, 1 / (1 + ||inA - inB||).

    Identical inputs give 1.0; the value shrinks toward 0 as the distance
    between the two inputs grows.
    """
    distance = np.linalg.norm(inA - inB)
    return 1.0 / (1.0 + distance)

def pearsSim(inA, inB):
    """Return the Pearson correlation of inA and inB rescaled to 0.5 + 0.5 * r.

    Inputs with fewer than three entries are treated as perfectly similar
    and yield 1.0 without computing a correlation.
    """
    if len(inA) >= 3:
        # rowvar = 0: each column is a variable and each row an observation.
        correlations = np.corrcoef(inA, inB, rowvar = 0)
        return 0.5 + 0.5 * correlations[0][1]
    return 1.0

def cosSim(inA, inB):
    """Return the cosine similarity of two column vectors rescaled to 0.5 + 0.5 * cos.

    inA and inB are expected to be np.matrix column vectors, so that
    ``inA.T * inB`` is a 1x1 matrix product (their dot product).
    """
    # np.float was removed in NumPy 1.24; .item() pulls the single entry out
    # of the 1x1 product and float() makes it a plain Python float.
    num = float((inA.T * inB).item())
    denom = np.linalg.norm(inA) * np.linalg.norm(inB)
    return 0.5 + 0.5 * (num / denom)

In [6]:
# Compare column 0 with column 4, and column 0 with itself, under each
# similarity measure. np.asmatrix replaces np.mat (removed in NumPy 2.0).
myMat = np.asmatrix(loadExData())
print(ecludSim(myMat[:,0], myMat[:,4]))
print(ecludSim(myMat[:,0], myMat[:,0]))
print(cosSim(myMat[:,0], myMat[:,4]))
print(cosSim(myMat[:,0], myMat[:,0]))
print(pearsSim(myMat[:,0], myMat[:,4]))
print(pearsSim(myMat[:,0], myMat[:,0]))

0.12973190755680383
1.0
0.5
1.0
0.20596538173840329
1.0


### 示例：餐馆菜肴推荐引擎

In [7]:
def standEst(dataMat, user, simMeas, item):
    """Estimate the rating `user` would give `item` from item-item similarity.

    dataMat is indexed as dataMat[user, item] and must support the matrix
    `.A` attribute; a value of 0 means "not rated". For every item the user
    has rated, the similarity to `item` is computed with `simMeas` over the
    rows where both items have a rating above 0, and printed. The result is
    the similarity-weighted average of the user's ratings, or 0 when the
    similarities sum to 0.
    """
    numItems = np.shape(dataMat)[1]
    weightSum = 0.0
    weightedRatings = 0.0
    for other in range(numItems):
        rating = dataMat[user, other]
        if rating == 0:
            continue
        # Rows in which both `item` and `other` carry a positive rating.
        bothRated = np.logical_and(dataMat[:, item].A > 0, dataMat[:, other].A > 0)
        coRaters = np.nonzero(bothRated)[0]
        if len(coRaters) > 0:
            similarity = simMeas(dataMat[coRaters, item], dataMat[coRaters, other])
        else:
            similarity = 0
        print('the %d and %d similarity is: %f' % (item, other, similarity))
        weightSum += similarity
        weightedRatings += similarity * rating
    if weightSum == 0:
        return 0
    return weightedRatings / weightSum
    
def recommend(dataMat, user, N = 3, simMeas = cosSim, estMethod = standEst):
    """Return up to N (item, estimatedScore) pairs for items `user` has not rated.

    Items whose entry in the user's row is 0 are scored with
    estMethod(dataMat, user, simMeas, item), and the pairs are returned
    sorted by score, highest first. If the user has no unrated items the
    string 'you rated everything' is returned instead of a list.
    """
    # Column indices of the zero entries in this user's row.
    _, candidates = np.nonzero(dataMat[user, :].A == 0)
    if len(candidates) == 0:
        return 'you rated everything'
    scored = [(item, estMethod(dataMat, user, simMeas, item)) for item in candidates]
    scored.sort(key = lambda pair: pair[1], reverse = True)
    return scored[:N]

In [8]:
# Adjust a few ratings in the example matrix, then recommend items for user 2
# under each similarity measure. np.asmatrix replaces np.mat (removed in
# NumPy 2.0).
myMat = np.asmatrix(loadExData())
myMat[0,1] = myMat[0,0] = myMat[1,0] = myMat[2,0] = 4
myMat[3,3] = 2
print(myMat)
print(recommend(myMat, 2))
print(recommend(myMat, 2, simMeas = ecludSim))
print(recommend(myMat, 2, simMeas = pearsSim))

[[4 4 0 2 2]
 [4 0 0 3 3]
 [4 0 0 1 1]
 [1 1 1 2 0]
 [2 2 2 0 0]
 [5 5 5 0 0]
 [1 1 1 0 0]]
the 1 and 0 similarity is: 1.000000
the 1 and 3 similarity is: 0.928746
the 1 and 4 similarity is: 1.000000
the 2 and 0 similarity is: 1.000000
the 2 and 3 similarity is: 1.000000
the 2 and 4 similarity is: 0.000000
[(2, 2.5), (1, 2.0243290220056256)]
the 1 and 0 similarity is: 1.000000
the 1 and 3 similarity is: 0.309017
the 1 and 4 similarity is: 0.333333
the 2 and 0 similarity is: 1.000000
the 2 and 3 similarity is: 0.500000
the 2 and 4 similarity is: 0.000000
[(2, 3.0), (1, 2.8266504712098603)]
the 1 and 0 similarity is: 1.000000
the 1 and 3 similarity is: 1.000000
the 1 and 4 similarity is: 1.000000
the 2 and 0 similarity is: 1.000000
the 2 and 3 similarity is: 1.000000
the 2 and 4 similarity is: 0.000000
[(2, 2.5), (1, 2.0)]


In [9]:
def loadExData2():
    """Return the larger 11x11 example rating matrix as a new list of row lists."""
    rows = (
        (0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5),
        (0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3),
        (0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0),
        (3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0),
        (5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0),
        (0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0),
        (4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1),
        (0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4),
        (0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2),
        (0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0),
        (1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0),
    )
    return [list(row) for row in rows]

In [10]:
# Print the singular values of the larger example, the total of their squares,
# 90% of that total, and the squared sums of the first two and first three
# singular values for comparison against the 90% mark.
# np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
U, Sigma, VT = np.linalg.svd(np.asmatrix(loadExData2()))
print(Sigma)
print(sum(Sigma**2))
print(sum(Sigma**2)*0.9)
print(sum((Sigma**2)[:2]))
print(sum((Sigma**2)[:3]))

[15.77075346 11.40670395 11.03044558  4.84639758  3.09292055  2.58097379
  1.00413543  0.72817072  0.43800353  0.22082113  0.07367823]
541.9999999999995
487.7999999999996
378.8295595113579
500.50028912757926


In [11]:
# SVD-based rating estimation
def svdEst(dataMat, user, simMeas, item):
    """Estimate the rating `user` would give `item` in an SVD-reduced item space.

    dataMat is indexed as dataMat[user, item]; a value of 0 means "not rated".
    Each item is mapped to a low-dimensional vector via
    dataMat.T * U[:, :k] * inverse(diag(Sigma[:k])), with k = min(4, number of
    singular values). For every other item the user has rated, its similarity
    to `item` is computed with `simMeas` on those vectors (passed as column
    matrices) and printed. The result is the similarity-weighted average of
    the user's ratings, or 0 when the similarities sum to 0.
    """
    # Accept plain arrays too: asmatrix makes `*` below a matrix product.
    dataMat = np.asmatrix(dataMat)
    n = np.shape(dataMat)[1]
    simTotal = 0.0
    ratSimTotal = 0.0
    U, Sigma, VT = np.linalg.svd(dataMat)
    # Use at most 4 singular values; capping at len(Sigma) keeps matrices with
    # fewer than 4 rows or columns from failing on a shape mismatch.
    numSV = min(4, len(Sigma))
    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    SigK = np.asmatrix(np.diag(Sigma[:numSV]))      # diagonal matrix of the kept singular values
    xformedItems = dataMat.T * U[:, :numSV] * SigK.I  # one reduced-space row per item
    for j in range(n):
        userRating = dataMat[user, j]
        if userRating == 0 or j == item:
            continue
        similarity = simMeas(xformedItems[item, :].T, xformedItems[j, :].T)
        print('the %d and %d similarity is: %f' % (item, j, similarity))
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    return ratSimTotal / simTotal

In [12]:
# Recommend for user 1 with the SVD-based estimator, first with cosine
# similarity (the default of recommend), then with Pearson similarity.
print(recommend(myMat, 1, simMeas = cosSim, estMethod = svdEst))
print(recommend(myMat, 1, simMeas = pearsSim, estMethod = svdEst))

the 1 and 0 similarity is: 0.498142
the 1 and 3 similarity is: 0.498131
the 1 and 4 similarity is: 0.509974
the 2 and 0 similarity is: 0.552670
the 2 and 3 similarity is: 0.552976
the 2 and 4 similarity is: 0.217301
[(2, 3.4177569186592387), (1, 3.330717154558564)]
the 1 and 0 similarity is: 0.626075
the 1 and 3 similarity is: 0.672793
the 1 and 4 similarity is: 0.614375
the 2 and 0 similarity is: 0.429334
the 2 and 3 similarity is: 0.387057
the 2 and 4 similarity is: 0.043539
[(2, 3.4992661245386794), (1, 3.327232428061366)]


### 示例：基于SVD的图像压缩

In [13]:
# Image compression helpers
def printMat(inMat, thresh = 0.8):
    """Print the top-left 32x32 entries of inMat as rows of 0/1 digits.

    An entry is shown as 1 when its float value is strictly greater than
    `thresh`, otherwise as 0. Digits are separated by single spaces and each
    row is followed by an empty line.
    """
    for row in range(32):
        for col in range(32):
            print(1 if float(inMat[row, col]) > thresh else 0, end = ' ')
        print('\n')

def imgCompress(numSV = 3, thresh = 0.8, fileName = 'D:/data/study/AI/ML/MLcode/Ch14/0_5.txt'):
    """Print a digit image and its reconstruction from the leading singular values.

    numSV    -- number of singular values kept for the reconstruction.
    thresh   -- threshold passed on to printMat for both printouts.
    fileName -- text file to read; the first 32 characters of every line are
                parsed as single digits. Defaults to the path the original
                code used.

    Returns None; the original and reconstructed matrices are printed.
    """
    # Read through a context manager so the file handle is always closed.
    with open(fileName) as imgFile:
        myl = [[int(line[i]) for i in range(32)] for line in imgFile]
    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    myMat = np.asmatrix(myl)
    print("****original matrix******")
    printMat(myMat, thresh)
    U, Sigma, VT = np.linalg.svd(myMat)
    # Diagonal matrix built from the first numSV singular values.
    SigRecon = np.asmatrix(np.diag(Sigma[:numSV]))
    reconMat = U[:, :numSV] * SigRecon * VT[:numSV, :]
    print("****reconstructed matrix using %d singular values******" % numSV)
    printMat(reconMat, thresh)

In [14]:
# Print the image and its reconstruction from two singular values.
imgCompress(numSV = 2)

****original matrix******
0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 

0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 

0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 

0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 

0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 

0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 

0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 

0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 

0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 

0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 

0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 

0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 

0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 