Commit 21889b2: test

1 parent a19ffd4 commit 21889b2

File tree: 7 files changed, +4616 additions, -15 deletions


NB&LR/NaiveBayes_vs_LogisticRegression.py

Lines changed: 56 additions & 15 deletions
@@ -51,26 +51,44 @@ def fit(self, train_x, train_y):
         param = p0, p1, p0Vec, p1Vec
         return vocabList, param
 
-    def predict(self, test_X, test_y, vocabList, param):
+    def predict(self, test_X, vocabList, param):
         p0, p1, p0Vec, p1Vec = param
         testMat = []
         for wordList in test_X:
             testMat.append(self.listOfWords2Vec(vocabList, wordList))
         testMatrix = np.array(testMat)  ## array
-        testLabel = np.array(test_y)  ## array
         predict_y = []
         for vec in testMatrix:
             prob_y0 = sum(vec*p0Vec)+np.log(p0)  # corresponds to p(w1|c0)*p(w2|c0)*...*p(c0); log(a*b) = log(a)+log(b)
             prob_y1 = sum(vec*p1Vec)+np.log(p1)  # corresponds to p(w1|c1)*p(w2|c1)*...*p(c1); log(a*b) = log(a)+log(b)
-            if prob_y0 < prob_y1:
+            if prob_y0 < prob_y1:  ## binary 0/1 decision, but NaiveBayes can be modified for multi-class
                 predict_y.append(1)
             else:
                 predict_y.append(0)
         predictLabel = np.array(predict_y)  ## array
-        print 'accuracy:', sum(testLabel==predictLabel)/float(len(testLabel))
         return predictLabel
 
-class LogisticRegression():  # binary classification
+    def predict1(self, test_X, test_y, vocabList, param):
+        p0, p1, p0Vec, p1Vec = param
+        testMat = []
+        for wordList in test_X:
+            testMat.append(self.listOfWords2Vec(vocabList, wordList))
+        testMatrix = np.array(testMat)  ## array
+        m = testMatrix.shape[0]
+        predict_y = []
+        for vec in testMatrix:
+            prob_y0 = sum(vec*p0Vec)+np.log(p0)  # corresponds to p(w1|c0)*p(w2|c0)*...*p(c0); log(a*b) = log(a)+log(b)
+            prob_y1 = sum(vec*p1Vec)+np.log(p1)  # corresponds to p(w1|c1)*p(w2|c1)*...*p(c1); log(a*b) = log(a)+log(b)
+            if prob_y0 < prob_y1:  ## binary 0/1 decision, but NaiveBayes can be modified for multi-class
+                predict_y.append(1)
+            else:
+                predict_y.append(0)
+        testLabel = np.array(test_y)  ## array
+        predictLabel = np.array(predict_y)  ## array
+        print 'accuracy:', sum(testLabel==predictLabel)/float(m)
+        return predictLabel
+
+class LogisticRegression():  # binary 0/1 classification
     def __init__(self):
         pass
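A note on the log-space scoring that both NaiveBayes predict methods in this hunk rely on: since log(a*b) = log(a)+log(b), the product p(w1|c)*...*p(wn|c)*p(c) turns into a sum of log terms, which avoids floating-point underflow on long word vectors. Below is a minimal self-contained sketch of that decision rule, assuming (as in this repo) that the per-class vectors already hold log conditional probabilities; the toy numbers and names are illustrative, not from the repo:

import numpy as np

def nb_log_predict(vec, p0Vec, p1Vec, p0, p1):
    # vec: binary bag-of-words vector; p0Vec/p1Vec: log p(w_i | class)
    # A sum of logs equals the log of the product of word likelihoods times the prior.
    prob_y0 = np.sum(vec * p0Vec) + np.log(p0)
    prob_y1 = np.sum(vec * p1Vec) + np.log(p1)
    return 1 if prob_y0 < prob_y1 else 0

# Toy 3-word vocabulary where word 2 strongly indicates class 1:
p0Vec = np.log([0.5, 0.4, 0.1])
p1Vec = np.log([0.2, 0.2, 0.6])
print(nb_log_predict(np.array([0, 0, 1]), p0Vec, p1Vec, 0.5, 0.5))  # -> 1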

@@ -102,29 +120,48 @@ def fit(self, train_x, train_y, alpha=0.01, maxCycles=100):
         trainMatrix = np.matrix(trainMat)  ## np.matrix is 2-D  # size: m*n
         trainLabel = np.matrix(train_y).T  ## np.matrix is 2-D  # size: m*1
         m, n = trainMatrix.shape
-        weigh = np.ones((n, 1))  # size: n*1
+        weigh = np.matrix(np.ones((n, 1)))  # size: n*1
         for i in range(maxCycles):
-            hx = self.sigmoid(trainMatrix*weigh)  # size: m*1
+            hx = self.sigmoid(trainMatrix*weigh)  # size: m*1; sigmoid maps the linear score into [0,1], read as a probability
             error = trainLabel-hx  # size: m*1
             weigh += alpha*trainMatrix.T*error  # size: n*1
         return vocabList, weigh
 
     # classify with the learned parameters
-    def predict(self, test_X, test_y, vocabList, weigh):
+    def predict(self, test_X, vocabList, weigh):
         testMat = []
         for wordList in test_X:
             testMat.append(self.listOfWords2Vec(vocabList, wordList))
         testMatrix = np.matrix(testMat)  ## np.matrix is 2-D
-        testLabel = np.array(test_y)  ## array
-        hx = self.sigmoid(testMatrix*weigh)  # size: m*1
+        m = testMatrix.shape[0]
+        hx = self.sigmoid(testMatrix*weigh)  # size: m*1; sigmoid maps the linear score into [0,1], read as a probability
+        predict_y = []
+        for i in range(m):  ## binary 0/1 decision
+            if hx[i][0] > 0.5:
+                predict_y.append(1)
+            else:
+                predict_y.append(0)
+        predictLabel = np.array(predict_y)  ## array
+        # predictLabel = np.matrix(predict_y).T  ## matrix
+        return predictLabel
+
+    # classify with the learned parameters
+    def predict1(self, test_X, test_y, vocabList, weigh):
+        testMat = []
+        for wordList in test_X:
+            testMat.append(self.listOfWords2Vec(vocabList, wordList))
+        testMatrix = np.matrix(testMat)  ## np.matrix is 2-D
+        m = testMatrix.shape[0]
+        hx = self.sigmoid(testMatrix*weigh)  # size: m*1; sigmoid maps the linear score into [0,1], read as a probability
         predict_y = []
-        for i in range(len(testLabel)):
+        for i in range(m):  ## binary 0/1 decision
             if hx[i][0] > 0.5:
                 predict_y.append(1)
             else:
                 predict_y.append(0)
+        testLabel = np.array(test_y)  ## array
         predictLabel = np.array(predict_y)  ## array
-        print 'accuracy:', sum(testLabel==predictLabel)/float(len(testLabel))
+        print 'accuracy:', sum(testLabel==predictLabel)/float(m)
         return predictLabel
 
 def loadTrainDataSet():
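The fit loop above is batch gradient ascent on the logistic log-likelihood: sigmoid squashes each linear score trainMatrix*weigh into [0,1], and weigh += alpha*trainMatrix.T*error follows the gradient, since for Bernoulli labels the gradient is X.T*(y - sigmoid(X*w)). A self-contained sketch of the same update with plain NumPy arrays instead of np.matrix; the toy data and names are illustrative assumptions, not from the repo:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def fit_logreg(X, y, alpha=0.01, maxCycles=100):
    # X: (m, n) design matrix; y: (m,) labels in {0, 1}
    w = np.ones(X.shape[1])
    for _ in range(maxCycles):
        hx = sigmoid(X @ w)          # (m,) predicted probabilities
        w += alpha * X.T @ (y - hx)  # log-likelihood gradient step
    return w

# Toy usage: label 1 when the learned probability exceeds 0.5.
X = np.array([[1.0, 0.5], [1.0, 2.0], [1.0, 3.0], [1.0, -1.0]])
y = np.array([0.0, 1.0, 1.0, 0.0])
w = fit_logreg(X, y, alpha=0.1, maxCycles=500)
print((sigmoid(X @ w) > 0.5).astype(int))  # separable toy set, should give [0 1 1 0]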
@@ -150,9 +187,13 @@ def loadTestDataSet():
     test_X, test_y = loadTestDataSet()
     clf = NaiveBayes()
     vocabList, param = clf.fit(train_X, train_y)
-    results = clf.predict(test_X, test_y, vocabList, param)
+    results = clf.predict(test_X, vocabList, param)
     print results
+    results1 = clf.predict1(test_X, test_y, vocabList, param)
+    print results1
     clf = LogisticRegression()
     vocabList, weigh = clf.fit(train_X, train_y)
-    results = clf.predict(test_X, test_y, vocabList, weigh)
-    print results
+    results = clf.predict(test_X, vocabList, weigh)
+    print results
+    results1 = clf.predict1(test_X, test_y, vocabList, weigh)
+    print results1

README.md

Lines changed: 9 additions & 0 deletions
@@ -12,4 +12,13 @@
 
 * LogisticRegression algorithm for text classification
 
+* Regression algorithms:
+    * standard linear regression
+    * locally weighted linear regression
+    * ridge regression
+
+Example results:
+![image](./Regression/standRegresResults.png)
+![image](./Regression/lwlrResults.png)
+
 Reference: "Machine Learning in Action"

Regression/RegressionTest.py

Lines changed: 174 additions & 0 deletions
@@ -0,0 +1,174 @@
+#coding:utf-8
+import numpy as np
+import matplotlib.pyplot as plt
+
+'''
+np.linalg  Core Linear Algebra Tools
+xx.T  transpose of matrix xx
+xx.I  inverse of matrix xx
+m  number of sample points
+n  number of feature dimensions
+'''
+
+def loadDataSet(datafile):
+    featData = []
+    labelData = []
+    with open(datafile, 'r') as fr_file:
+        for eachLine in fr_file:
+            oneLine = eachLine.split('\t')
+            tempArr = []
+            for i in range(len(oneLine)-1):
+                tempArr.append(float(oneLine[i]))
+            featData.append(tempArr)
+            labelData.append(float(oneLine[-1].strip()))  # continuous float target
+    featData = np.array(featData)  # convert to array
+    labelData = np.array(labelData)  # convert to array
+    return featData, labelData
+
+def rssError(yArr, yHat):
+    return np.sum((yArr-yHat)**2)
+
+def showRegres(xArr, yArr, yHat):
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    ax.scatter(xArr[:, 1], yArr)
+    '''
+    The data assumes x0 = 1, so yHat = ws[0]+ws[1]*x1; plot yHat against x1 to inspect the linear relationship
+    '''
+    srtInd = xArr[:, 1].argsort(0)
+    # print srtInd
+    ax.plot(xArr[srtInd, 1], yHat[srtInd])  # the points must be sorted ascending before drawing the fitted line
+    plt.show()
+
+'''Standard linear regression: least squares (minimum squared error), applicable when m >= n'''
+def standRegres(xMat, yMat):
+    xTx = xMat.T*xMat  # n*n
+    if np.linalg.det(xTx) == 0.0:
+        print 'This matrix is singular, cannot do inverse'
+        return
+    ## method 1
+    ws = xTx.I*(xMat.T*yMat)  # n*1
+    ## method 2
+    # ws = np.linalg.solve(xTx, xMat.T*yMat)  # n*1
+    # yHat = xMat*ws  # m*1
+    return ws
+
+def standRegresTest(xArr, yArr):
+    xMat = np.matrix(xArr)  # m*n
+    yMat = np.matrix(yArr).T  # m*1
+    ws = standRegres(xMat, yMat)  # n*1
+    # print ws
+    yHat = xMat*ws  # m*1
+    yHat = np.array(yHat).reshape(1, -1)[0]  ## flattens the 2-D matrix [[xx1][xx2]] into the 1-D array [xx1, xx2]
+    return yHat
+
+'''Locally weighted linear regression, applicable when m >= n'''
+def lwlr(testPoint, xMat, yMat, k=1.0):
+    m = np.shape(xMat)[0]
+    weights = np.matrix(np.eye(m))  # create a diagonal matrix
+    for j in range(m):
+        diffMat = testPoint-xMat[j, :]
+        weights[j, j] = np.exp(diffMat*diffMat.T/(-2.0*k**2))  # Gaussian kernel
+    print weights
+    xTx = xMat.T*(weights*xMat)
+    if np.linalg.det(xTx) == 0.0:
+        print 'This matrix is singular, cannot do inverse'
+        return
+    ws = xTx.I*(xMat.T*(weights*yMat))
+    return testPoint*ws
+
+def lwlrTest(testArr, xArr, yArr, k=1.0):
+    xMat = np.matrix(xArr)  # m*n
+    yMat = np.matrix(yArr).T  # m*1
+    m = np.shape(testArr)[0]
+    yHat = np.zeros(m)
+    for i in range(m):
+        yHat[i] = lwlr(testArr[i], xMat, yMat, k)
+    return yHat
+
+'''Ridge regression, applicable both when m >= n and when m < n'''
+def ridgeRegres(xMat, yMat, lam=0.2):
+    xTx = xMat.T*xMat
+    denom = xTx+np.eye(np.shape(xMat)[1])*lam
+    if np.linalg.det(denom) == 0.0:
+        print 'This matrix is singular, cannot do inverse'
+        return
+    ws = denom.I*(xMat.T*yMat)
+    return ws
+
+def ridgeTest(xArr, yArr):
+    xMat = np.matrix(xArr)
+    yMat = np.matrix(yArr).T
+    '''standardize X and Y'''
+    ## regularize Y's
+    yMean = np.mean(yMat, 0)
+    yMat = yMat-yMean  # to eliminate X0 take mean off of Y
+    ## regularize X's
+    xMeans = np.mean(xMat, 0)  # calc mean then subtract it off
+    xVar = np.var(xMat, 0)  # calc variance of Xi then divide by it
+    xMat = (xMat-xMeans)/xVar
+    '''compute wMat'''
+    numTestPts = 30
+    wMat = np.matrix(np.zeros((numTestPts, np.shape(xMat)[1])))
+    for i in range(numTestPts):
+        ws = ridgeRegres(xMat, yMat, np.exp(i-10))
+        wMat[i, :] = ws.T
+    return wMat
+
+if __name__ == '__main__':
+    ####################################################################################
+    ## standard linear regression
+    xArr, yArr = loadDataSet('ex.txt')
+    yHat = standRegresTest(xArr, yArr)
+    print yHat
+    showRegres(xArr, yArr, yHat)
+    coef = np.corrcoef(yArr, yHat)
+    print coef
+    print (coef[0, 1]+coef[1, 0])/2.0
+    print rssError(yArr, yHat)
+    ####################################################################################
+    ## locally weighted linear regression
+    xArr, yArr = loadDataSet('ex.txt')
+    yHat = lwlrTest(xArr, xArr, yArr, k=0.01)
+    print yHat
+    showRegres(xArr, yArr, yHat)
+    coef = np.corrcoef(yArr, yHat)
+    print coef
+    print (coef[0, 1]+coef[1, 0])/2.0
+    print rssError(yArr, yHat)
+    # '''search for the k that maximizes the correlation coefficient'''
+    # max_k = 0
+    # max_coef = 0
+    # for k in range(1, 100):
+    #     k /= 1000.0
+    #     yHat = lwlrTest(xArr, xArr, yArr, k)
+    #     coef = np.corrcoef(yArr, yHat)
+    #     temp_coef = (coef[0, 1]+coef[1, 0])/2.0
+    #     if temp_coef > max_coef:
+    #         max_coef = temp_coef
+    #         max_k = k
+    # print max_k, max_coef
+    # '''search for the k that minimizes the squared error'''
+    # min_k = 0
+    # min_error = np.inf
+    # for k in range(1, 100):
+    #     k /= 1000.0
+    #     yHat = lwlrTest(xArr, xArr, yArr, k)
+    #     temp_error = rssError(yArr, yHat)
+    #     if temp_error < min_error:
+    #         min_error = temp_error
+    #         min_k = k
+    # print min_k, min_error
+    ####################################################################################
+    ## ridge regression
+    xArr, yArr = loadDataSet('abalone.txt')
+    wMat = ridgeTest(xArr, yArr)
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    ax.plot(wMat)  # plots the regression coefficients against log(lam)
+    plt.show()
+    '''
+    At the far left, lam = np.exp(0-10) is essentially 0, so the coefficients keep their original (unshrunk) values, matching standard linear regression.
+    At the far right, lam = np.exp(29-10) = e^19 and the coefficients are all shrunk to 0.
+    Some lam in between gives the best predictions: unimportant regression coefficients are driven out, and a coefficient's magnitude indicates its importance.
+    '''
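All three estimators in this file are closed-form least-squares solutions: standard regression solves w = (X^T X)^-1 X^T y, LWLR solves w = (X^T W X)^-1 X^T W y with a per-query Gaussian-kernel weight matrix W, and ridge solves w = (X^T X + lam*I)^-1 X^T y, whose regularized matrix stays invertible even when m < n. A compact sketch of the same formulas with plain arrays and np.linalg.solve, which is numerically preferable to forming xTx.I explicitly; the toy data and names are illustrative assumptions, not from the repo:

import numpy as np

def ols(X, y):
    # w = (X^T X)^-1 X^T y, via a linear solve instead of an explicit inverse
    return np.linalg.solve(X.T @ X, X.T @ y)

def lwlr_point(x_query, X, y, k=1.0):
    # Gaussian kernel: samples near x_query dominate the local fit
    d2 = np.sum((X - x_query) ** 2, axis=1)
    W = np.diag(np.exp(-d2 / (2.0 * k ** 2)))
    w = np.linalg.solve(X.T @ W @ X, X.T @ W @ y)
    return x_query @ w

def ridge(X, y, lam=0.2):
    # X^T X + lam*I is invertible for lam > 0, so m < n is also handled
    return np.linalg.solve(X.T @ X + lam * np.eye(X.shape[1]), X.T @ y)

# Toy check on exact data y = 1 + 2*x1, with the x0 = 1 bias column this file assumes:
X = np.column_stack([np.ones(5), np.arange(5.0)])
y = 1.0 + 2.0 * np.arange(5.0)
print(ols(X, y))                      # ~ [1. 2.]
print(lwlr_point(X[2], X, y, k=1.0))  # ~ y[2] = 5.0
print(ridge(X, y, lam=1e-6))          # ~ [1. 2.]

As the closing docstring notes, sweeping lam from e^-10 to e^19 traces the full regularization path, from the unshrunk least-squares fit down to all-zero coefficients.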
