In [1]:
import os
import csv
import re
import pandas as pd
import numpy as np

#将数据分割为训练集和测试集
from sklearn.model_selection import train_test_split

#用于标准化数据
from sklearn.preprocessing import StandardScaler
#一种线性分类技术
from sklearn.linear_model import LogisticRegression
#一种线性分类技术
from sklearn.linear_model import SGDClassifier
#用于分析数据
from sklearn.metrics import classification_report

In [2]:
# 载入数据,分离数据集与标签
red_dataset = pd.read_csv(r'data/winequality-red.csv', sep='/')
red_dataset = np.array(red_dataset['fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";' \
                               '"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol"' \
                               ';"quality"'].str.split(';', expand=True).astype(float))
red_label = red_dataset[:,11]
red_data = red_dataset[:,:11]

In [3]:
from collections import Counter
# 已知共有6类
classes = dict(Counter(red_label))
print(classes)

# 扩展标签
classname = np.unique(red_label)
print(classname)
multi_red_label = np.zeros([len(red_label),len(classname)])
for i in range(len(classname)):
    multi_red_label[np.where(red_label==classname[i]),i] = 1
print(multi_red_label)

{5.0: 681, 6.0: 638, 7.0: 199, 4.0: 53, 8.0: 18, 3.0: 10}
[3. 4. 5. 6. 7. 8.]
[[0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 ...
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]]


In [4]:
# 类平均向量mc
mc = np.zeros([6,len(red_data[0])])
mc_num = np.zeros([6,1])
for i in range(len(red_label)):
    if red_label[i] == 3.0:
        mc[0] += red_data[i]
        mc_num[0] += 1
        continue
    if red_label[i] == 4.0:
        mc[1] += red_data[i]
        mc_num[1] += 1
        continue
    if red_label[i] == 5.0:
        mc[2] += red_data[i]
        mc_num[2] += 1
        continue
    if red_label[i] == 6.0:
        mc[3] += red_data[i]
        mc_num[3] += 1
        continue
    if red_label[i] == 7.0:
        mc[4] += red_data[i]
        mc_num[4] += 1
        continue
    if red_label[i] == 8.0:
        mc[5] += red_data[i]
        mc_num[5] += 1
        continue
for i in range(len(mc_num)):
    mc[i] /= mc_num[i]
print(mc)

[[ 8.36        0.8845      0.171       2.635       0.1225     11.
  24.9         0.997464    3.398       0.57        9.955     ]
 [ 7.77924528  0.69396226  0.17415094  2.69433962  0.09067925 12.26415094
  36.24528302  0.99654245  3.38150943  0.59641509 10.26509434]
 [ 8.16725404  0.57704112  0.24368576  2.52885463  0.09273568 16.98384728
  56.51395007  0.99710363  3.3049486   0.62096916  9.89970631]
 [ 8.34717868  0.49748433  0.27382445  2.47719436  0.08495611 15.71159875
  40.86990596  0.99661506  3.3180721   0.67532915 10.62951933]
 [ 8.87236181  0.4039196   0.37517588  2.72060302  0.07658794 14.04522613
  35.0201005   0.99610427  3.29075377  0.74125628 11.4659129 ]
 [ 8.56666667  0.42333333  0.39111111  2.57777778  0.06844444 13.27777778
  33.44444444  0.99521222  3.26722222  0.76777778 12.09444444]]


In [5]:
# 总平均向量m_red
m_red = np.zeros(len(red_data[0]))
for i in range(len(red_data[0])):
    m_red[i] = red_data[:, i].mean()
print(m_red)

[ 8.31963727  0.52782051  0.27097561  2.5388055   0.08746654 15.87492183
 46.46779237  0.99674668  3.3111132   0.65814884 10.42298311]


In [6]:
# 构建Sw,Sb，ST
Sw = np.zeros([11,11])
for i in range(6):
    for j in range(len(red_label)):
        if red_label[j] == i + 3.0:
            temp = red_data[j] - mc[i]
            Sw += np.outer(temp ,temp.T)
# print(Sw)
print(Sw.shape)
Sb = np.zeros([11,11])
for i in range(6):
    temp = mc[i] - m_red[i]
    Sb += mc_num[i] * np.outer(temp, temp.T)
# print(Sb)
print(Sb.shape)
ST = Sw + Sb

(11, 11)
(11, 11)


In [7]:
# 求解投影方向
w = np.linalg.eig(np.linalg.pinv(Sw)*Sb)[1][0:7]
print(w)

[[-2.74552829e-04  1.28562299e-02 -2.50568540e-01  2.27669489e-02
  -5.99098685e-02 -2.09164760e-01  7.04469466e-02  9.38155507e-01
  -4.54682008e-02 -3.14779259e-02 -3.21851047e-02]
 [-7.92595834e-04  7.36531677e-02  2.38170779e-02  5.35051615e-01
   5.05020254e-01 -1.48959618e-01 -6.54835603e-01  4.06150372e-02
  -3.31841227e-03  1.19866487e-03  4.58008733e-03]
 [-1.09329735e-04  9.85638358e-02 -9.27002507e-03  8.29664621e-01
  -2.35901397e-01  6.63070637e-02  4.87890329e-01 -6.10991220e-02
  -5.71100167e-03 -8.97707898e-04  2.49966023e-03]
 [-3.60816726e-04  1.07744579e-03 -1.49084771e-02 -5.22253623e-03
  -1.81692269e-03 -3.68368652e-02  1.11458633e-02  2.66870830e-02
   3.01697911e-05  1.79201058e-01  9.82567791e-01]
 [-1.85298819e-03 -9.87557587e-01  1.75382971e-02  1.13132944e-01
   9.16431602e-02 -1.73152293e-03  5.41276629e-02  1.69483614e-02
   6.70572301e-04  3.33183511e-04  9.19099031e-04]
 [-1.80956741e-06 -7.99657238e-04  7.14413098e-03 -4.46515340e-03
   9.21745478e-04 -

In [8]:
# LDA降维与可视化
import matplotlib.pyplot as plt
red_data_lda = np.dot(red_data, w.T)
print(red_data_lda)


if len(red_data_lda[0]) == 2:
    color = ['y', 'gold', 'sandybrown', 'peru', 'red', 'darkred']
    alpha = [0.2, 0.2, 0.4, 0.4, 0.6, 1]
    for i, c, a in zip(classname, color, alpha):
        plt.scatter(red_data_lda[np.where(red_label==i), 0], red_data_lda[np.where(red_label==i), 1], c=c, alpha=a)
    plt.show()

[[ 5.96386102e-01 -2.27296926e+01  1.88862526e+01 ...  1.36629974e+00
  -3.26127014e+00  2.05373241e+00]
 [ 6.15743850e-03 -4.60231510e+01  3.65109292e+01 ...  3.03116174e+00
  -3.02789157e+00  2.15961113e+00]
 [ 1.16236250e+00 -3.61923503e+01  2.92452739e+01 ...  2.42956968e+00
  -3.06954047e+00  1.98417461e+00]
 ...
 [-2.82682314e+00 -2.91282138e+01  2.33236443e+01 ...  1.89854428e+00
  -3.32597219e+00  2.24920091e+00]
 [-3.15501505e+00 -3.23496543e+01  2.52361019e+01 ...  1.94242178e+00
  -3.35878482e+00  2.48707035e+00]
 [-4.38700329e-01 -2.81147208e+01  2.46281700e+01 ...  2.37607049e+00
  -3.33330485e+00  1.97809261e+00]]


In [9]:
ss = StandardScaler()
# X_train = ss.fit_transform(X_train)
# X_test = ss.fit_transform(X_test)

In [10]:
lr = LogisticRegression(multi_class='ovr')

# lr.fit(X_train, y_train)
# lr_y_predict = lr.predict(X_test)#用fit之后的结果对X_test进行预测

# print('Accuracy of LR Classifier:', lr.score(X_test, y_test))
# print(classification_report(y_test, lr_y_predict, target_names=['3', '4', '5', '6', '7', '8']))

In [11]:
sgdc = SGDClassifier()

# sgdc.fit(X_train, y_train)
# sgdc_y_predict = sgdc.predict(X_test)

# print('Accuarcy of SGD Classifier:', sgdc.score(X_test, y_test))
# print(classification_report(y_test, sgdc_y_predict, target_names=['3', '4', '5', '6', '7', '8']))

In [12]:
# 降维后

In [13]:
# 5重交叉验证
from sklearn.model_selection import KFold

kf = KFold(n_splits=5)

In [19]:
# 对降维后的数据进行logistic回归分类
# 使用交叉验证，记录每次学习得到的w和b
import time

w_stack = []
acc_stack = []
ford = 1
for X_train_i,X_test_i in kf.split(red_data_lda):
    time_start = time.time()
    w = np.ones([6, len(red_data_lda[0])])
    b = np.ones([6, 1])
    
    X_train = ss.fit_transform(red_data_lda[X_train_i])
    X_test = ss.fit_transform(red_data_lda[X_test_i])
    y_train = red_label[X_train_i]
    y_test = red_label[X_test_i]

    lr.fit(X_train, y_train)
    lr_y_predict = lr.predict(X_test)
    acc = lr.score(X_test, y_test)
    time_end = time.time()
    print('%s  %f s' % ("训练耗时为\n", (time_end - time_start)))
    print("\033[1;31;47m   Ford%d:acc = %f \033[0m" %(ford, acc))
    #print(classification_report(y_test, lr_y_predict, target_names=['3', '4', '5', '6', '7', '8']))
    w_stack.append(lr.coef_)
    acc_stack.append(acc)
    ford += 1



训练耗时为
  0.034994 s
[1;31;47m   Ford1:acc = 0.565625 [0m
训练耗时为
  0.035003 s
[1;31;47m   Ford2:acc = 0.484375 [0m
训练耗时为
  0.036008 s
[1;31;47m   Ford3:acc = 0.565625 [0m
训练耗时为
  0.032009 s
[1;31;47m   Ford4:acc = 0.490625 [0m
训练耗时为
  0.031005 s
[1;31;47m   Ford5:acc = 0.630094 [0m


In [15]:
# 取平均得到最终分类器
# 降维后
from numpy import *
w = np.array(w_stack).mean(0)
predict = np.zeros(len(red_label))

lr_y_predict = lr.predict(ss.fit_transform(red_data_lda))
acc = lr.score(ss.fit_transform(red_data_lda), red_label)
print("降维后总精度 = %f" %acc)

降维后总精度 = 0.584740


In [16]:
# 取平均得到最终分类器
# 降维后
from numpy import *
w = np.array(w_stack).mean(0)
predict = np.zeros(len(red_label))

lr_y_predict = sgdc.predict(ss.fit_transform(red_data_lda))
acc = sgdc.score(ss.fit_transform(red_data_lda), red_label)
print("降维后总精度 = %f" %acc)

NotFittedError: This SGDClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.