In [1]:
# %% 1
# Package imports
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.datasets
import sklearn.linear_model
import matplotlib
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint

In [2]:
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
print(np.shape(categories))

newsgroups_train = fetch_20newsgroups(subset='train',  categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test',  categories=categories)

(4,)


In [3]:
num_train = len(newsgroups_train.data)
num_test  = len(newsgroups_test.data)

vectorizer = TfidfVectorizer(max_features=2048)    #文本特征提取

data = vectorizer.fit_transform( newsgroups_train.data + newsgroups_test.data )
X_train = data[0:num_train, :]
X_test = data[num_train:num_train+num_test,:]

Y_train = newsgroups_train.target
Y_test = newsgroups_test.target

print(X_train)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

  (0, 742)	0.04584677767961114
  (0, 601)	0.22178541984084713
  (0, 88)	0.1679900779387685
  (0, 1879)	0.15744988844762187
  (0, 1746)	0.02292338883980557
  (0, 1986)	0.06059476500051193
  (0, 537)	0.0679123091588565
  (0, 135)	0.08850994143992637
  (0, 1801)	0.19448797059837275
  (0, 1563)	0.20875340280264512
  (0, 779)	0.06839902359627349
  (0, 1035)	0.022943695992032588
  (0, 29)	0.0855003973638563
  (0, 839)	0.09392989548470095
  (0, 1927)	0.06579763264400111
  (0, 1800)	0.05910830179880021
  (0, 877)	0.07463940290723516
  (0, 2039)	0.10261940600890049
  (0, 1276)	0.05406144464962305
  (0, 1581)	0.20875340280264512
  (0, 1160)	0.10128618793379258
  (0, 2004)	0.037858496447795785
  (0, 2041)	0.050240098331598336
  (0, 1094)	0.1230251272349178
  (0, 1838)	0.10088053507516663
  :	:
  (2033, 640)	0.07289629705757049
  (2033, 1608)	0.07441608911470858
  (2033, 704)	0.09115293495944315
  (2033, 1467)	0.103291655405789
  (2033, 1904)	0.05678318136629984
  (2033, 1216)	0.11551544351314319


z1 = X * W1 + b1  
a1 = ReLU(z1)  
z2 = a1 * W2 + b2  
a2 = tanh(z2)  
z3 = a2 * W3 + b3  
a3 = sigmoid(z3)  
z4 = a3 * W4 + b4  
a4 = $\hat{y}$ = softmax(z4)  
![avater](./loss.png)

In [4]:
# Helper function to evaluate the total loss on the dataset
def calculate_loss(model, X, y):
    W1, b1, W2, b2, W3, b3, W4, b4 = model['W1'], model['b1'], model['W2'], model['b2'], model['W3'], model['b3'], model['W4'], model['b4']
    #正向传播，计算预测值
    z1 = X.dot(W1) + b1
    a1 = np.maximum(0, z1)
    z2 = a1.dot(W2) + b2
    a2 = np.tanh(z2)
    z3 = a2.dot(W3) + b3
    a3 = 1./(1 + np.exp(-z3))
    z4 = a3.dot(W4) + b4
    exp_scores = np.exp(z4)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    # 计算损失
    corect_logprobs = -np.log(probs[range(num_examples), y])
    data_loss = np.sum(corect_logprobs)
    #在损失上加上正则项（可选）
    data_loss += reg_lambda/2 * (np.sum(np.square(W1)) + np.sum(np.square(W2)) + np.sum(np.square(W3)) + np.sum(np.square(W4)))
    return 1./num_examples * data_loss

In [5]:
# Helper function to predict an output (0 or 1)
def predict(model, X):
    W1, b1, W2, b2, W3, b3, W4, b4 = model['W1'], model['b1'], model['W2'], model['b2'], model['W3'], model['b3'], model['W4'], model['b4']
    #正向传播，计算预测值
    z1 = X.dot(W1) + b1
    a1 = np.maximum(0, z1)
    z2 = a1.dot(W2) + b2
    a2 = np.tanh(z2)
    z3 = a2.dot(W3) + b3
    a3 = 1./(1 + np.exp(-z3))
    z4 = a3.dot(W4) + b4
    exp_scores = np.exp(z4)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    return np.argmax(probs, axis=1)

In [6]:
def dropout(X, level):
    if level < 0 or level >= 1:
        print("Dropout level must be in interval [0,1)")
    retain_prob = 1 - level
    sample = np.random.binomial(1, retain_prob, np.shape(X))  #生成一个0，1分布的向量
    #print('X_shape',np.shape(X), type(X))
    #print('sample',np.shape(sample), type(sample))
    X = X * sample
    X /= retain_prob
    return X

In [7]:
# This function learns parameters for the neural network and returns the model.
# - nn_hdim: Number of nodes in the hidden layer
# - num_passes: Number of passes through the training data for gradient descent
# - print_loss: If True, print the loss every 1000 iterations
def build_model(X, y, nn_hdim, epsilon, reg_lambda, num_passes=20000,  print_loss=False):
# 用随机值初始化参数。我们需要学习这些参数
    np.random.seed(0)
    W1 = np.random.randn(input_dim, nn_hdim[0]) / np.sqrt(input_dim)
    b1 = np.zeros((1, nn_hdim[0]))
    W2 = np.random.randn(nn_hdim[0],nn_hdim[1]) / np.sqrt(nn_hdim[0])
    b2 = np.zeros((1, nn_hdim[1]))
    W3 = np.random.randn(nn_hdim[1],nn_hdim[2]) / np.sqrt(nn_hdim[1])
    b3 = np.zeros((1, nn_hdim[2]))
    W4 = np.random.randn(nn_hdim[2], np.shape(categories)[0]) / np.sqrt(nn_hdim[2])
    b4 = np.zeros((1, np.shape(categories)[0]))

    # 这是我们最终要返回的数据
    model = {}

    # 梯度下降
    for i in range(0, num_passes):
        #print(i)
        #正向传播，计算预测值
        #Y = dropout(X, 0.2)
        z1 = X.dot(W1) + b1
        a1 = np.maximum(0, z1)
        a1 = dropout(a1, 0.01)
        z2 = a1.dot(W2) + b2
        a2 = np.tanh(z2)
        a2 = dropout(a2, 0)
        z3 = a2.dot(W3) + b3
        a3 = 1./(1 + np.exp(-z3))
        a3 = dropout(a3, 0)
        z4 = a3.dot(W4) + b4
        exp_scores = np.exp(z4)
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

        # 反向传播
        delta4 = probs
        delta4[range(num_examples), y] -= 1
        dW4 = (a3.T).dot(delta4)
        db4 = np.sum(delta4, axis=0, keepdims=True)
        delta3 = delta4.dot(W4.T) * a3 * (1 - a3)
        dW3 = (a2.T).dot(delta3)
        db3 = np.sum(delta3, axis=0, keepdims=True)
        delta2 = delta3.dot(W3.T) * (1 - np.power(a2, 2))
        dW2 = (a1.T).dot(delta2)
        db2 = np.sum(delta2, axis=0)
        delta1 = delta2.dot(W2.T)
        delta1[z1 <= 0] = 0
        dW1 = (X.T).dot(delta1)
        db1 = np.sum(delta1, axis=0)
        


        # 添加正则项 (b1 和 b2 没有正则项)
        dW4 += reg_lambda * W4
        dW3 += reg_lambda * W3
        dW2 += reg_lambda * W2
        dW1 += reg_lambda * W1
        
        #print(dW1)
        #print(dW2)
        #print(dW3)
        #print(dW4)

        # 梯度下降更新参数
        W1 += -epsilon * dW1
        b1 += -epsilon * db1
        W2 += -epsilon * dW2
        b2 += -epsilon * db2
        W3 += -epsilon * dW3
        b3 += -epsilon * db3
        W4 += -epsilon * dW4
        b4 += -epsilon * db4
        
        
        # 为模型分配新的参数
        model = { 'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2, 'W3': W3, 'b3': b3, 'W4': W4, 'b4': b4 }
        #print(model)
        #yp = predict(model, X_test)
        #print(yp)
        #print('Accuracy %f'%(np.mean(yp==Y_test)))

        # 选择性地打印损失
        # 这种做法很奢侈，因为我们用的是整个数据集，所以我们不想太频繁地这样做
        if print_loss and i % 1000 == 0:
            print ("Loss after iteration %i: %f" %(i, calculate_loss(model,X,y)))
            epsilon = epsilon * 0.95


    return model

In [8]:
# Build a model with a 3-dimensional hidden layer

num_examples, input_dim = X_train.shape
#print(input_dim)
epsilon = 0.0001
reg_lambda = 0.01
epochs = 100000
nn_hdim = [4,16,8]

model = build_model(X_train, Y_train, nn_hdim, epsilon, reg_lambda, epochs, print_loss=True)

Loss after iteration 0: 1.387040
Loss after iteration 1000: 0.624583
Loss after iteration 2000: 0.146109
Loss after iteration 3000: 0.023992
Loss after iteration 4000: 0.014799
Loss after iteration 5000: 0.011928
Loss after iteration 6000: 0.010175
Loss after iteration 7000: 0.008622
Loss after iteration 8000: 0.007645
Loss after iteration 9000: 0.007020
Loss after iteration 10000: 0.006623
Loss after iteration 11000: 0.006395
Loss after iteration 12000: 0.006305
Loss after iteration 13000: 0.006125
Loss after iteration 14000: 0.005854
Loss after iteration 15000: 0.005835
Loss after iteration 16000: 0.005720
Loss after iteration 17000: 0.005735
Loss after iteration 18000: 0.005651
Loss after iteration 19000: 0.005559
Loss after iteration 20000: 0.005425
Loss after iteration 21000: 0.005493
Loss after iteration 22000: 0.005382
Loss after iteration 23000: 0.005385
Loss after iteration 24000: 0.005331
Loss after iteration 25000: 0.005234
Loss after iteration 26000: 0.005127
Loss after ite

In [9]:
n_correct = 0
n_test = X_test.shape[0]
for n in range(n_test):
    x = X_test[n,:]
    yp = predict(model, x)
    if yp == Y_test[n]:
        n_correct += 1.0

print('Accuracy %f = %d / %d'%(n_correct/n_test, int(n_correct), n_test) )

Accuracy 0.825573 = 1117 / 1353
