# Boston Housing Problem 

In [182]:
# 导入所需要的包
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,accuracy_score
from sklearn.naive_bayes import MultinomialNB
import sklearn
import pandas as pd
import warnings
import numpy as np
warnings.simplefilter('ignore')

In [153]:
# 导入数据
data = pd.read_csv('./Boston.csv')
data

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATI,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


Method 1. Linear regression (20 points) 

任务定义：利用线性回归模型来根据特征对波士顿房价回归  
输入：波士顿房子的特征    
输出：波士顿房子房价    
性能评价：均方差损失  
环境：python3.5  

In [47]:
# 切分X和y
y = data['MEDV'] 
X = data.ix[:,:-1]
print(y)
X

0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
501    22.4
502    20.6
503    23.9
504    22.0
505    11.9
Name: MEDV, Length: 506, dtype: float64


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATI,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48


In [48]:
# 转为numpy数组
X = np.array(X)
y = np.array(y)
print(X.shape)
print(y.shape)

(506, 13)
(506,)


In [69]:
# 定义LR_learn函数
def LR_learn(feature,target):
    # 线性回归模型
    model = LinearRegression()
    # 训练
    model.fit(feature,target)
    # 返回权重w和偏置i
    return model.coef_,model.intercept_

In [70]:
# demo
w,i = LR_learn(X,y)
w

array([-1.08011358e-01,  4.64204584e-02,  2.05586264e-02,  2.68673382e+00,
       -1.77666112e+01,  3.80986521e+00,  6.92224640e-04, -1.47556685e+00,
        3.06049479e-01, -1.23345939e-02, -9.52747232e-01,  9.31168327e-03,
       -5.24758378e-01])

In [72]:
# 预测函数
def LRs_predict(X,w,i):
    # 线性回归模型
    model = LinearRegression()
    # 输入权重和偏置
    model.coef_ = w
    model.intercept_ = i
    # 预测
    y = model.predict(X)
    # 返回预测值
    return y

In [120]:
# demo
y = LRs_predict(X,w,i)
y[:20]

array([30.00384338, 25.02556238, 30.56759672, 28.60703649, 27.94352423,
       25.25628446, 23.00180827, 19.53598843, 11.52363685, 18.92026211,
       18.99949651, 21.58679568, 20.90652153, 19.55290281, 19.28348205,
       19.29748321, 20.52750979, 16.91140135, 16.17801106, 18.40613603])

In [121]:
# 划分数据集函数
def train_test_split(X,y,ratio):
    X_train,X_test,y_train,y_test = sklearn.model_selection.train_test_split(X,y,test_size =ratio,random_state=0)
    return X_train,X_test,y_train,y_test

In [125]:
# demo
X_train,X_test,y_train,y_test = train_test_split(X,y,ratio = 0.3)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(354, 13)
(152, 13)
(354,)
(152,)


In [137]:
# test
w,i = LR_learn(X_train,y_train)
print("w:",end='')
print(w)
print("i:",end='')
print(i)
y_pred_train = LRs_predict(X_train,w,i)
y_pred_test = LRs_predict(X_test,w,i)
print("MSE_trian:",end='')
print(mean_squared_error(y_pred_train,y_train))
print("MSE_test:",end='')
print(mean_squared_error(y_pred_test,y_test))

w:[-1.08011358e-01  4.64204584e-02  2.05586264e-02  2.68673382e+00
 -1.77666112e+01  3.80986521e+00  6.92224640e-04 -1.47556685e+00
  3.06049479e-01 -1.23345939e-02 -9.52747232e-01  9.31168327e-03
 -5.24758378e-01]
i:36.459488385091305
MSE_trian:8.301127149440674e-26
MSE_test:8.572958343187849e-26


Test的均方差大一些

Method 2. Bayes classifier (20 points) 

任务定义：利用朴素贝叶斯模型来根据特征对波士顿房价分类  
输入：波士顿房子的特征    
输出：波士顿房子房价的类别    
性能评价：准确率  
环境：python3.5  

In [154]:
y = data['MEDV']
y.describe()

count    506.000000
mean      22.532806
std        9.197104
min        5.000000
25%       17.025000
50%       21.200000
75%       25.000000
max       50.000000
Name: MEDV, dtype: float64

分为三个等级，断点为17和25

In [155]:
for i in range(len(y)):
    if y[i]<=17:
        y[i] = 0
    elif y[i]<=25:
        y[i] = 1
    else:
        y[i] = 2
y

0      1.0
1      1.0
2      2.0
3      2.0
4      2.0
      ... 
501    1.0
502    1.0
503    1.0
504    1.0
505    0.0
Name: MEDV, Length: 506, dtype: float64

In [160]:
# 数据处理
X = np.array(data.ix[:,:-1])
y = np.array(y)
print(y.shape)
print(X.shape)

(506,)
(506, 13)


In [170]:
# 贝叶斯学习函数
def Bys_learn(X,y):
    model = MultinomialNB()
    model.fit(X,y)
    return model

In [171]:
# demo
bys = Bys_learn(X,y)
bys

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [172]:
# 贝叶斯预测函数
def Bys_predict(model,X):
    y = model.predict(X)
    return y

In [174]:
# 预测的结果
y = Bys_predict(bys,X)
y[:20]

array([2., 1., 1., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1.])

In [175]:
# 划分数据集
X_train,X_test,y_train,y_test = train_test_split(X,y,ratio = 0.3)

In [177]:
# 训练
bys = Bys_learn(X_train,y_train)

In [187]:
# 测试
y_pred_test = Bys_predict(bys,X_test)
print(y_pred_test[:20])
print('Accuracy:',end = '')
print(accuracy_score(y_pred_test,y_test))

[1. 0. 1. 0. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 2. 0. 2. 1. 1.]
Accuracy:0.9605263157894737


Method 3: SVM for classification or regression (30 points) 

In [188]:
# 导入所需要的包
from sklearn.svm import SVC,SVR

In [206]:
# 分类和回归
def regression_train(X,y):
    model = SVR()
    model.fit(X,y)
    return model

def classification_train(X,y):
    model = SVC()
    model.fit(X,y)
    return model

def regression_pred(model,X):
    y = model.predict(X)
    return y

def classification_pred(model,X):
    y = model.predict(X)
    return y

    

In [207]:
# demo
model1 = classification_train(X,y)
classification_pred(model1,X)[:20]

array([2., 1., 1., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1.])

In [212]:
# demo
y = data['MEDV']
model2 = regression_train(X,y)
classification_pred(model2,X)[:20]

array([1.01473126, 1.01473125, 1.8997983 , 1.89984847, 1.89984312,
       1.90014316, 1.01473126, 1.86072433, 0.09998665, 1.01459538,
       0.16873318, 1.01473119, 1.01473126, 1.01473161, 1.0100143 ,
       1.01473126, 1.01474099, 1.01590253, 1.01473126, 1.01668066])

# Task 2: Handwritten Digits Recognition 

Method 2: Convolutional Neural Network (30 points)

任务定义：利用卷积神经网络识别手写数字  
输入：手写数字图片  
输出：识别结果  
性能评价：准确率  
环境：python3.5  

In [27]:
# 导入所需要的包
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable

In [28]:
# 网络
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # 输入1通道，输出10通道，kernel 5*5
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.mp = nn.MaxPool2d(2)
        # fully connect
        self.fc = nn.Linear(320, 10)

    def forward(self, x):
        # in_size = 64
        in_size = x.size(0) # one batch
        # x: 64*10*12*12
        x = F.relu(self.mp(self.conv1(x)))
        # x: 64*20*4*4
        x = F.relu(self.mp(self.conv2(x)))
        # x: 64*320
        x = x.view(in_size, -1) # flatten the tensor
        # x: 64*10
        x = self.fc(x)
        return F.log_softmax(x)

In [35]:
# 下载训练数据
train_dataset = datasets.MNIST(root='./MNIST_data/',
                               train=True,
                               transform=transforms.ToTensor(),
                               download=True)
# 下载测试数据
test_dataset = datasets.MNIST(root='./MNIST_data/',
                              train=False,
                              transform=transforms.ToTensor())

# Data_loader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=64,
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=64,
                                          shuffle=False)

In [36]:
# 训练函数
def train(epoch):
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = Variable(data), Variable(target)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 200 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

In [37]:
# 计算损失、测试
def loss():
    test_loss = 0
    correct = 0
    for data, target in test_loader:
        data, target = Variable(data, volatile=True), Variable(target)
        output = model(data)
        # sum up batch loss
        test_loss += F.nll_loss(output, target, size_average=False).item()
        # get the index of the max log-probability
        pred = output.data.max(1, keepdim=True)[1]
        correct += pred.eq(target.data.view_as(pred)).cpu().sum()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [38]:
# 开始训练
for epoch in range(1, 4):
    train(epoch)
    loss()



Test set: Average loss: 2.2703, Accuracy: 9490/10000 (94%)


Test set: Average loss: 2.1848, Accuracy: 9658/10000 (96%)


Test set: Average loss: 2.1426, Accuracy: 9723/10000 (97%)

