# 逻辑回归

虽然是逻辑回归，但是Cost Function的定义使用的差值平方，不严谨

In [43]:
import numpy as np
import pandas as pd
import matplotlib as plt

# 读取数据

In [57]:
df = pd.read_csv('data/binary.csv')
df.head(5)

Unnamed: 0,admit,gre,gpa,rank
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


#  数据清理

In [58]:
# Make dummy variables for rank
df = pd.concat([df, pd.get_dummies(df['rank'], prefix='rank')], axis=1)
df = df.drop('rank', axis=1)
df.head()

Unnamed: 0,admit,gre,gpa,rank_1,rank_2,rank_3,rank_4
0,0,380,3.61,0,0,1,0
1,1,660,3.67,0,0,1,0
2,1,800,4.0,1,0,0,0
3,1,640,3.19,0,0,0,1
4,0,520,2.93,0,0,0,1


In [59]:
# Standarize featurs
for field in ['gre', 'gpa']:
    mean, std = df[field].mean(), df[field].std()
    df.loc[:,field] = (df[field]-mean)/std
    
df.head()

Unnamed: 0,admit,gre,gpa,rank_1,rank_2,rank_3,rank_4
0,0,-1.798011,0.578348,0,0,1,0
1,1,0.625884,0.736008,0,0,1,0
2,1,1.837832,1.603135,1,0,0,0
3,1,0.452749,-0.525269,0,0,0,1
4,0,-0.586063,-1.208461,0,0,0,1


In [60]:
# Cross sets
from sklearn.model_selection import train_test_split

X = df[['gre', 'gpa', 'rank_1', 'rank_2', 'rank_3', 'rank_4']]
y = df['admit']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [61]:
X_train.head()

Unnamed: 0,gre,gpa,rank_1,rank_2,rank_3,rank_4
92,1.837832,1.340369,0,1,0,0
223,1.837832,0.210476,0,0,1,0
234,1.837832,0.368135,1,0,0,0
232,-1.798011,-0.026014,0,1,0,0
377,1.837832,1.603135,0,1,0,0


In [62]:
y_train.head()

92     0
223    0
234    1
232    0
377    1
Name: admit, dtype: int64

In [63]:
zip(X_train.values, y_train)[0:3]

[(array([ 1.83783211,  1.34036925,  0.        ,  1.        ,  0.        ,  0.        ]),
  0),
 (array([ 1.83783211,  0.21047555,  0.        ,  0.        ,  1.        ,  0.        ]),
  0),
 (array([ 1.83783211,  0.36813514,  1.        ,  0.        ,  0.        ,  0.        ]),
  1)]

# 训练

In [90]:
# 定义训练集数量和features数
n_records,n_features = X.shape

# 初始化weights
weights = np.random.normal(scale = 1 / n_featurs**.5, size = n_features)

# 定义迭代次数
epochs = 1000

# 定义learning_rate
learning_rate = 0.5

# 开始训练
for e in range(epochs):
    delta_w = np.zeros(weights.shape)
    # 求解weights处的Gradient
    for x, y in zip(X_train.values, y_train):
        output = sigmoid(np.dot(x, weights))
        
        error = y - output
        
        delta_w += - error * output * (1 - output) * x / n_records
    # 第一次迭代后的weights
    weights = weights - delta_w * learning_rate
    
weights

array([ 0.1226679 ,  0.33046276,  0.12125896, -0.57079512, -1.29866091,
       -1.4011461 ])

In [91]:
# Calculate accuracy on test data
tes_out = sigmoid(np.dot(X_test, weights))
predictions = tes_out > 0.5
accuracy = np.mean(predictions == y_test)
print("Prediction accuracy: {:.3f}".format(accuracy))

Prediction accuracy: 0.733


# 训练

In [79]:
# 初始化参数
n_records,n_features = X_train.shape

print '训练集数量: %d' % n_records
print 'Feature数量: %d' % n_features

训练集数量: 280
Feature数量: 6


In [80]:
# 初始化权重
weights = np.random.normal(scale = 1 / n_featurs**.5, size = n_features)
weights

array([-0.80003198, -0.54222968,  0.08036826,  0.30147772,  0.06996081,
       -0.04721321])

In [81]:
# 迭代次数 和 learning_rate
epochs = 1000    
learnrate = 0.5
last_loss = None

In [82]:
for e in range(epochs):
    del_w = np.zeros(weights.shape)
    for x, y in zip(X_train.values, y_train):
        # Calculate the output
        output = sigmoid(np.dot(x, weights))
        
        # Calculate the error
        error = y - output
        
        # Calculate change in weights
        del_w += error * output * (1 - output) * x
        
    weights += learnrate * del_w / n_records

    # Printing out the mean square error on the training set
    if e % (epochs / 10) == 0:
        out = sigmoid(np.dot(x, weights))
        loss = np.mean((out - y) ** 2)
        if last_loss and last_loss < loss:
            print("Train loss: ", loss, "  WARNING - Loss Increasing")
        else:
            print("Train loss: ", loss)
        last_loss = loss

('Train loss: ', 0.11201157735795815)
('Train loss: ', 0.10185764115944329)
('Train loss: ', 0.081880477817588421)
('Train loss: ', 0.071666724116591946)
('Train loss: ', 0.065759023120983431)
('Train loss: ', 0.062083588853207465)
('Train loss: ', 0.059689488722726329)
('Train loss: ', 0.058080848246216836)
('Train loss: ', 0.056975548896788512)


# Predict

In [83]:
# Calculate accuracy on test data
tes_out = sigmoid(np.dot(X_test, weights))
predictions = tes_out > 0.5
accuracy = np.mean(predictions == y_test)
print("Prediction accuracy: {:.3f}".format(accuracy))

Prediction accuracy: 0.717
