一个简单的例子，线性模型$y = \alpha  + \beta x$的拟合

# 生成数据

In [1]:
#导入模块
import numpy as np

In [2]:
# Draw 30 samples uniformly from the half-open interval [0.0, 1.0).
x = np.random.random(size=30)
x

array([0.95368104, 0.80492002, 0.10423788, 0.3938102 , 0.42479412,
       0.73810132, 0.26135347, 0.08222948, 0.46155981, 0.24986903,
       0.12220917, 0.45760443, 0.40322561, 0.64141753, 0.19806553,
       0.9990956 , 0.4980831 , 0.50046251, 0.37857936, 0.36361657,
       0.60374452, 0.83847713, 0.93301178, 0.46894614, 0.76178984,
       0.46962464, 0.08080696, 0.0086308 , 0.16600824, 0.26130495])

In [3]:
# Print the docstring of np.random.random to check which interval it samples from.
help(np.random.random)

Help on built-in function random:

random(...) method of numpy.random.mtrand.RandomState instance
    random(size=None)
    
    Return random floats in the half-open interval [0.0, 1.0). Alias for
    `random_sample` to ease forward-porting to the new random API.



In [4]:
# Multiply every sample by 10.
# Note: x is rebound from its own value, so re-running this cell rescales it again.
x = x * 10
x

array([9.53681039, 8.0492002 , 1.0423788 , 3.93810196, 4.24794123,
       7.38101321, 2.61353469, 0.82229476, 4.61559806, 2.49869033,
       1.22209173, 4.57604431, 4.03225615, 6.41417535, 1.9806553 ,
       9.99095604, 4.98083099, 5.00462505, 3.7857936 , 3.63616574,
       6.03744522, 8.38477128, 9.33011775, 4.6894614 , 7.61789839,
       4.69624636, 0.80806964, 0.08630799, 1.66008238, 2.6130495 ])

In [5]:
# Noise-free response on a straight line: slope 20, intercept 3.
y = 3 + 20 * x
y

array([193.73620788, 163.98400402,  23.84757601,  81.76203925,
        87.95882455, 150.62026412,  55.2706938 ,  19.44589521,
        95.31196129,  52.97380668,  27.44183468,  94.52088622,
        83.64512295, 131.28350691,  42.61310607, 202.81912088,
       102.61661972, 103.09250102,  78.71587192,  75.72331487,
       123.74890441, 170.69542554, 189.60235502,  96.78922792,
       155.35796788,  96.92492723,  19.16139279,   4.72615978,
        36.20164757,  55.26098998])

增加扰动项：$y = \alpha + \beta x + \varepsilon$（此处 $\alpha = 3$，$\beta = 20$，$\varepsilon$ 服从标准正态分布）

In [6]:
# Linear signal 20*x + 3 plus 30 draws from np.random.randn as the disturbance term.
y = np.random.randn(30) + (20 * x + 3)
y

array([192.61459537, 165.182298  ,  22.65364084,  81.65593491,
        86.94623797, 151.20003776,  54.37607197,  20.52628241,
        94.2207789 ,  53.42328797,  28.91555416,  94.66706783,
        84.21571599, 131.53771554,  43.05561209, 203.5880042 ,
       103.29440033, 103.54614998,  79.27187136,  75.22566337,
       123.4454525 , 170.05482079, 189.42339977,  96.11230857,
       155.7569352 ,  95.31050005,  17.85389811,   4.6542803 ,
        36.88362012,  55.29158766])

# 绘制散点图 

In [None]:
from plotnine import *
import pandas as pd
%matplotlib inline

In [None]:
# Collect predictor and response into a single two-column frame for plotting.
x_y = pd.DataFrame(data={'x': x, 'y': y})
# Preview 10 randomly chosen rows.
x_y.sample(n=10)

In [None]:
# Display per-column summary statistics of the simulated data.
x_y.describe()

In [None]:
# Line + point plot of y against x; colour is mapped to the expression x+y
# and the legend is hidden.
(
    ggplot(data=x_y, mapping=aes(x='x', y='y', colour='x+y'))
    + geom_line()
    + geom_point(fill='white', size=2.5)
    + theme(legend_position='none')
)

# 模型拟合

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least-squares estimator; the intercept is fitted (the default, spelled out here).
reg = LinearRegression(fit_intercept=True)

In [None]:
# Turn x into a single-column 2-D array (one row per sample).
# Note: x is rebound here, so every later cell sees the 2-D version.
x = np.reshape(x, (-1, 1))

In [None]:
# Estimate slope and intercept from the simulated data.
reg.fit(X=x, y=y)

In [None]:
# Report the fitted line; "{:.3}" keeps three significant digits per coefficient.
print(
    "Fitted lm model is y={:.3}x+{:.3}".format(
        reg.coef_[0],
        reg.intercept_,
    )
)

# 拟合效果

In [None]:
# In-sample predictions from the fitted model.
yhat = reg.predict(X=x)
yhat

In [None]:
# Put observed and fitted values side by side; x is flattened back to 1-D
# so it can serve as a regular column.
x_y_yhat = pd.DataFrame(
    data={
        'x': x.flatten(),
        'y': y,
        'yhat': yhat,
    }
)

In [None]:
# Reshape to long format: one row per (x, series) pair, with the series name
# ('y' or 'yhat') in 'y_type' and its value in 'value'.
x_y_yhat_melt = x_y_yhat.melt(
    id_vars='x',
    var_name='y_type',
    value_name='value',
)
# Preview 10 randomly chosen rows.
x_y_yhat_melt.sample(n=10)

In [None]:
# Observed vs. fitted values on the same axes, coloured by series.
(
    ggplot(data=x_y_yhat_melt, mapping=aes(x='x', y='value', colour='y_type'))
    + geom_point(alpha=0.25)
)

In [None]:
# Scatter of the data with plotnine's own linear-model smoother drawn in red.
(
    ggplot(data=x_y, mapping=aes(x='x', y='y'))
    + geom_point()
    + geom_smooth(method='lm', color='red')
)

In [None]:
from statsmodels.api import OLS

In [None]:
# statsmodels' OLS takes (endog, exog): the response y goes first and the
# design matrix second. It does not add an intercept on its own, so a
# constant column is prepended to x to estimate the intercept as well.
from statsmodels.api import add_constant

lm2 = OLS(y, add_constant(x)).fit()

In [None]:
# Display the summary of the fitted statsmodels result.
lm2.summary()