In [1]:
import pandas as pd
import numpy as np

In [2]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [3]:
def dnorm(mean, variance, size=1):
    if isinstance(size, int):
        size = size,
    return mean + np.sqrt(variance) * np.random.randn(*size)

np.random.randn(12345)

N = 100
X = np.c_[dnorm(0, 0.4, size=N),
         dnorm(0, 0.6, size=N),
         dnorm(0, 0.2, size=N)]

eps = dnorm(0, 0.1, size=N)
beta = [0.1, 0.3, 0.5]

y = np.dot(X, beta) + eps

In [4]:
X[:5]

array([[-0.14547023,  0.51892205, -0.67074481],
       [-0.43853526, -1.11282469, -0.38235063],
       [ 0.05065718, -0.06252877, -0.24393794],
       [-0.32825984,  0.72558718, -0.05962621],
       [-0.41319638, -0.27584646, -0.35416597]])

In [5]:
y[:5]

array([-0.08472054, -0.23118548, -0.37118457,  0.29148103, -0.1662778 ])

In [7]:
X_model = sm.add_constant(X)
X_model[:5]

array([[ 1.        , -0.14547023,  0.51892205, -0.67074481],
       [ 1.        , -0.43853526, -1.11282469, -0.38235063],
       [ 1.        ,  0.05065718, -0.06252877, -0.24393794],
       [ 1.        , -0.32825984,  0.72558718, -0.05962621],
       [ 1.        , -0.41319638, -0.27584646, -0.35416597]])

In [9]:
model = sm.OLS(y, X)
result = model.fit()
result.params

array([0.01938788, 0.28443885, 0.47460399])

In [10]:
result.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.484
Model:,OLS,Adj. R-squared (uncentered):,0.468
Method:,Least Squares,F-statistic:,30.33
Date:,"Sat, 19 Sep 2020",Prob (F-statistic):,6.44e-14
Time:,10:05:58,Log-Likelihood:,-19.429
No. Observations:,100,AIC:,44.86
Df Residuals:,97,BIC:,52.67
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.0194,0.052,0.375,0.709,-0.083,0.122
x2,0.2844,0.040,7.045,0.000,0.204,0.365
x3,0.4746,0.065,7.257,0.000,0.345,0.604

0,1,2,3
Omnibus:,3.035,Durbin-Watson:,1.657
Prob(Omnibus):,0.219,Jarque-Bera (JB):,2.091
Skew:,-0.161,Prob(JB):,0.352
Kurtosis:,2.369,Cond. No.,1.69


In [11]:
data = pd.DataFrame(X, columns=['col0', 'col1', 'col2'])
data['y'] = y
data[:5]

Unnamed: 0,col0,col1,col2,y
0,-0.14547,0.518922,-0.670745,-0.084721
1,-0.438535,-1.112825,-0.382351,-0.231185
2,0.050657,-0.062529,-0.243938,-0.371185
3,-0.32826,0.725587,-0.059626,0.291481
4,-0.413196,-0.275846,-0.354166,-0.166278


In [12]:
results = smf.ols('y ~ col0 + col1 + col2', data=data).fit()
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.478
Model:,OLS,Adj. R-squared:,0.461
Method:,Least Squares,F-statistic:,29.27
Date:,"Sat, 19 Sep 2020",Prob (F-statistic):,1.59e-13
Time:,10:10:28,Log-Likelihood:,-18.736
No. Observations:,100,AIC:,45.47
Df Residuals:,96,BIC:,55.89
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0348,0.030,-1.157,0.250,-0.094,0.025
col0,0.0144,0.052,0.277,0.782,-0.089,0.117
col1,0.2807,0.040,6.942,0.000,0.200,0.361
col2,0.4672,0.066,7.121,0.000,0.337,0.597

0,1,2,3
Omnibus:,2.787,Durbin-Watson:,1.674
Prob(Omnibus):,0.248,Jarque-Bera (JB):,2.007
Skew:,-0.166,Prob(JB):,0.367
Kurtosis:,2.391,Cond. No.,2.25


In [13]:
init_x = 4

import random
values = [init_x, init_x]
N = 1000

b0 = 0.8
b1 = -0.4
noise = dnorm(0, 0.1, N)
for i in range(N):
    new_x = values[-1] * b0 + values[-2] * b1 + noise[i]
    values.append(new_x)

In [14]:
MAXLAGS = 5
model = sm.tsa.AR(values)
results = model.fit(MAXLAGS)
results.summary()

statsmodels.tsa.AR has been deprecated in favor of statsmodels.tsa.AutoReg and
statsmodels.tsa.SARIMAX.

AutoReg adds the ability to specify exogenous variables, include time trends,
and add seasonal dummies. The AutoReg API differs from AR since the model is
treated as immutable, and so the entire specification including the lag
length must be specified when creating the model. This change is too
substantial to incorporate into the existing AR api. The function
ar_select_order performs lag length selection for AutoReg models.

AutoReg only estimates parameters using conditional MLE (OLS). Use SARIMAX to
estimate ARX and related models using full MLE via the Kalman Filter.





0,1,2,3
Dep. Variable:,y,No. Observations:,1002.0
Model:,AR(5),Log Likelihood,-263.732
Method:,cmle,S.D. of innovations,0.315
Date:,"Sat, 19 Sep 2020",AIC,-2.295
Time:,10:20:49,BIC,-2.26
Sample:,0,HQIC,-2.282
,,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0090,0.010,0.896,0.370,-0.011,0.029
L1.y,0.7105,0.032,22.403,0.000,0.648,0.773
L2.y,-0.2828,0.039,-7.262,0.000,-0.359,-0.206
L3.y,-0.0716,0.040,-1.794,0.073,-0.150,0.007
L4.y,0.0019,0.039,0.050,0.960,-0.074,0.077
L5.y,0.0002,0.030,0.006,0.995,-0.059,0.059

0,1,2,3,4
,Real,Imaginary,Modulus,Frequency
AR.1,1.0205,-1.1437j,1.5328,-0.1341
AR.2,1.0205,+1.1437j,1.5328,0.1341
AR.3,-5.7099,-0.0000j,5.7099,-0.5000
AR.4,16.9459,-0.0000j,16.9459,-0.0000
AR.5,-23.8031,-0.0000j,23.8031,-0.5000
