## Листок 7: Прогнозирование

### Ссылка на листки, ноутбуки и данные

https://github.com/artamonoff/Econometrica

## Ссылки на документацию

https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.OLSResults.html#statsmodels.regression.linear_model.OLSResults

https://tedboy.github.io/statsmodels_doc/index.html

In [39]:
import pandas as pd
import numpy as np
import io
# import statsmodels.api as sm  (left disabled; the formula API below is what the cells use)
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_params # reporting of estimation/test results (not called in this section)
from statsmodels.iolib.summary2 import summary_col # reporting of estimation/test results as a coefficient table

## 1 sleep equation #1

In [21]:
# Read the sleep75 dataset straight from the course repository
# and show its dimensions as (rows, columns).
data1 = pd.read_csv(
    'https://raw.githubusercontent.com/artamonoff/Econometrica/'
    'master/python-notebooks/data-csv/sleep75.csv'
)
data1.shape

(706, 34)

In [22]:
# Model 1
# The specification is given as a formula (dependent variable on the left of '~',
# regressors on the right); the model is then fitted by OLS on data1.
Sleep_eq1 = smf.ols(
    formula='sleep~totwrk+age+south+male+smsa+yngkid+marr',
    data=data1,
).fit()

In [23]:
# Full OLS report for model 1: fit statistics, coefficient table and residual diagnostics
Sleep_eq1.summary()

0,1,2,3
Dep. Variable:,sleep,R-squared:,0.131
Model:,OLS,Adj. R-squared:,0.123
Method:,Least Squares,F-statistic:,15.06
Date:,"Mon, 03 Apr 2023",Prob (F-statistic):,2.14e-18
Time:,21:19:24,Log-Likelihood:,-5255.9
No. Observations:,706,AIC:,10530.0
Df Residuals:,698,BIC:,10560.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3450.9128,80.726,42.748,0.000,3292.418,3609.408
totwrk,-0.1692,0.018,-9.372,0.000,-0.205,-0.134
age,2.6891,1.469,1.830,0.068,-0.195,5.574
south,101.5685,41.837,2.428,0.015,19.427,183.710
male,87.6690,35.104,2.497,0.013,18.747,156.591
smsa,-54.7476,33.123,-1.653,0.099,-119.780,10.285
yngkid,-13.9624,50.341,-0.277,0.782,-112.801,84.876
marr,31.2106,42.233,0.739,0.460,-51.709,114.130

0,1,2,3
Omnibus:,62.368,Durbin-Watson:,1.947
Prob(Omnibus):,0.0,Jarque-Bera (JB):,169.049
Skew:,-0.445,Prob(JB):,1.96e-37
Kurtosis:,5.226,Cond. No.,12600.0


In [24]:
# Compact coefficient table for model 1 (standard errors in parentheses).
# `stars` is a boolean switch, so significance stars are turned off with an
# explicit False rather than None (which only worked because it is falsy).
summary_col(Sleep_eq1, stars=False)

0,1
,sleep
Intercept,3450.9128
,(80.7262)
totwrk,-0.1692
,(0.0181)
age,2.6891
,(1.4692)
south,101.5685
,(41.8368)
male,87.6690


In [25]:
# Plain-text rendering of the model 1 table; stars=True adds significance stars to the coefficients
print(summary_col(Sleep_eq1, stars=True))


                  sleep    
---------------------------
Intercept      3450.9128***
               (80.7262)   
totwrk         -0.1692***  
               (0.0181)    
age            2.6891*     
               (1.4692)    
south          101.5685**  
               (41.8368)   
male           87.6690**   
               (35.1041)   
smsa           -54.7476*   
               (33.1230)   
yngkid         -13.9624    
               (50.3412)   
marr           31.2106     
               (42.2331)   
R-squared      0.1312      
R-squared Adj. 0.1225      
Standard errors in
parentheses.
* p<.1, ** p<.05, ***p<.01


Example (forecasting by passing a DataFrame of new regressor values to `predict`): https://www.statology.org/statsmodels-predict/

In [47]:
# Whitespace-separated table of new observations for the sleep model.
# The first column (№) is only a row number, not a regressor.
data_string = '''№  totwrk age south male smsa yngkid marr
1  2150  37    0    1    1     0     1  
2  1950  28    1    1    0     1     0  
3  2240  26    0    0    1     0     0'''
# The separator is a regular expression, so it is written as a raw string:
# a plain '\s+' is an invalid escape sequence and raises a warning on recent
# Python versions. The row-number column is dropped in the same chain instead
# of mutating the frame in place.
wn_df = pd.read_csv(io.StringIO(data_string), sep=r'\s+').drop(columns='№')
wn_df

Unnamed: 0,totwrk,age,south,male,smsa,yngkid,marr
0,2150,37,0,1,1,0,1
1,1950,28,1,1,0,1,0
2,2240,26,0,0,1,0,0


In [26]:
# New observations for the forecast, written row by row: each inner list is one
# individual, and the columns carry the regressor names used in the model formula.
sleep_new = pd.DataFrame(
    data=[
        [2150, 37, 0, 1, 1, 0, 1],
        [1950, 28, 1, 1, 0, 1, 0],
        [2240, 26, 0, 0, 1, 0, 0],
    ],
    columns=['totwrk', 'age', 'south', 'male', 'smsa', 'yngkid', 'marr'],
)
# Show the resulting frame
sleep_new

Unnamed: 0,totwrk,age,south,male,smsa,yngkid,marr
0,2150,37,0,1,1,0,1
1,1950,28,1,1,0,1,0
2,2240,26,0,0,1,0,0


In [27]:
# Point forecasts of sleep from model 1 for the rows of sleep_new, rounded to 2 decimals
Sleep_eq1.predict(sleep_new).round(2)

0    3250.68
1    3371.46
2    3086.98
dtype: float64

## 3 wage equation #1

In [28]:
# Read the wage2 dataset straight from the course repository
# and show its dimensions as (rows, columns).
data3 = pd.read_csv(
    'https://raw.githubusercontent.com/artamonoff/Econometrica/'
    'master/python-notebooks/data-csv/wage2.csv'
)
data3.shape

(935, 17)

In [29]:
# Wage model: the dependent variable is the log of wage (np.log is applied
# inside the formula), fitted by OLS on data3.
Wage_eq1 = smf.ols(
    formula='np.log(wage)~age+IQ+south+married+urban',
    data=data3,
).fit()

In [30]:
# Compact coefficient table for the wage model, with significance stars switched on
summary_col(Wage_eq1, stars=True)

0,1
,np.log(wage)
Intercept,4.9740***
,(0.1654)
age,0.0213***
,(0.0040)
IQ,0.0082***
,(0.0008)
south,-0.0990***
,(0.0268)
married,0.2010***


In [31]:
# Plain-text rendering of the same wage-model table (standard errors in parentheses)
print(summary_col(Wage_eq1, stars=True))


               np.log(wage)
---------------------------
Intercept      4.9740***   
               (0.1654)    
age            0.0213***   
               (0.0040)    
IQ             0.0082***   
               (0.0008)    
south          -0.0990***  
               (0.0268)    
married        0.2010***   
               (0.0402)    
urban          0.1750***   
               (0.0276)    
R-squared      0.1996      
R-squared Adj. 0.1953      
Standard errors in
parentheses.
* p<.1, ** p<.05, ***p<.01


In [46]:
# Whitespace-separated table of new observations for the wage model.
# The first column (№) is only a row number, not a regressor.
data_wage= '''№ age IQ  south married urban
1 36  105   1      1      1  
2 29  123   0      1      0  
3 25  112   1      0      1'''
# The separator is a regular expression, so it is written as a raw string:
# a plain '\s+' is an invalid escape sequence and raises a warning on recent
# Python versions. The row-number column is dropped in the same chain instead
# of mutating the frame in place.
wg_nw = pd.read_csv(io.StringIO(data_wage), sep=r'\s+').drop(columns='№')
wg_nw

Unnamed: 0,age,IQ,south,married,urban
0,36,105,1,1,1
1,29,123,0,1,0
2,25,112,1,0,1


In [32]:
# New observations for the wage forecast, written row by row: each inner list is
# one individual, and the columns carry the regressor names used in the model formula.
Wage_new = pd.DataFrame(
    data=[
        [36, 105, 1, 1, 1],
        [29, 123, 0, 1, 0],
        [25, 112, 1, 0, 1],
    ],
    columns=['age', 'IQ', 'south', 'married', 'urban'],
)
Wage_new

Unnamed: 0,age,IQ,south,married,urban
0,36,105,1,1,1
1,29,123,0,1,0
2,25,112,1,0,1


In [33]:
# Forecast of log(wage) for the rows of Wage_new
# (the fitted model's dependent variable is np.log(wage), so predictions are on the log scale)
LOG_wage = Wage_eq1.predict(Wage_new)
LOG_wage

0    6.877240
1    6.800093
2    6.499888
dtype: float64

In [34]:
# Check the container type returned by predict() (a pandas Series in this run)
type(LOG_wage)

pandas.core.series.Series

In [35]:
# Convert the pandas Series to a DataFrame
# via the DataFrame constructor (only the raw values are passed in, not the Series index)
pd.DataFrame({'np.log(wage)': LOG_wage.values})

Unnamed: 0,np.log(wage)
0,6.87724
1,6.800093
2,6.499888


In [36]:
# Convert the pandas Series to a DataFrame
# via the Series conversion method to_frame, naming the single column
LOG_wage.to_frame(name='np.log(wage)')

Unnamed: 0,np.log(wage)
0,6.87724
1,6.800093
2,6.499888


In [37]:
# Forecast of wage: exponentiate the log(wage) forecast and round to 2 decimals
# NOTE(review): a plain exp() of a log-scale forecast applies no retransformation
# correction; presumably the naive forecast is what the worksheet asks for - confirm.
np.exp(LOG_wage).round(2)

0    969.95
1    897.93
2    665.07
dtype: float64

In [38]:
# Same wage forecast (exp of the log(wage) forecast, 2 decimals), shown as a one-column DataFrame named 'wage'
np.exp(LOG_wage).round(2).to_frame(name='wage')

Unnamed: 0,wage
0,969.95
1,897.93
2,665.07
