Skip to content

Commit 4311a1b

Browse files
author
Algorithmica
authored
Add files via upload
1 parent ebf3c26 commit 4311a1b

7 files changed

+335
-0
lines changed
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import math
2+
import pandas as pd
3+
import numpy as np
4+
import os
5+
from sklearn import metrics
6+
from sklearn.model_selection import TimeSeriesSplit
7+
from statsmodels.tsa import ar_model
8+
import matplotlib.pyplot as plt
9+
10+
def grid_search_best_model_timeseries_ar(df, grid, cv):
    """Grid-search the AutoReg lag order using time-series cross-validation.

    Every candidate in ``grid['lags']`` is fitted with ``ar_model.AutoReg`` on
    each ``TimeSeriesSplit`` training fold and scored by the RMSE of its
    out-of-sample predictions on the matching test fold. The candidate with
    the lowest mean RMSE is refitted on all of ``df``; its validation and
    in-sample RMSE are printed.

    Args:
        df: Time-indexed data handed to ``ar_model.AutoReg``. It is sliced
            positionally with ``.iloc`` and its index labels are used as the
            prediction start/end.
        grid: Mapping whose ``'lags'`` entry lists the candidate lag orders.
        cv: Number of splits passed to ``TimeSeriesSplit``.

    Returns:
        ``(estimator, res)``: the AutoReg model built on ``df`` and its fit
        result, or ``(None, None)`` when no candidate produced a score.
    """
    import warnings

    best_param = None
    best_score = np.inf  # ``np.infty`` alias no longer exists in NumPy 2.0
    tsp = TimeSeriesSplit(n_splits=cv)

    # A missing 'lags' entry means "nothing to search", not a TypeError.
    for param in grid.get('lags') or ():
        scores = []
        for train_ind, test_ind in tsp.split(df):
            train_data = df.iloc[train_ind]
            test_data = df.iloc[test_ind]
            try:
                estimator = ar_model.AutoReg(train_data, lags=param)
                res = estimator.fit()
                # get out of sample predictions with test data start and end
                pred = estimator.predict(res.params, test_data.index[0], test_data.index[-1])
                y_pred = pred.values.reshape(-1)
                y_test = test_data.values.reshape(-1)
                scores.append(math.sqrt(metrics.mean_squared_error(y_test, y_pred)))
            except Exception as exc:
                # Best effort: a fold that cannot be fitted/scored is skipped,
                # but the reason is reported and KeyboardInterrupt/SystemExit
                # are no longer swallowed.
                warnings.warn("skipping fold for lags=" + str(param) + ": " + repr(exc))

        if scores:
            mean_score = np.mean(scores)
            if mean_score < best_score:
                best_score = mean_score
                best_param = param

    if best_param is None:
        return None, None

    estimator = ar_model.AutoReg(df, lags=best_param)
    res = estimator.fit()
    print("best parameters:" + str(best_param))
    print("validation rmse:" + str(best_score))

    # get insample predictions with start and end indices
    predictions = estimator.predict(res.params, start=0, end=df.shape[0] - 1)
    y_pred = predictions.values.reshape(-1)
    # NOTE(review): depending on the statsmodels version the first `lags`
    # in-sample predictions are either omitted or returned as NaN. Dropping
    # NaNs and comparing against the trailing observations handles both.
    y_pred = y_pred[~np.isnan(y_pred)]
    y_train = df.values.reshape(-1)[len(df) - len(y_pred):]
    train_rmse = math.sqrt(metrics.mean_squared_error(y_train, y_pred))
    print("train rmse:" + str(train_rmse))
    return estimator, res
53+
54+
# Driver script: load the monthly series, grid-search an AutoReg model and
# forecast 1980-01 .. 1990-12.
path = 'F:/'
df = pd.read_csv(os.path.join(path, 'uk-deaths-from-bronchitis-emphys.csv'))
df.info()

# Rename the two CSV columns and index the frame by month-start timestamps.
df.columns = ['timestamp', 'y']
df.index = pd.to_datetime(df['timestamp'], format='%Y-%m')
df.index.freq = 'MS'
df.drop('timestamp', axis=1, inplace=True)

#grid search and get final model with best parameters
ar_grid = {'lags': [2, 3, 4, 5]}
estimator, res = grid_search_best_model_timeseries_ar(df, ar_grid, 3)
if res is None:
    # The search returns (None, None) when no candidate could be scored;
    # fail with a clear message instead of an AttributeError on None.
    raise RuntimeError("grid search did not produce a model for grid " + str(ar_grid))
print(res.params)
print(res.summary())
plt.plot(df)
plt.figure()
res.resid.plot()

#get predictions for future(implicit intervals based on freq of train data)
# pd.datetime was removed from pandas; pd.Timestamp accepts the same
# (year, month, day) arguments.
start_index = pd.Timestamp(1980, 1, 1)
end_index = pd.Timestamp(1990, 12, 1)
pred = estimator.predict(res.params, start_index, end_index)
print(pred)

#get predictions for future(explicit intervals)
index = pd.date_range('1-1-1980', '12-1-1990', freq='MS')
pred = estimator.predict(res.params, index[0], index[-1])
print(pred)

plt.figure()
plt.plot(pred)
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
import math
2+
import pandas as pd
3+
import numpy as np
4+
import os
5+
from sklearn import metrics
6+
from sklearn.model_selection import TimeSeriesSplit
7+
from itertools import product
8+
from statsmodels.tsa import arima_model
9+
import matplotlib.pyplot as plt
10+
11+
def grid_search_best_model_timeseries_arima(df, grid, cv):
    """Grid-search ARIMA orders using time-series cross-validation.

    The cartesian product of the value lists in ``grid`` (in the mapping's
    iteration order) forms the candidate ``order`` tuples. Each candidate is
    fitted with ``arima_model.ARIMA`` on every ``TimeSeriesSplit`` training
    fold and scored by the RMSE of its predictions over the matching test
    fold. The candidate with the lowest mean RMSE is refitted on all of
    ``df``; its validation and in-sample RMSE are printed.

    Args:
        df: Time-indexed data handed to ``arima_model.ARIMA``. It is sliced
            positionally with ``.iloc`` and its index labels are used as the
            prediction start/end.
        grid: Mapping of parameter name to list of candidate values; the
            values are combined positionally into the ``order`` tuple.
        cv: Number of splits passed to ``TimeSeriesSplit``.

    Returns:
        ``(estimator, res)``: the model built on ``df`` and its fit result,
        or ``(None, None)`` when no candidate produced a score.
    """
    import warnings

    # NOTE(review): statsmodels.tsa.arima_model is the legacy implementation
    # and is not available in recent statsmodels releases - confirm the pinned
    # version before running.
    params = list(product(*grid.values()))

    print(params)
    best_param = None
    best_score = np.inf  # ``np.infty`` alias no longer exists in NumPy 2.0
    tsp = TimeSeriesSplit(n_splits=cv)

    for param in params:
        scores = []
        for train_ind, test_ind in tsp.split(df):
            train_data = df.iloc[train_ind]
            test_data = df.iloc[test_ind]
            try:
                estimator = arima_model.ARIMA(train_data, order=param)
                res = estimator.fit()
                # get out of sample predictions with test data start and end
                # NOTE(review): for d > 0 confirm these predictions are on the
                # same scale (levels vs. differences) as y_test.
                y_pred = estimator.predict(res.params, test_data.index[0], test_data.index[-1])
                y_test = test_data.values.reshape(-1)
                scores.append(math.sqrt(metrics.mean_squared_error(y_test, y_pred)))
            except Exception as exc:
                # Best effort: orders that cannot be fitted/scored on a fold
                # are skipped, but the reason is reported and
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                warnings.warn("skipping fold for order=" + str(param) + ": " + repr(exc))

        if scores:
            mean_score = np.mean(scores)
            if mean_score < best_score:
                best_score = mean_score
                best_param = param

    if best_param is None:
        return None, None

    estimator = arima_model.ARIMA(df, order=best_param)
    res = estimator.fit()
    print("best parameters:" + str(best_param))
    print("validation rmse:" + str(best_score))
    # get insample predictions with start and end indices
    y_pred = estimator.predict(res.params, start=0, end=df.shape[0] - 1)
    y_train = df.values.reshape(-1)
    train_rmse = math.sqrt(metrics.mean_squared_error(y_train, y_pred))
    print("train rmse:" + str(train_rmse))
    return estimator, res
58+
59+
# Driver script: load the monthly series, grid-search an ARIMA model and
# forecast 1980-01 .. 1990-12.
path = 'F:/'
df = pd.read_csv(os.path.join(path, 'uk-deaths-from-bronchitis-emphys.csv'))
df.info()

# Rename the two CSV columns and index the frame by month-start timestamps.
df.columns = ['timestamp', 'y']
df.index = pd.to_datetime(df['timestamp'], format='%Y-%m')
df.index.freq = 'MS'
df.drop('timestamp', axis=1, inplace=True)

#grid search and get final model with best parameters
# Key order matters: the values are combined positionally into (p, d, q).
arima_grid = {'p': [0, 1, 5, 7], 'd': [0, 1, 2], 'q': [0, 1, 2, 5]}
estimator, res = grid_search_best_model_timeseries_arima(df, arima_grid, 3)
if res is None:
    # The search returns (None, None) when no candidate could be scored;
    # fail with a clear message instead of an AttributeError on None.
    raise RuntimeError("grid search did not produce a model for grid " + str(arima_grid))
print(res.params)
print(res.summary())
plt.plot(df)
plt.figure()
res.resid.plot()

#get predictions for future(implicit intervals based on freq of train data)
# pd.datetime was removed from pandas; pd.Timestamp accepts the same
# (year, month, day) arguments.
start_index = pd.Timestamp(1980, 1, 1)
end_index = pd.Timestamp(1990, 12, 1)
pred = estimator.predict(res.params, start_index, end_index)
print(pred)

#get predictions for future(explicit intervals)
index = pd.date_range('1-1-1980', '12-1-1990', freq='MS')
pred = estimator.predict(res.params, index[0], index[-1])
# Attach the explicit date index to the returned predictions.
pred = pd.Series(pred, index=index)
print(pred)

plt.figure()
plt.plot(pred)
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
import math
2+
import pandas as pd
3+
import numpy as np
4+
import os
5+
from sklearn import metrics
6+
from sklearn.model_selection import TimeSeriesSplit
7+
from itertools import product
8+
from statsmodels.tsa import arima_model
9+
import matplotlib.pyplot as plt
10+
11+
def grid_search_best_model_timeseries_arma(df, grid, cv):
    """Grid-search ARMA orders using time-series cross-validation.

    The cartesian product of the value lists in ``grid`` (in the mapping's
    iteration order) forms the candidate ``order`` tuples. Each candidate is
    fitted with ``arima_model.ARMA`` on every ``TimeSeriesSplit`` training
    fold and scored by the RMSE of its predictions over the matching test
    fold. The candidate with the lowest mean RMSE is refitted on all of
    ``df``; its validation and in-sample RMSE are printed.

    Args:
        df: Time-indexed data handed to ``arima_model.ARMA``. It is sliced
            positionally with ``.iloc`` and its index labels are used as the
            prediction start/end.
        grid: Mapping of parameter name to list of candidate values; the
            values are combined positionally into the ``order`` tuple.
        cv: Number of splits passed to ``TimeSeriesSplit``.

    Returns:
        ``(estimator, res)``: the model built on ``df`` and its fit result,
        or ``(None, None)`` when no candidate produced a score.
    """
    import warnings

    # NOTE(review): statsmodels.tsa.arima_model is the legacy implementation
    # and is not available in recent statsmodels releases - confirm the pinned
    # version before running.
    params = list(product(*grid.values()))

    print(params)
    best_param = None
    best_score = np.inf  # ``np.infty`` alias no longer exists in NumPy 2.0
    tsp = TimeSeriesSplit(n_splits=cv)

    for param in params:
        scores = []
        for train_ind, test_ind in tsp.split(df):
            train_data = df.iloc[train_ind]
            test_data = df.iloc[test_ind]
            try:
                estimator = arima_model.ARMA(train_data, order=param)
                res = estimator.fit()
                # get out of sample predictions with test data start and end
                y_pred = estimator.predict(res.params, test_data.index[0], test_data.index[-1])
                y_test = test_data.values.reshape(-1)
                scores.append(math.sqrt(metrics.mean_squared_error(y_test, y_pred)))
            except Exception as exc:
                # Best effort: orders that cannot be fitted/scored on a fold
                # are skipped, but the reason is reported and
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                warnings.warn("skipping fold for order=" + str(param) + ": " + repr(exc))

        if scores:
            mean_score = np.mean(scores)
            if mean_score < best_score:
                best_score = mean_score
                best_param = param

    if best_param is None:
        return None, None

    estimator = arima_model.ARMA(df, order=best_param)
    res = estimator.fit()
    print("best parameters:" + str(best_param))
    print("validation rmse:" + str(best_score))
    # get insample predictions with start and end indices
    y_pred = estimator.predict(res.params, start=0, end=df.shape[0] - 1)
    y_train = df.values.reshape(-1)
    train_rmse = math.sqrt(metrics.mean_squared_error(y_train, y_pred))
    print("train rmse:" + str(train_rmse))
    return estimator, res
58+
59+
# Driver script: load the monthly series, grid-search an ARMA model and
# forecast 1980-01 .. 1990-12.
path = 'F:/'
df = pd.read_csv(os.path.join(path, 'uk-deaths-from-bronchitis-emphys.csv'))
df.info()

# Rename the two CSV columns and index the frame by month-start timestamps.
df.columns = ['timestamp', 'y']
df.index = pd.to_datetime(df['timestamp'], format='%Y-%m')
df.index.freq = 'MS'
df.drop('timestamp', axis=1, inplace=True)

#grid search and get final model with best parameters
# Key order matters: the values are combined positionally into (p, q).
arma_grid = {'p': [0, 1, 2, 3, 5, 7], 'q': [1, 2, 3, 5, 7]}
estimator, res = grid_search_best_model_timeseries_arma(df, arma_grid, 3)
if res is None:
    # The search returns (None, None) when no candidate could be scored;
    # fail with a clear message instead of an AttributeError on None.
    raise RuntimeError("grid search did not produce a model for grid " + str(arma_grid))
print(res.params)
print(res.summary())
plt.plot(df)
plt.figure()
res.resid.plot()

#get predictions for future(implicit intervals based on freq of train data)
# pd.datetime was removed from pandas; pd.Timestamp accepts the same
# (year, month, day) arguments.
start_index = pd.Timestamp(1980, 1, 1)
end_index = pd.Timestamp(1990, 12, 1)
pred = estimator.predict(res.params, start_index, end_index)
print(pred)

#get predictions for future(explicit intervals)
index = pd.date_range('1-1-1980', '12-1-1990', freq='MS')
pred = estimator.predict(res.params, index[0], index[-1])
# Attach the explicit date index to the returned predictions.
pred = pd.Series(pred, index=index)
print(pred)

plt.figure()
plt.plot(pred)
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import pandas as pd
2+
import os
3+
from sklearn.model_selection import TimeSeriesSplit
4+
import statsmodels
5+
# Show which statsmodels release is installed.
print(statsmodels.__version__)

# Read the raw CSV and print its structure.
path = 'F:/'
csv_file = os.path.join(path, 'uk-deaths-from-bronchitis-emphys.csv')
df = pd.read_csv(csv_file)
df.info()

# Name the two columns, promote the parsed timestamps to the index and
# drop the now-redundant text column.
df.columns = ['timestamp', 'y']
df.index = pd.to_datetime(df['timestamp'], format='%Y-%m')
df.drop(columns='timestamp', inplace=True)

tsp = TimeSeriesSplit(n_splits=3)

# First pass: print only the positional indices of every fold.
for fold in tsp.split(df):
    print(fold[0], fold[1])

# Second pass: print the actual train/test rows of every fold.
for train_ind, test_ind in tsp.split(df):
    train_data, test_data = df.iloc[train_ind], df.iloc[test_ind]
    print(train_data, test_data)
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import math
2+
import pandas as pd
3+
import numpy as np
4+
import os
5+
from sklearn import metrics
6+
from sklearn.model_selection import TimeSeriesSplit
7+
from statsmodels.tsa import ar_model
8+
9+
# Script: fit an AutoReg(5) model on the monthly series, report its
# in-sample RMSE, then estimate out-of-sample RMSE with TimeSeriesSplit.
path = 'F:/'
df = pd.read_csv(os.path.join(path, 'uk-deaths-from-bronchitis-emphys.csv'))
df.info()

# Rename the two CSV columns and index the frame by month-start timestamps.
df.columns = ['timestamp', 'y']
df.index = pd.to_datetime(df['timestamp'], format='%Y-%m').copy()
df.index.freq = 'MS'
df.drop('timestamp', axis=1, inplace=True)

#build model
estimator = ar_model.AutoReg(df, lags=5)
res = estimator.fit()
print(res.params)
print(res.model)
print(res.summary())

#using model
predictions = estimator.predict(res.params, start=0, end=df.shape[0] - 1)
print(predictions)
y_pred = predictions.values.reshape(-1)
# NOTE(review): depending on the statsmodels version the first `lags`
# in-sample predictions are either omitted or returned as NaN. Dropping NaNs
# and comparing against the trailing observations handles both.
y_pred = y_pred[~np.isnan(y_pred)]
y_train = df.values.reshape(-1)[len(df) - len(y_pred):]
train_rmse = math.sqrt(metrics.mean_squared_error(y_train, y_pred))
print(train_rmse)

#evaluate model
tsp = TimeSeriesSplit(n_splits=3)
scores = []

for train_ind, test_ind in tsp.split(df):
    train_data = df.iloc[train_ind]
    test_data = df.iloc[test_ind]
    # Fit on the training fold only; fitting on the full frame would leak
    # the test observations into the model being evaluated.
    estimator = ar_model.AutoReg(train_data, lags=5)
    res = estimator.fit()
    pred = estimator.predict(res.params, test_data.index[0], test_data.index[-1])
    y_pred = pred.values.reshape(-1)
    y_test = test_data.values.reshape(-1)
    score = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
    scores.append(score)
print(scores)
print(np.mean(scores))
1.66 KB
Binary file not shown.
2.45 KB
Binary file not shown.

0 commit comments

Comments
 (0)