forked from blakehauser/greedy-lib
-
Notifications
You must be signed in to change notification settings - Fork 0
/
regression.py
57 lines (39 loc) · 1.51 KB
/
regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/usr/bin/env python
"""Takes the list of predictors output by a greedy algorithm (from Oskar's code)
and uses the existing dataset to generate an R^2 value that characterizes the
fit of the Oskar model. The data matrix is read from standard input. To run,
use the following command (all on one line):
python regression.py <extended-regression-data-matrix-with-split-dates.csv
    --variables file_with_predictors_list.csv --year 1994
Any year can be used for the test/training split."""
import pandas as pd
import statsmodels.formula.api as smf
import sys
import argparse
from sklearn.metrics import r2_score
from greedy import collinify
# Command-line interface: the predictor-list file and the split year are both
# mandatory options.
parser = argparse.ArgumentParser(
    description=('Fit an OLS model of AGDIST on the listed predictors using '
                 'the rows up to a cut-off year, then print the R^2 of its '
                 'predictions on the rows after that year.'))
parser.add_argument(
    '--variables', required=True,
    help=('The filename that contains the variables to use.'))
parser.add_argument(
    '--year', required=True, type=int,
    help=('Give the year for the cut-off between training and '
          'test data.'))
args = parser.parse_args()

# Predictor names are the whitespace-separated tokens of the file. Reading
# inside a context manager guarantees the file handle is closed.
with open(args.variables) as variables_file:
    variables = variables_file.read().split()
if not variables:
    # Without at least one predictor the formula below would be the malformed
    # string 'AGDIST ~ ', so fail early with a clear usage error instead.
    parser.error('no variables found in %s' % args.variables)

# The data matrix arrives on standard input and is parsed as space-separated.
data = pd.read_csv(sys.stdin, sep=' ')
# NOTE(review): collinify comes from the project's greedy module; what it does
# to the frame is not visible from this file -- confirm against greedy.py.
data = collinify(data)

# Training rows have both year columns at or before the cut-off. Rows whose
# two years fall on opposite sides of the cut-off are in neither set.
training = data.loc[(data['YEAR1'] <= args.year) &
                    (data['YEAR2'] <= args.year)]
results = smf.ols(formula='AGDIST ~ %s' % ' + '.join(variables),
                  data=training).fit()

# Test rows have both year columns strictly after the cut-off.
test = data.loc[(data['YEAR1'] > args.year) &
                (data['YEAR2'] > args.year)]
predicted = results.predict(test)
actual = test['AGDIST']

# Report the fitted model's R^2 on the test rows as the script's only output.
r2 = r2_score(actual, predicted)
print(r2)