# Reading data & Selecting features

In [None]:
import psycopg2
import pandas as pd

# Load the full `environment_data.pivoted` table into a DataFrame.
# NOTE(review): the credentials below are hard-coded; prefer reading them from
# the environment before this notebook is shared or committed.
conn_string = "host='localhost' dbname='weather_env' user='postgres' password='postgres'"
conn = psycopg2.connect(conn_string)
try:
    # The cursor context manager closes the cursor on exit; the column names
    # are taken from `cur.description` while the cursor is still open.
    with conn.cursor() as cur:
        cur.execute("""SELECT * FROM environment_data.pivoted;""")
        records = cur.fetchall()
        column_names = [x[0] for x in cur.description]
finally:
    # Always release the database connection, even if the query fails.
    conn.close()
df = pd.DataFrame(records, columns=column_names)

# Exploring models
## Scoring options
- explained_variance
- max_error
- neg_mean_absolute_error
- neg_mean_squared_error
- neg_root_mean_squared_error
- neg_mean_squared_log_error
- neg_median_absolute_error
- r2
- neg_mean_poisson_deviance
- neg_mean_gamma_deviance
- neg_mean_absolute_percentage_error
- d2_absolute_error_score
- d2_pinball_score
- d2_tweedie_score

In [None]:
# Summary statistics with one row per described column, ordered by the number
# of non-null observations.
ddf = df.describe().transpose().sort_values(by='count')
# Keep only columns whose non-null count exceeds this fraction of the
# best-populated column's count.
n = 0.5
count_threshold = ddf['count'].max() * n
subset = ddf[ddf['count'] > count_threshold]
# Best-populated columns first; rows with any missing value are discarded.
kept_columns = subset.index[::-1]
subdf = df.loc[:, kept_columns].dropna()
subdf.columns

In [None]:
from sklearn.model_selection import cross_validate, GridSearchCV
import numpy as np

In [None]:
# Split `subdf` into a feature matrix and a target vector (both NumPy arrays).
# The target column is selected rather than popped, so `subdf` is left
# untouched and re-running this cell does not fail with a KeyError.
target = subdf['arnhem_waterlevel'].values
features = subdf.drop(columns='arnhem_waterlevel').values

In [None]:
# Draw a reproducible 1% subsample of rows to keep the model searches fast.
# - A seeded generator makes the subsample (and every score below) repeatable.
# - `replace=False` yields distinct rows; duplicated rows could otherwise end up
#   in both the training and the test fold of the same CV split.
# - Every row position, including the last one, is eligible.
# NOTE: `features`/`target` are overwritten, so re-running this cell without
# re-running the previous one subsamples the already-reduced arrays again.
rng = np.random.default_rng(42)
sample_size = len(features) // 100
random_sel = rng.choice(len(features), size=sample_size, replace=False)
features, target = features[random_sel], target[random_sel]
print(features.shape)

# KNeighborsRegressor

K = 270 seems best, with an R² score of 96%.

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Grid search over a single candidate (n_neighbors=270), scored with R^2
# under 10-fold cross-validation.
model = KNeighborsRegressor()
parameters = dict(n_neighbors=[270])

gscv = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='r2',
    cv=10,
)

In [None]:
# Run the cross-validated search for the k-nearest-neighbours regressor.
gscv.fit(X=features, y=target)

In [None]:
# Parameter combinations ranked by mean CV score. Per-fold score columns are
# matched by pattern instead of by name, so this keeps working when `cv` != 10.
cv_results = pd.DataFrame(gscv.cv_results_).sort_values('rank_test_score')
is_fold_score = cv_results.columns.str.match(r'split\d+_test_score$')
is_timing_spread = cv_results.columns.isin(['std_fit_time', 'std_score_time'])
cv_results.loc[:, ~(is_fold_score | is_timing_spread)].head(10)

In [None]:
# Spot-check: draw 10 row positions (with replacement) from `features` and
# show the true targets next to the rounded predictions of the fitted search.
n = np.random.randint(0, features.shape[0], size=10)
rounded_predictions = np.round(gscv.predict(features[n]), 0)
for item in (
    gscv.best_estimator_,
    gscv.best_score_,
    gscv.best_params_,
    features[n],
    target[n],
    rounded_predictions,
):
    print(item)

# SVR

A second-degree polynomial kernel seems best, with an R² score of 90%.

In [None]:
from sklearn.svm import SVR

# Grid search comparing a polynomial and a linear kernel at fixed C and
# epsilon, scored with R^2 under 10-fold cross-validation.
model = SVR()
parameters = dict(
    kernel=['poly', 'linear'],
    degree=[2],
    C=[100000],
    epsilon=[25],
)

gscv = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='r2',
    cv=10,
)

In [None]:
# Run the cross-validated search for the support vector regressor.
gscv.fit(X=features, y=target)

In [None]:
# Parameter combinations ranked by mean CV score. Per-fold score columns are
# matched by pattern instead of by name, so this keeps working when `cv` != 10.
cv_results = pd.DataFrame(gscv.cv_results_).sort_values('rank_test_score')
is_fold_score = cv_results.columns.str.match(r'split\d+_test_score$')
is_timing_spread = cv_results.columns.isin(['std_fit_time', 'std_score_time'])
cv_results.loc[:, ~(is_fold_score | is_timing_spread)].head(10)

In [None]:
# Spot-check: draw 10 row positions (with replacement) from `features` and
# show the true targets next to the rounded predictions of the fitted search.
n = np.random.randint(0, features.shape[0], size=10)
rounded_predictions = np.round(gscv.predict(features[n]), 0)
for item in (
    gscv.best_estimator_,
    gscv.best_score_,
    gscv.best_params_,
    features[n],
    target[n],
    rounded_predictions,
):
    print(item)

# LinearRegression

GridSearchCV is of no use with LinearRegression, so plain cross-validation is used instead. It performs quite well, with an R² score of 96%.

In [None]:
from sklearn.linear_model import LinearRegression

# Plain 10-fold cross-validation scored with R^2. The fitted estimator of
# every fold is kept so one of them can be inspected afterwards.
# Note: despite its name, `gscv` holds the result dict of `cross_validate`
# here, not a GridSearchCV object.
model = LinearRegression()

gscv = cross_validate(
    estimator=model,
    X=features,
    y=target,
    scoring='r2',
    cv=10,
    return_estimator=True,
)

In [None]:
# Pick the fold estimator with the highest test score and print it together
# with that score.
# NOTE(review): this is the best single fold, not the mean over folds.
scores = gscv['test_score']
models = gscv['estimator']
best_fold = np.argmax(scores)
best_model = models[best_fold]
print(best_model, scores[best_fold])

In [None]:
# Spot-check: draw 10 row positions (with replacement) from `features` and
# show the true targets next to the rounded predictions of `best_model`.
n = np.random.randint(0, features.shape[0], size=10)
rounded_predictions = np.round(best_model.predict(features[n]), 0)
for item in (
    best_model,
    features[n],
    target[n],
    rounded_predictions,
):
    print(item)

# RidgeRegression

LinearRegression is better. The parameters seem to have no effect; the R² score stays at 87%.

In [None]:
from sklearn.linear_model import Ridge

# Grid search over four regularisation strengths, scored with R^2 under
# 10-fold cross-validation.
model = Ridge()
parameters = dict(alpha=[1, 50, 500, 5000])

gscv = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='r2',
    cv=10,
)

In [None]:
# Run the cross-validated search for the ridge regressor.
gscv.fit(X=features, y=target)

In [None]:
# Parameter combinations ranked by mean CV score. Per-fold score columns are
# matched by pattern instead of by name, so this keeps working when `cv` != 10.
cv_results = pd.DataFrame(gscv.cv_results_).sort_values('rank_test_score')
is_fold_score = cv_results.columns.str.match(r'split\d+_test_score$')
is_timing_spread = cv_results.columns.isin(['std_fit_time', 'std_score_time'])
cv_results.loc[:, ~(is_fold_score | is_timing_spread)].head(10)

In [None]:
# Spot-check: draw 10 row positions (with replacement) from `features` and
# show the true targets next to the rounded predictions of the fitted search.
n = np.random.randint(0, features.shape[0], size=10)
rounded_predictions = np.round(gscv.predict(features[n]), 0)
for item in (
    gscv.best_estimator_,
    gscv.best_score_,
    gscv.best_params_,
    features[n],
    target[n],
    rounded_predictions,
):
    print(item)

# LassoRegression

LinearRegression is better. The parameters seem to have no effect; the R² score stays at 87%.

In [None]:
from sklearn.linear_model import Lasso

# Grid search over four regularisation strengths, scored with R^2 under
# 10-fold cross-validation.
model = Lasso()
parameters = dict(alpha=[1, 50, 500, 5000])

gscv = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='r2',
    cv=10,
)

In [None]:
# Run the cross-validated search for the lasso regressor.
gscv.fit(X=features, y=target)

In [None]:
# Parameter combinations ranked by mean CV score. Per-fold score columns are
# matched by pattern instead of by name, so this keeps working when `cv` != 10.
cv_results = pd.DataFrame(gscv.cv_results_).sort_values('rank_test_score')
is_fold_score = cv_results.columns.str.match(r'split\d+_test_score$')
is_timing_spread = cv_results.columns.isin(['std_fit_time', 'std_score_time'])
cv_results.loc[:, ~(is_fold_score | is_timing_spread)].head(10)

In [None]:
# Spot-check: draw 10 row positions (with replacement) from `features` and
# show the true targets next to the rounded predictions of the fitted search.
n = np.random.randint(0, features.shape[0], size=10)
rounded_predictions = np.round(gscv.predict(features[n]), 0)
for item in (
    gscv.best_estimator_,
    gscv.best_score_,
    gscv.best_params_,
    features[n],
    target[n],
    rounded_predictions,
):
    print(item)

# ElasticNet

Parameters seem to have no effect, max score is 87%

In [None]:
from sklearn.linear_model import ElasticNet

# Grid search over a single candidate (alpha=4), scored with R^2 under
# 10-fold cross-validation.
model = ElasticNet()
parameters = dict(alpha=[4])

gscv = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='r2',
    cv=10,
)

In [None]:
# Run the cross-validated search for the elastic-net regressor.
gscv.fit(X=features, y=target)

In [None]:
# Parameter combinations ranked by mean CV score. Per-fold score columns are
# matched by pattern instead of by name, so this keeps working when `cv` != 10.
cv_results = pd.DataFrame(gscv.cv_results_).sort_values('rank_test_score')
is_fold_score = cv_results.columns.str.match(r'split\d+_test_score$')
is_timing_spread = cv_results.columns.isin(['std_fit_time', 'std_score_time'])
cv_results.loc[:, ~(is_fold_score | is_timing_spread)].head(10)

In [None]:
# Spot-check: draw 10 row positions (with replacement) from `features` and
# show the true targets next to the rounded predictions of the fitted search.
n = np.random.randint(0, features.shape[0], size=10)
rounded_predictions = np.round(gscv.predict(features[n]), 0)
for item in (
    gscv.best_estimator_,
    gscv.best_score_,
    gscv.best_params_,
    features[n],
    target[n],
    rounded_predictions,
):
    print(item)

# DecisionTreeRegressor

Parameters seem to have no effect, max score is 95%

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Grid search over a single candidate (unlimited depth), scored with R^2
# under 10-fold cross-validation.
parameters = {
    'max_depth': [None],
}
# A fixed `random_state` makes the fitted trees, and therefore the reported
# scores, repeatable between runs.
model = DecisionTreeRegressor(random_state=42)

gscv = GridSearchCV(
    model,
    parameters,
    cv=10,
    scoring='r2',
)

In [None]:
# Run the cross-validated search for the decision tree regressor.
gscv.fit(X=features, y=target)

In [None]:
# Parameter combinations ranked by mean CV score. Per-fold score columns are
# matched by pattern instead of by name, so this keeps working when `cv` != 10.
cv_results = pd.DataFrame(gscv.cv_results_).sort_values('rank_test_score')
is_fold_score = cv_results.columns.str.match(r'split\d+_test_score$')
is_timing_spread = cv_results.columns.isin(['std_fit_time', 'std_score_time'])
cv_results.loc[:, ~(is_fold_score | is_timing_spread)].head(10)

In [None]:
# Spot-check: draw 10 row positions (with replacement) from `features` and
# show the rounded true targets next to the rounded predictions of the
# fitted search.
n = np.random.randint(0, features.shape[0], size=10)
rounded_targets = np.round(target[n], 0)
rounded_predictions = np.round(gscv.predict(features[n]), 0)
for item in (
    gscv.best_estimator_,
    gscv.best_score_,
    gscv.best_params_,
    features[n],
    rounded_targets,
    rounded_predictions,
):
    print(item)

# RandomForestRegressor

200 estimators seem best, with an R² score of 97%.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Grid search over a single candidate (200 trees), scored with R^2 under
# 10-fold cross-validation.
parameters = {
    'n_estimators': [200],
}
# A fixed `random_state` makes the bootstrap samples and feature draws, and
# therefore the reported scores, repeatable between runs.
model = RandomForestRegressor(random_state=42)

gscv = GridSearchCV(
    model,
    parameters,
    cv=10,
    scoring='r2',
)

In [None]:
# Run the cross-validated search for the random forest regressor.
gscv.fit(X=features, y=target)

In [None]:
# Parameter combinations ranked by mean CV score. Per-fold score columns are
# matched by pattern instead of by name, so this keeps working when `cv` != 10.
cv_results = pd.DataFrame(gscv.cv_results_).sort_values('rank_test_score')
is_fold_score = cv_results.columns.str.match(r'split\d+_test_score$')
is_timing_spread = cv_results.columns.isin(['std_fit_time', 'std_score_time'])
cv_results.loc[:, ~(is_fold_score | is_timing_spread)].head(10)

In [None]:
# Spot-check: draw 10 row positions (with replacement) from `features` and
# show the true targets next to the rounded predictions of the fitted search.
n = np.random.randint(0, features.shape[0], size=10)
rounded_predictions = np.round(gscv.predict(features[n]), 0)
for item in (
    gscv.best_estimator_,
    gscv.best_score_,
    gscv.best_params_,
    features[n],
    target[n],
    rounded_predictions,
):
    print(item)