In [73]:
import pandas as pd
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor as gp
from sklearn.preprocessing import StandardScaler as ss
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.gaussian_process.kernels import RBF

In [2]:
x = pd.read_csv('features.csv', encoding="latin1")

In [5]:
print(set(x['article_group0_idx']))

set([u'HofKonditorei', u'Trockenprodukte', u'Erlebnis', u'Non-Food', u'Getr\xe4nke', u'Obst', u'HofChuchi', u'Frischprodukte', u'HofB\xe4ckerei', u'Gem\xfcse'])


In [102]:
x_juck_res = x[(x['shop_idx'] == u'Juckerhof Seegräben') & (x['is_resto'] == True) & (~ x['temperature'].isna())]      

In [103]:
x1 = x_juck_res.fillna(0)
x1.columns

Index([u'article_group0_idx', u'month_idx', u'hour_idx', u'mday_idx',
       u'year_idx', u'is_resto_idx', u'shop_idx', u'temperature',
       u'precipitation', u'snowfraction', u'sealevelpressure',
       u'winddirection', u'windspeed', u'relativehumidity', u'sunshinetime',
       u'totalcloudcover', u'lowclouds', u'midclouds', u'highclouds',
       u'wday_x', u'month', u'hour', u'mday', u'year', u'month.1', u'hour.1',
       u'mday.1', u'year.1', u'is_resto', u'price', u'count',
       u'n_transactions'],
      dtype='object')

In [104]:
y = x1['price']
X = x1[['month', 'hour.1', 'mday', 'wday_x', 'temperature', 'precipitation', 'sunshinetime', 'snowfraction', 'sealevelpressure', 'winddirection', 'windspeed', 'relativehumidity', 'totalcloudcover', 'lowclouds', 'midclouds', 'highclouds']]

one_hot = pd.get_dummies(X['wday_x'], prefix='day')
X = X.drop('wday_x', axis=1)
X = X.join(one_hot)

one_hot = pd.get_dummies(X['month'], prefix='month')
X = X.drop('month', axis=1)
X = X.join(one_hot)

#one_hot = pd.get_dummies(X['hour.1'], prefix='hour')
#X = X.drop('hour.1', axis=1)
#X = X.join(one_hot)

In [105]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
normalizer = ss()
X_train = normalizer.fit_transform(X_train)
X_test = normalizer.transform(X_test)

In [106]:
# cross validation 
params = {'alpha': [1e-12, 1e-10, 1e-8, 1e-5],
          'kernel': [RBF(length_scale=1.0), 
                    RBF(length_scale=2.0),
                    RBF(length_scale=5),
                    RBF(length_scale=0.5)]
         }

model = gp(random_state=1, normalize_y=True)
cv = RandomizedSearchCV(model, param_distributions=params, n_iter=16, cv=5, verbose=50, n_jobs=4)

cv.fit(X_train, y_train)
cv.best_estimator_.score(X_test, y_test)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] kernel=RBF(length_scale=1), alpha=1e-12 .........................
[CV] kernel=RBF(length_scale=1), alpha=1e-12 .........................
[CV] kernel=RBF(length_scale=1), alpha=1e-12 .........................
[CV] kernel=RBF(length_scale=1), alpha=1e-12 .........................
[CV] kernel=RBF(length_scale=1), alpha=1e-12 .........................


KeyboardInterrupt: 

In [107]:
model = gp()
model.fit(X_train, y_train)
model.score(X_test, y_test)

KeyboardInterrupt: 

In [68]:
model.score(X_test, y_test)

0.37306491852424295

In [93]:
cv.best_params_

{'alpha': 1e-05, 'kernel': RBF(length_scale=0.5)}

In [95]:
cv.cv_results_



{'mean_fit_time': array([ 13.31847162,  12.92483859,  12.81627665,  97.13844032,
         13.37376218,  14.45525413,  13.99023595,  96.17533584,
         12.88194218,  13.63541141,  12.69083238,  91.66077075,
         11.98254361,  12.30406375,  12.42912226,  82.2438035 ]),
 'mean_score_time': array([ 0.14919477,  0.16020002,  0.11805363,  0.25180044,  0.20406175,
         0.29135833,  0.17571402,  0.22438641,  0.17087822,  0.16100354,
         0.18612585,  0.15596299,  0.17410908,  0.20529466,  0.19019094,
         0.15651007]),
 'mean_test_score': array([-0.00202517, -0.00202517, -0.00202517,  0.1588654 , -0.00202517,
        -0.00202517, -0.00202517,  0.1588654 , -0.00202517, -0.00202517,
        -0.00202517,  0.15886541, -0.00202517, -0.00202517, -0.00202517,
         0.15886851]),
 'mean_train_score': array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.]),
 'param_alpha': masked_array(data = [1e-12 1e-12 1e-12 1e-12 1e-10 1e-10 1e-10 1e-10 

0.29871009603197807

(3227, 33)