In [1]:
import math;
import pandas as pd;
import numpy as np;
from sklearn.model_selection import learning_curve;
from sklearn.metrics import mean_squared_error;
from sklearn.model_selection import GridSearchCV;
from sklearn.model_selection import RandomizedSearchCV;
from sklearn.decomposition import PCA;
import matplotlib.pyplot as plot;
%matplotlib inline

### 共通関数

In [4]:
def generate_predict(estimator, df, filename, y_offset=None):
    """
    Predict with ``estimator`` and write the predictions to ``filename``.

    One line is written per row of ``df`` in the form ``YYYY-M-D,N`` where the
    date comes from ``df.datetime`` (month and day without zero padding) and
    ``N`` is the prediction rounded to the nearest integer.

    Parameters:
        estimator: object exposing ``predict(X)``.
        df: DataFrame with a ``datetime`` column; that column is dropped
            before the frame is handed to ``estimator.predict``.
        filename: path of the output file (overwritten if it exists).
        y_offset: optional mapping; when given, ``y_offset["test"]`` is added
            to the raw predictions before rounding.
    """
    features = df.drop("datetime", axis=1) if "datetime" in df.columns else df
    pred = estimator.predict(features)
    if y_offset is not None:
        pred = pred + y_offset["test"]
    rounded = np.round(pred).astype(int)

    # Build the date from its components instead of strftime("%Y-%-m-%-d"):
    # the "%-m" / "%-d" (no zero padding) flags are a platform-specific
    # extension and are rejected by some C libraries (e.g. on Windows).
    lines = [
        "{year}-{month}-{day},{num}\n".format(year=date.year, month=date.month, day=date.day, num=num)
        for date, num in zip(df.datetime, rounded)
    ]
    with open(filename, "w") as f:
        f.writelines(lines)

def draw_prediction_curve(estimator, df_target, df_known, y_offset=None, title=None):
    """
    Plot known values, the estimator's recall on them, and its predictions.

    Three curves are drawn on one figure: ``df_known["y"]`` ("known"), the
    estimator's predictions for the known rows ("recall"), and its predictions
    for ``df_target`` ("prediction"), the latter placed on the x axis right
    after the known rows. The RMSE between ``df_known["y"]`` and the recall
    predictions is printed afterwards.

    Parameters:
        estimator: fitted object exposing ``predict(X)``.
        df_target: DataFrame to predict for; a ``datetime`` column, if
            present, is dropped before predicting.
        df_known: DataFrame holding the features plus a ``y`` column.
        y_offset: optional mapping with ``"train"`` and ``"test"`` entries
            that are added to the known/recall and prediction curves
            respectively (plot only; the printed RMSE uses the raw values).
        title: optional suffix shown in the figure title.
    """
    target_features = df_target.drop("datetime", axis=1) if "datetime" in df_target.columns else df_target
    pred = estimator.predict(target_features)
    pred_recall = estimator.predict(df_known.drop(["y"], axis=1))

    if y_offset is None:
        train_offset = np.zeros_like(pred_recall)
        test_offset = np.zeros_like(pred)
    else:
        train_offset = y_offset["train"]
        test_offset = y_offset["test"]

    n_known = len(df_known)
    plot.title("prediction curve" if title is None else "prediction curve ({})".format(title))
    plot.plot(range(n_known), df_known["y"] + train_offset, label="known")
    plot.plot(range(n_known), pred_recall + train_offset, label="recall")
    plot.plot(np.arange(len(df_target)) + n_known, pred + test_offset, label="prediction")
    plot.legend()
    plot.show()

    # ``.values`` replaces ``as_matrix()``, which newer pandas no longer
    # provides; print() with one argument works on Python 2 and 3 alike.
    rmse = np.sqrt(mean_squared_error(df_known["y"].values, pred_recall))
    print("recall RMSE: {}".format(rmse))

def draw_learning_curve(estimator, X, y, cv=5, print_score=True, print_max=False, title=None):
    """
    Plot mean training and test scores from ``learning_curve``.

    ``learning_curve`` is evaluated at ten training-set fractions spaced
    evenly between 0.5 and 1.0, and the per-size mean of the train and test
    scores across folds is plotted.

    Parameters:
        estimator: estimator passed to ``estimator.score`` and ``learning_curve``.
        X, y: training data and targets.
        cv: cross-validation setting forwarded to ``learning_curve``.
        print_score: when true, print ``estimator.score(X, y)`` first.
        print_max: when true, print the highest mean test score afterwards.
        title: optional suffix shown in the figure title.
    """
    if print_score:
        print("estimator score: {}".format(estimator.score(X, y)))

    space = np.linspace(0.5, 1.0, 10)
    training_sizes, train_scores, test_scores = learning_curve(estimator, X=X, y=y, cv=cv, train_sizes=space)
    mean_test_scores = test_scores.mean(axis=1)

    plot.title("learning curve" if title is None else "learning curve ({})".format(title))
    plot.plot(training_sizes, train_scores.mean(axis=1), label="training scores")
    plot.plot(training_sizes, mean_test_scores, label="test scores")
    # The curves carry labels, so a legend is needed for them to be visible.
    plot.legend()
    plot.show()

    if print_max:
        print("max score: {}".format(max(mean_test_scores)))

def draw_pca_accumulation(pipeline, pca_step_name="pca", print_elements=False):
    """
    Plot the cumulative explained variance ratio of a pipeline's PCA step.

    Parameters:
        pipeline: object exposing ``steps`` as (name, estimator) pairs.
        pca_step_name: name of the step holding the fitted PCA.
        print_elements: when true, also print the cumulative ratios.
    """
    step = dict(pipeline.steps)[pca_step_name]
    # The wrapper classes in this file (NPCA, RestrictedPCA) keep the fitted
    # PCA on their ``pca`` attribute, so fall back to it when the step itself
    # does not carry the fitted attribute.
    if not hasattr(step, "explained_variance_ratio_") and hasattr(step, "pca"):
        step = step.pca

    cumulative_ratio = np.cumsum(step.explained_variance_ratio_)
    plot.title("PCA explained variance ratio")
    plot.plot(cumulative_ratio)
    plot.show()
    if print_elements:
        print("explained variance ratio: {}".format(cumulative_ratio))

def _get_cross_validator(regressor, param_dists, cv=5, n_iter=1000, verbose=1):
    """
    Build the parameter-search object for ``regressor``, or ``None``.

    Returns a ``GridSearchCV`` when ``n_iter`` is negative, a
    ``RandomizedSearchCV`` (with ``n_iter`` iterations) when it is positive,
    and ``None`` when ``param_dists`` is ``None`` or ``n_iter`` is zero.
    Both searches score with ``"neg_mean_squared_error"``.
    """
    if param_dists is None:
        return None

    shared_options = dict(cv=cv, scoring="neg_mean_squared_error", verbose=verbose)
    if n_iter < 0:
        return GridSearchCV(regressor, param_dists, **shared_options)
    if n_iter > 0:
        return RandomizedSearchCV(regressor, param_dists, n_iter=n_iter, **shared_options)
    return None

def predict_cv(regressor, param_dists, X, y, target, result_file, cv=5, n_iter=1000, verbose=1, draw_curves=True, print_scores=False, y_offset=None, title=None):
    """
    Run a parameter search, then predict ``target`` with the best estimator.

    ``_get_cross_validator`` decides the search type. When it returns
    ``None`` (no ``param_dists`` or ``n_iter == 0``) the regressor is simply
    fitted on ``X``/``y`` and used as is. The predictions for ``target`` are
    written to ``result_file`` via ``generate_predict``.

    Parameters:
        regressor: estimator to tune / fit.
        param_dists: parameter grid or distributions, or ``None`` to skip the search.
        X: training features as a DataFrame (``reset_index`` is called on it
            when curves are drawn).
        y: training targets; expected to be one-dimensional.
        target: DataFrame to predict for.
        result_file: output path handed to ``generate_predict``.
        cv, n_iter, verbose: forwarded to ``_get_cross_validator``.
        draw_curves: when true, draw the learning and prediction curves.
        print_scores: when true, print sqrt(|mean test score|) and the
            parameters of every search candidate.
        y_offset: optional offsets forwarded to the prediction helpers.
        title: optional figure-title suffix.

    Returns:
        The fitted estimator (the search's ``best_estimator_`` or ``regressor``).
    """
    pcv = _get_cross_validator(regressor, param_dists, cv, n_iter, verbose)
    if pcv is not None:
        pcv.fit(X, y)
        if print_scores:
            results = pcv.cv_results_
            for i, (score, params) in enumerate(zip(results["mean_test_score"], results["params"])):
                print("#iter {}: {} ({})".format(i, math.sqrt(abs(score)), params))
        print("best params: {}".format(pcv.best_params_))
        estimator = pcv.best_estimator_
    else:
        regressor.fit(X, y)
        estimator = regressor

    generate_predict(estimator, target, result_file, y_offset=y_offset)

    if draw_curves:
        draw_learning_curve(estimator, X, y, title=title)
        # Build the "y" column positionally from the raw values. Wrapping y in
        # a DataFrame and renaming column 0 only works for unnamed input, and
        # leaves y's own index in place while X's index is reset, which can
        # misalign the rows in the concat below.
        y_known = pd.DataFrame({"y": np.asarray(y).ravel()})
        df_known = pd.concat((X.reset_index(drop=True), y_known), axis=1)
        draw_prediction_curve(estimator, target, df_known=df_known, y_offset=y_offset, title=title)
    return estimator

In [3]:
class NPCA(PCA):
    """
    PCA that can be skipped.

    The actual work is delegated to an inner ``PCA`` stored on ``self.pca``.
    When ``n_components`` is not positive the step is a pass-through: nothing
    is fitted and ``transform`` / ``fit_transform`` return ``X`` unchanged.
    This lets a parameter search include "no PCA" as a candidate.
    """

    def __init__(self, n_components=None, copy=True, whiten=False, svd_solver="auto", tol=0.0, iterated_power="auto", random_state=None):
        self.pca = PCA(n_components, copy=copy, whiten=whiten, svd_solver=svd_solver, tol=tol, iterated_power=iterated_power, random_state=random_state)

    def _is_enabled(self):
        """Return True when the inner PCA should actually be applied."""
        n_components = self.pca.n_components
        # ``None > 0`` evaluated to False under Python 2 (so None meant
        # "skip") but raises TypeError under Python 3; keep the former meaning.
        if n_components is None:
            return False
        try:
            return n_components > 0
        except TypeError:
            # Non-numeric settings such as 'mle' compared as "greater" under
            # Python 2, i.e. PCA was applied; keep that on Python 3 as well.
            return True

    def get_params(self, deep=True):
        """Return the parameters of the inner PCA."""
        return self.pca.get_params(deep)

    def set_params(self, **params):
        """Set parameters on the inner PCA and return ``self``."""
        self.pca.set_params(**params)
        # Estimators are expected to return themselves so calls can be chained.
        return self

    def fit(self, X, y=None):
        """Fit the inner PCA unless the step is skipped; return ``self``."""
        if self._is_enabled():
            self.pca.fit(X, y)
        return self

    def fit_transform(self, X, y=None):
        """Fit and transform ``X``, or return ``X`` unchanged when skipped."""
        if self._is_enabled():
            return self.pca.fit_transform(X, y)
        return X

    def transform(self, X):
        """Transform ``X``, or return ``X`` unchanged when skipped."""
        if self._is_enabled():
            return self.pca.transform(X)
        return X

class RestrictedPCA(PCA):
    """
    PCA applied to a subset of columns only.

    The columns named in ``target_columns`` are run through an inner ``PCA``
    (stored on ``self.pca``); the remaining columns are passed through
    untouched and placed before the PCA output in the result. When
    ``n_components`` is not positive, the target columns are dropped and only
    the remaining columns are returned.

    ``X`` must be a DataFrame and ``target_columns`` must be column labels
    (they are passed to ``DataFrame.drop``).
    """

    def __init__(self, target_columns, n_components=None, copy=True, whiten=False, svd_solver="auto", tol=0.0, iterated_power="auto", random_state=None):
        self.target_columns = target_columns
        self.pca = PCA(n_components, copy=copy, whiten=whiten, svd_solver=svd_solver, tol=tol, iterated_power=iterated_power, random_state=random_state)

    def _is_enabled(self):
        """Return True when the inner PCA should actually be applied."""
        n_components = self.pca.n_components
        # ``None > 0`` evaluated to False under Python 2 (so None meant
        # "skip") but raises TypeError under Python 3; keep the former meaning.
        if n_components is None:
            return False
        try:
            return n_components > 0
        except TypeError:
            # Non-numeric settings such as 'mle' compared as "greater" under
            # Python 2, i.e. PCA was applied; keep that on Python 3 as well.
            return True

    def _target_values(self, X):
        """Return the target columns of ``X`` as an array."""
        # ``.loc`` / ``.values`` replace ``.ix`` / ``as_matrix()``, which newer
        # pandas no longer provides. Label-based selection matches the
        # label-based ``drop`` used for the remaining columns.
        return X.loc[:, self.target_columns].values

    def _rest_values(self, X):
        """Return every column of ``X`` except the target columns, as an array."""
        return X.drop(self.target_columns, axis=1).values

    def get_params(self, deep=True):
        """Return the inner PCA's parameters plus ``target_columns``."""
        params = self.pca.get_params(deep)
        params["target_columns"] = self.target_columns
        return params

    def set_params(self, **params):
        """Set ``target_columns`` and/or inner PCA parameters; return ``self``."""
        if "target_columns" in params:
            self.target_columns = params.pop("target_columns")
        self.pca.set_params(**params)
        # Estimators are expected to return themselves so calls can be chained.
        return self

    def fit(self, X, y=None):
        """Fit the inner PCA on the target columns unless skipped; return ``self``."""
        if self._is_enabled():
            self.pca.fit(self._target_values(X), y)
        return self

    def fit_transform(self, X, y=None):
        """Fit on the target columns and return [remaining columns | components]."""
        X_menus = self._target_values(X)
        X_rest = self._rest_values(X)
        if self._is_enabled():
            return np.concatenate((X_rest, self.pca.fit_transform(X_menus, y)), axis=1)
        return X_rest

    def transform(self, X):
        """Return [remaining columns | components] using the fitted inner PCA."""
        X_menus = self._target_values(X)
        X_rest = self._rest_values(X)
        if self._is_enabled():
            return np.concatenate((X_rest, self.pca.transform(X_menus)), axis=1)
        return X_rest