In [1]:
import sys;
import notebookutil;
sys.meta_path.append(notebookutil.NotebookFinder());

import pandas as pd;
import numpy as np;
from collections import OrderedDict;
from sklearn.preprocessing import StandardScaler;

### データの読み込み、加工

In [2]:
def load_data(data_dir="data", verbose=False):
    '''
    Load the training and test CSV files.

    Reads ``<data_dir>/train.csv`` and ``<data_dir>/test.csv`` with
    ``pandas.read_csv``; the ``datetime`` column of both files is parsed
    as dates.

    Parameters
    ----------
    data_dir : str
        Directory that contains ``train.csv`` and ``test.csv``.
    verbose : bool
        When true, print the number of rows of each frame.

    Returns
    -------
    tuple
        ``(df_train, df_test)`` as pandas DataFrames.
    '''
    train_file = data_dir + "/train.csv"
    test_file = data_dir + "/test.csv"
    df_train = pd.read_csv(train_file, parse_dates=["datetime"])
    df_test = pd.read_csv(test_file, parse_dates=["datetime"])
    if verbose:
        # Single-argument parenthesised print behaves identically as the
        # Python 2 print statement and as the Python 3 print function.
        print("train length: {}".format(len(df_train)))
        print("test length: {}".format(len(df_test)))
    return (df_train, df_test)

def prepare_dataset(data_dir="data", verbose=False, pred_target="y", drop_columns=[], n_chops=0, menus=True, standardize=True, baselines={"stability": "exp"}):
    '''
    Prepare the data set used for training and evaluation.

    Pipeline: ``load_data`` -> ``preprocess_`` -> ``subtract_baselines_``,
    then the training frame is split into features and target.

    Parameters
    ----------
    data_dir : str
        Directory passed to ``load_data``.
    verbose : bool
        Passed to ``load_data``.
    pred_target : str
        Name of the column to predict; it is removed from ``X`` and
        returned as ``y``.
    drop_columns : list
        Extra columns removed from both ``X`` and ``target``.
        (The mutable default is only read here, never modified.)
    n_chops : int
        Passed to ``preprocess_`` (number of leading rows to discard).
    menus, standardize : bool
        Passed to ``preprocess_``.
    baselines : dict or None
        Passed to ``subtract_baselines_``.
        (The mutable default is only read here, never modified.)

    Returns
    -------
    tuple
        ``(X, y, target, baseline_offset)`` where ``X`` is the training
        feature frame, ``y`` the target values as a numpy array,
        ``target`` the test frame, and ``baseline_offset`` whatever
        ``subtract_baselines_`` returned.
    '''
    df_train, df_test = load_data(data_dir, verbose)
    df_train, df_test = preprocess_(df_train, df_test, n_chops, menus=menus, standardize=standardize, pred_target=pred_target)
    # Note: this modifies df_train in place (the "y" column is detrended).
    baseline_offset = subtract_baselines_(df_train, df_test, baselines)
    X = df_train.drop(np.hstack((pred_target, drop_columns)), axis=1)
    # `.values` replaces `.as_matrix()`, which no longer exists in pandas >= 1.0.
    y = df_train[pred_target].values
    target = df_test.drop(drop_columns, axis=1)
    return (X, y, target, baseline_offset)

def subtract_baselines_(df_train, df_test, baselines):
    '''
    Subtract long-term trend components from the training target.

    The ``y`` column of ``df_train`` is modified IN PLACE. Row positions are
    used as the x axis: ``0 .. len(df_train)-1`` for the training rows and
    the following integers for the test rows.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame; must contain a ``y`` column.
    df_test : pandas.DataFrame
        Test frame; only its length is used.
    baselines : dict or None
        ``None`` disables the step. Recognised keys:

        * ``"stability"``: ``"linear"`` fits ``a * x + b`` to ``y``;
          ``"exp"`` fits a line to ``log(y)`` and removes
          ``exp(log_a) * exp(b * x)``. Other values are ignored.
        * ``"any"``: iterable of callables ``f(x)``; each one is evaluated
          on the x axis and subtracted.

        Other keys are ignored.

    Returns
    -------
    dict or None
        ``{"train": ..., "test": ...}`` holding the total offset removed from
        the training rows and the same baselines evaluated on the test rows,
        or ``None`` when ``baselines`` is ``None``.
    '''
    if baselines is None:
        return None
    train_offset = np.zeros(len(df_train))
    test_offset = np.zeros(len(df_test))
    x_train = np.arange(len(df_train))
    x_test = np.arange(len(df_test)) + len(df_train)
    # `.values` replaces `.as_matrix()`, which no longer exists in pandas >= 1.0.
    # The fit always uses the target as it was on entry.
    y = df_train["y"].values

    if "stability" in baselines:
        mode = baselines["stability"]
        trend = None
        if mode == "linear":
            # Linear approximation of the drift towards the steady state.
            a, b = np.polyfit(x_train, y, 1)
            trend = lambda x: a * x + b
        elif mode == "exp":
            # Exponential approximation: straight-line fit in log space.
            b, log_a = np.polyfit(x_train, np.log(y), 1)
            trend = lambda x: np.exp(log_a) * np.exp(b * x)
        if trend is not None:
            df_train["y"] = df_train["y"] - trend(x_train)
            train_offset = train_offset + trend(x_train)
            test_offset = test_offset + trend(x_test)

    if "any" in baselines:
        # Caller-supplied baseline functions of the row position.
        for offset_func in baselines["any"]:
            df_train["y"] = df_train["y"] - offset_func(x_train)
            train_offset = train_offset + offset_func(x_train)
            test_offset = test_offset + offset_func(x_test)

    return {"train": train_offset, "test": test_offset}

def preprocess_(df_train, df_test, n_chops=0, menus=True, standardize=True, pred_target="y"):
    '''
    Apply dummy encoding, missing-value handling and optional feature steps.

    The train and test frames are concatenated (train first), the first
    ``n_chops`` rows are discarded, every transformation is applied to the
    combined frame, and the result is split back into train and test parts.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Raw frames as returned by ``load_data``.
    n_chops : int
        Number of leading rows of the combined frame to discard.
    menus : bool
        When true, add the ``menu_*`` indicator columns via
        ``add_menu_columns``.
    standardize : bool
        When true, call ``standardize_`` on every column except
        ``pred_target``.
    pred_target : str
        Column excluded from standardization.

    Returns
    -------
    tuple
        ``(df_train, df_test)``. The train part has ``name`` and ``datetime``
        removed; the test part has ``name`` and ``y`` removed (``datetime``
        is kept).
    '''
    train_length = len(df_train) - n_chops
    df_combined = pd.concat([df_train, df_test], axis=0).iloc[n_chops:]

    # Dummy-encode the categorical columns.
    df = categorical_to_dummy(df_combined, ["remarks", "event", "weather", "week", "payday"])

    # Disabled feature kept for reference: days elapsed since the first row.
    #   first_day = df.iloc[0]["datetime"]
    #   df["days_offset"] = df.datetime.apply(lambda dt: (dt - first_day).days)

    # Replacement: "--" in precipitation becomes 0 and the column is cast to float.
    df = replace_series(df, "precipitation", "--", 0, to_type=float)
    # Missing kcal values are filled with the largest observed kcal.
    # Series.max() skips NaN, so this matches the previous row-wise
    # implementation without two Python-level passes over the frame, and it
    # no longer raises when the column holds no value at all.
    df["kcal"] = df["kcal"].fillna(df["kcal"].max())

    # Menu keyword indicators.
    if menus:
        df = add_menu_columns(df)

    # Standardization of every column but the prediction target
    # (standardize_ itself skips columns that are not int/float).
    if standardize:
        for name in [colname for colname in df.columns if colname != pred_target]:
            standardize_(df, name)

    # Split back into the train and test parts.
    df_train = df.iloc[0:train_length].drop(["name", "datetime"], axis=1)
    df_test = df.iloc[train_length:].drop(["name", "y"], axis=1)
    return (df_train, df_test)

def standardize_(df, column):
    '''
    Standardize one column of ``df``; the column is overwritten in place.

    Only columns whose dtype compares equal to the builtin ``int`` or
    ``float`` are processed; any other column is left untouched. The values
    are cast to float, reshaped to a single-feature 2-D array and passed
    through a freshly created ``StandardScaler``.

    Returns ``None``.
    '''
    dtype = df[column].dtype
    if not (dtype == int or dtype == float):
        return
    # StandardScaler expects a 2-D array of shape (n_samples, n_features).
    as_feature_matrix = df[column].astype(float).values.reshape(-1, 1)
    df[column] = StandardScaler().fit_transform(as_feature_matrix)

### メニュー処理

In [3]:
# Keyword table used to derive menu attributes from a dish name.
# Each key is an attribute; its value lists the substrings that indicate it.
# Insertion order is significant: it fixes the order of the derived
# "menu_<key>" columns.
# NOTE(review): the key spelling "misterious" and the duplicated "ペッパー"
# entry under "spicy" are reproduced as-is; the keys become column names.
menus = OrderedDict([
    # --- cooking method ---
    ("fried", ["フライ", "メンチ", "カツ", "唐揚", "から揚", "酢豚", "天ぷら", "コロッケ"]),  # deep-fried
    ("grilled", ["焼"]),  # grilled / baked
    ("simmered", ["肉じゃが", "煮"]),  # simmered
    ("stirred", ["炒め", "チャンプル"]),  # stir-fried
    ("soup", ["カレー", "シチュー", "ハヤシ"]),  # roux-based dishes
    # --- seasoning ---
    ("spicy", ["辛", "ピリ辛", "マーボ", "タンドリー", "ペッパー", "カレー", "チリ", "マスタード", "ペッパー", "チャプチェ", "プルコギ", "キムチ", "韓国", "チリソース"]),  # spicy
    ("sweetspicy", ["照り焼", "甘酢", "すき焼き", "スキヤキ", "牛丼", "甘辛", "スイートチリ"]),  # sweet and salty/spicy
    # --- ingredients ---
    ("fish", ["イカ", "いか", "白身魚", "カキ", "さんま", "カレイ", "サバ", "海老", "エビ", "メダイ", "さわら", "ホタテ", "ます", "サーモン", "アジ", "キス", "かじき", "ぶり"]),  # fish / seafood
    ("meat", ["肉", "ヒレ", "メンチ", "ロース", "カツ", "ひれかつ", "ハムカツ", "ソーセージ", "カルビ", "鶏", "親子", "チキン", "豚", "ポーク", "ビーフ", "牛", "しゃぶ", "シャブ", "ハンバーグ", "ロコモコ", "プルコギ", "バーベキュー", "キーマ", "ベーコン"]),  # meat
    ("vegetable", ["野菜", "豆腐", "じゃが", "レモン", "おろし", "きのこ", "キャベツ", "青梗菜", "トマト", "筍", "茄子", "ゴーヤ"]),  # vegetables
    ("rice", ["ご飯", "御飯", "丼"]),  # rice dishes
    # --- style ---
    ("japanese", ["和風", "五目", "炊き込み", "ご飯", "うどん", "ねぎ", "胡麻", "味噌", "筑前煮"]),  # Japanese style
    ("chinese", ["マーボ", "酢豚", "チンジャオロース", "青椒肉絲", "中華丼", "八宝菜"]),  # Chinese style
    ("large", ["ビッグ", "ジャンボ", "スタミナ", "マヨ", "たっぷり", "ニンニク", "厚切", "ビュッフェ"]),  # hearty / large portion
    # --- mood ---
    ("cool", ["レモン", "冷", "おろし", "塩"]),  # light / refreshing
    ("heartful", ["手作り", "手ごね"]),  # handmade
    ("fluffy", ["クリーミー", "やわらか", "ジューシー"]),  # soft / fluffy
    # --- other ---
    ("trendy", ["ボローニャ", "クノーデル", "ストロガノフ", "ムニエル", "フリカッセ", "洋食屋さん", "香草焼き", "デミソース", "コーンクリーム"]),  # fancy
    ("junky", ["チーズ", "ナッツ"]),  # junk-food-like
    ("misterious", ["山賊焼き", "タルタル", "サムジョン"]),  # mysterious-sounding
])

def add_menu_columns(df):
    '''
    Add the menu attribute columns to ``df``.

    For every entry of the module-level ``menus`` table, a column named
    ``"menu_" + <key>`` is appended via ``add_column_contains``, computed
    from the ``name`` column and that entry's keyword list.

    Returns the frame produced by the last ``add_column_contains`` call
    (``df`` itself when ``menus`` is empty).
    '''
    result = df
    for attr, fragments in menus.items():
        result = add_column_contains(result, "name", fragments, "menu_" + attr)
    return result

# Menu column names: one "menu_<key>" entry per key of `menus`, in the
# table's iteration order.
menu_columns = ["menu_" + attr for attr in menus]

### データユーティリティ

In [4]:
def categorical_to_dummy(df, columns, drop_original=True, drop_first=False):
    '''
    Convert nominal (categorical) columns to dummy indicator columns.

    Works on a copy of ``df`` whose index has been reset to ``0..n-1``; the
    input frame is not modified. For each listed column the dummies created by
    ``pandas.get_dummies`` are appended and renamed ``<column>1``,
    ``<column>2``, ... in the order ``get_dummies`` returns them.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    columns : iterable of str
        Names of the columns to encode.
    drop_original : bool
        When true, the encoded source column is removed from the result.
    drop_first : bool
        Passed through to ``pandas.get_dummies``.

    Returns
    -------
    pandas.DataFrame
        New frame with the dummy columns appended.
    '''
    df_copy = df.reset_index(drop=True)
    for column in columns:
        # `.loc` replaces the `.ix` indexer, which no longer exists in pandas >= 1.0.
        dummies = pd.get_dummies(df_copy.loc[:, column], drop_first=drop_first)
        dummies.columns = [column + str(i + 1) for i in range(len(dummies.columns))]
        df_copy = pd.concat([df_copy, dummies], axis=1)
        if drop_original:
            df_copy = df_copy.drop(column, axis=1)
    return df_copy

def replace_series(df, column, src, dst, to_type=None):
    '''
    Replace a specific value in one column and optionally cast the column.

    Works on a copy; the input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    column : str
        Name of the column to process.
    src : object
        Value to look for (compared with ``==``).
    dst : object
        Value written wherever ``src`` is found.
    to_type : type or None
        When not ``None``, the column is cast to this type after replacement.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the replacement (and cast) applied.
    '''
    df_copy = df.copy()
    # Plain column access replaces the `.ix` indexer, which no longer exists
    # in pandas >= 1.0.
    df_copy[column] = df_copy[column].apply(lambda data: dst if (data == src) else data)
    if to_type is not None:
        df_copy = df_copy.astype({column: to_type})
    return df_copy

def add_column_contains(df, column, fragments, colname):
    '''
    Append a 0/1 column telling whether a column's value contains any of the
    given fragments.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame (not modified).
    column : str
        Name of the column whose values are searched.
    fragments : iterable
        Candidates tested with the ``in`` operator against each value.
    colname : str
        Name of the new indicator column.

    Returns
    -------
    pandas.DataFrame
        New frame: ``df`` with the indicator column appended.
    '''
    # Plain column access replaces the `.ix` indexer, which no longer exists
    # in pandas >= 1.0.
    series = df[column].apply(lambda data: 1 if in_list_(data, fragments) else 0).rename(colname)
    return pd.concat([df, series], axis=1)


def in_list_(target, fragments):
    '''Return True when at least one element of ``fragments`` is ``in`` ``target``.'''
    return any(fragment in target for fragment in fragments)

def print_uniques(df, column):
    '''
    Print a summary and the unique values of one column.

    Output, in order: a header line with the column name and the number of
    rows of ``df``, the result of ``describe()`` for the column, a separator,
    every unique value on its own line, and a closing rule.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    column : str
        Name of the column to report on.
    '''
    # Single-argument parenthesised print behaves identically as the
    # Python 2 print statement and as the Python 3 print function.
    print("======= {col_name}({length}) =======".format(col_name=column, length=len(df)))
    print(df[column].describe())
    print("------")
    for item in df[column].unique():
        print(item)
    print("================")

In [33]:
#import matplotlib.pyplot as plot;
#n_chops = 100;
#X, y, target, y_offset = prepare_dataset(drop_columns=[], n_chops=n_chops, menus=True, baselines={"stability": "exp", "season": "sin"});
#plot.plot(range(len(X)), y)
#plot.show();
#print y_offset