## インポート

In [None]:
# パッケージの読み込み
import pandas as pd
import numpy as np
import datetime as dt
import time
from sklearn import linear_model
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.tsa.tsatools import lagmat
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import ElasticNet, Ridge

# Let pandas display up to 500 columns so wide DataFrames are not elided in cell output.
pd.set_option('display.max_columns', 500)

# 特徴量エンジニアリング

**get_column_by_target_type(df, target_type)** : df の列のうち、dtype が target_type（型のリスト）に含まれるものの列名をリストで返す

**one_hot_encoding_by_target_column(df, category_column)** : df 内の category_column（列名のリスト）を OneHotEncoder でダミー変数に変換し、新しい列名のリストとダミー変数の DataFrame を (columns, DataFrame) の順で返す

**make_column_by_category_and_numeric(df, category_column, numeric_column, operation)** : df 内の category_column と numeric_column を組み合わせた新たな特徴量を生成する。具体的には、category_column でグループ化した numeric_column に operation（集計関数名のリスト）を適用し、得られた統計量を各行に列として追加した DataFrame を返す。

## 特徴量

**get_column_by_target_type(df, target_type)** : df の列のうち、dtype が target_type（型のリスト）に含まれるものの列名をリストで返す

In [None]:
# dependency
# - pandas
def get_column_by_target_type(df, target_type):
  """Return the labels of the columns of ``df`` whose dtype is in ``target_type``.

  Args:
    df: pandas DataFrame to inspect.
    target_type: Container of dtypes (e.g. ``[object]``). Each column's dtype
      is tested against it with the ``in`` operator.

  Returns:
    list: Matching column labels in their original column order. When ``df``
    has duplicated labels, a label appears once per matching column.
  """
  # Read the dtypes from ``df.dtypes`` instead of ``df[col].dtype``: with
  # duplicated column labels ``df[col]`` is a DataFrame, which has no ``dtype``
  # attribute, so the per-column lookup would raise AttributeError.
  return [col for col, dtype in df.dtypes.items() if dtype in target_type]

# Sample usage: load the sample CSV (first column as the index, sorted by it),
# add a constant string column "d", then print the columns matching ``object``.
df = pd.read_csv('https://raw.githubusercontent.com/blue-eagle/estyle_data/master/day1_yc_data.csv',parse_dates=['date'], index_col=0).sort_index()
df["d"] = "s"
print(get_column_by_target_type(df, [object]))

['d']


**one_hot_encoding_by_target_column(df, category_column)** : df 内の category_column（列名のリスト）を OneHotEncoder でダミー変数に変換し、新しい列名のリストとダミー変数の DataFrame を (columns, DataFrame) の順で返す

In [None]:
# dependency
# - pandas
# - sklearn.preprocessing.OneHotEncoder
def one_hot_encoding_by_target_column(df, category_column):
  """One-hot encode the given columns of ``df`` with ``OneHotEncoder``.

  Args:
    df: Source pandas DataFrame. It is not modified.
    category_column: List of column names to encode. A single column name
      given as a string is also accepted.

  Returns:
    tuple: ``(columns, encoded)`` where ``columns`` is the list of generated
    column names, formatted ``"<column>_<category>"``, and ``encoded`` is a
    DataFrame of the dummy variables that shares the index of ``df`` so it
    can be aligned or concatenated with it directly.
  """
  # A bare string would select a Series (rejected by the encoder as 1-D input)
  # and would be iterated character by character when naming the columns.
  if isinstance(category_column, str):
    category_column = [category_column]
  else:
    category_column = list(category_column)

  # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4 removed the
  # old keyword. Try the new spelling first, fall back for older releases.
  try:
    ohe = OneHotEncoder(sparse_output=False, categories='auto')
  except TypeError:
    ohe = OneHotEncoder(sparse=False, categories='auto')

  # The encoder does not modify its input, so no defensive copy is needed.
  encoded = ohe.fit_transform(df[category_column])

  # ``categories_[i]`` holds the categories found for the i-th encoded column,
  # in the same order as the output columns of the encoder.
  columns = []
  for col, categories in zip(category_column, ohe.categories_):
    columns.extend(f'{col}_{v}' for v in categories)

  # Keep the caller's index; a fresh RangeIndex would misalign the dummy
  # variables with ``df`` whenever ``df`` has a non-default index.
  return columns, pd.DataFrame(encoded, columns=columns, index=df.index)

# Sample usage: load the sample CSV (first column as the index, sorted by it),
# add three constant string columns, one-hot encode them together with
# "holiday", and display the first rows of the encoded frame.
df = pd.read_csv('https://raw.githubusercontent.com/blue-eagle/estyle_data/master/day1_yc_data.csv',parse_dates=['date'], index_col=0).sort_index()
df["d"] = "s"
df["e"] = "s"
df["f"] = "s"
columns, DF = one_hot_encoding_by_target_column(df, ["d", "e", "f", "holiday"])
DF.head()

Unnamed: 0,d_s,e_s,f_s,holiday_False,holiday_True
0,1.0,1.0,1.0,1.0,0.0
1,1.0,1.0,1.0,0.0,1.0
2,1.0,1.0,1.0,1.0,0.0
3,1.0,1.0,1.0,1.0,0.0
4,1.0,1.0,1.0,1.0,0.0


**make_column_by_category_and_numeric(df, category_column, numeric_column, operation)** : df 内の category_column と numeric_column を組み合わせた新たな特徴量を生成する。具体的には、category_column でグループ化した numeric_column に operation（集計関数名のリスト）を適用し、得られた統計量を各行に列として追加した DataFrame を返す。

In [None]:
# dependency
# - pandas
def make_column_by_category_and_numeric(df, category_column, numeric_column, operation):
  """Add per-category statistics of a numeric column as new feature columns.

  ``numeric_column`` is grouped by ``category_column`` and aggregated with each
  entry of ``operation``; every row then receives the statistics of its own
  category.

  Args:
    df: Source pandas DataFrame. It is not modified.
    category_column: Name of the column to group by.
    numeric_column: Name of the column to aggregate.
    operation: List of aggregations accepted by ``GroupBy.agg`` (for example
      ``["sum", "mean"]``). A single aggregation is also accepted.

  Returns:
    DataFrame: ``df`` plus one column per aggregation, named
    ``"(<category_column>)*(<numeric_column>)_<operation>"``. Row order and
    the index of ``df`` are preserved.
  """
  # A bare aggregation such as "mean" would make ``agg`` return a Series and
  # would be iterated character by character when naming the new columns.
  if isinstance(operation, str) or callable(operation):
    operation = [operation]
  else:
    operation = list(operation)

  df_gp = df.groupby(category_column)[numeric_column].agg(operation)
  # Name callables by ``__name__`` so the label does not embed an object repr.
  df_gp.columns = [
    f'({category_column})*({numeric_column})_'
    f'{ope if isinstance(ope, str) else getattr(ope, "__name__", str(ope))}'
    for ope in operation
  ]

  # Join on the group index rather than ``on=``: merging on a column discards
  # the left frame's index (e.g. a date index), while ``left_on`` together
  # with ``right_index=True`` keeps it.
  return pd.merge(df, df_gp, left_on=category_column, right_index=True, how='left')

# Sample usage: load the sample CSV (first column as the index, sorted by it),
# add the sum/mean/max/min of "Max_Temp" per "holiday" value as new columns,
# and display the first rows.
df = pd.read_csv('https://raw.githubusercontent.com/blue-eagle/estyle_data/master/day1_yc_data.csv',parse_dates=['date'], index_col=0).sort_index()
df = make_column_by_category_and_numeric(df, "holiday", 'Max_Temp', ["sum", 'mean', 'max', 'min'])
df.head()

Unnamed: 0,num_rides,holiday,Max_Temp,Min_Temp,Rain_in,Snow&Hail_in,obs_time_ Snow&Hail_in,(holiday)*(Max_Temp)_sum,(holiday)*(Max_Temp)_mean,(holiday)*(Max_Temp)_max,(holiday)*(Max_Temp)_min
0,281263.0,False,48,40,0.001,0.0,0.0,67123,63.026291,96,13
1,224893.0,True,41,37,0.21,0.001,0.0,1700,56.666667,90,14
2,278099.0,False,43,39,0.58,0.0,0.0,67123,63.026291,96,13
3,289301.0,False,52,34,0.0,0.0,0.0,67123,63.026291,96,13
4,324421.0,False,34,27,0.0,0.0,0.0,67123,63.026291,96,13
