## 1. データの準備
- データの読み込み
- 特徴量Xと正解ラベルyの設定

#### ●データの読み込み

In [11]:
# 必要なライブラリを読み込む
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from IPython.core.display import display

# csvファイルを読み込む
df_data = pd.read_csv('data.csv', header=0, quotechar='"', encoding='cp932')

# csvファイルの読み込みの正常終了を確認する
display(df_data.head(5), df_data.shape)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


(9532, 16)

In [12]:
df_data.columns

Index(['Make', 'Model', 'Year', 'Engine Fuel Type', 'Engine HP',
       'Engine Cylinders', 'Transmission Type', 'Driven_Wheels',
       'Number of Doors', 'Market Category', 'Vehicle Size', 'Vehicle Style',
       'highway MPG', 'city mpg', 'Popularity', 'MSRP'],
      dtype='object')

#### ●特徴量Xと正解ラベルyの設定

In [13]:
# 特徴量Xと正解ラベルyの設定
X = df_data.iloc[:, 0:-1]
y = df_data.iloc[:, -1]

# 特徴量Xと正解ラベルyの設定の正常終了を確認する
display(X.head(5), X.shape)
display(y.head(5), y.shape)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916


(9532, 15)

0    46135
1    40650
2    36350
3    29450
4    34500
Name: MSRP, dtype: int64

(9532,)

## 2. データの加工
- 欠損状況の確認
- 基本統計量の確認
- yearに関する新しい特徴量の作成
- 欠損値の補完
- 使用する特徴量の選択

#### ●欠損状況の確認

In [14]:
display(df_data.isnull().sum())

Make                    0
Model                   0
Year                    0
Engine Fuel Type        3
Engine HP              53
Engine Cylinders       22
Transmission Type       0
Driven_Wheels           0
Number of Doors         6
Market Category      2999
Vehicle Size            0
Vehicle Style           0
highway MPG             0
city mpg                0
Popularity              0
MSRP                    0
dtype: int64

In [15]:
df_data.dtypes

Make                  object
Model                 object
Year                   int64
Engine Fuel Type      object
Engine HP            float64
Engine Cylinders     float64
Transmission Type     object
Driven_Wheels         object
Number of Doors      float64
Market Category       object
Vehicle Size          object
Vehicle Style         object
highway MPG            int64
city mpg               int64
Popularity             int64
MSRP                   int64
dtype: object

#### ●yearに関する特徴量の作成
- 特徴量作成の1つの例として、yearのデータをもとに車両が販売されてからの経過年数という新しい特徴量を生成する。

In [18]:
# 2017年を基準に車両が販売されてからの経過年数を新しい特徴量として生成する
X['Duration Since Production'] = 2017 - X['Year']
display(X.head(5))

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,Duration Since Production
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,6
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,6
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,6
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,6
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,6


#### ●欠損値の補完

In [19]:
# Engine HP, Engine Cylinders, Number of Doorsの欠損値を中央値で補完する
X_complement = X.fillna(X.median())

# 欠損が補完されたことを確認する
display(X_complement.isnull().sum())

Make                            0
Model                           0
Year                            0
Engine Fuel Type                3
Engine HP                       0
Engine Cylinders                0
Transmission Type               0
Driven_Wheels                   0
Number of Doors                 0
Market Category              2999
Vehicle Size                    0
Vehicle Style                   0
highway MPG                     0
city mpg                        0
Popularity                      0
Duration Since Production       0
dtype: int64

In [40]:
# modeの保管
X_complement['Market Category'] = X_complement['Market Category'].fillna(X_complement['Market Category'].mode().iloc[0])
X_complement['Engine Fuel Type'] = X_complement['Engine Fuel Type'].fillna(X_complement['Engine Fuel Type'].mode().iloc[0])

In [49]:
cat_cols = ['Make', 'Model', 'Engine Fuel Type', 'Transmission Type', 'Driven_Wheels', 'Market Category', 'Vehicle Size', 'Vehicle Style']
X_ohe = pd.get_dummies(X_complement, dummy_na=False, columns=cat_cols)

#### ●使用する特徴量の選択

In [51]:
# OHE後の特徴量選択
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier

selector = RFECV(estimator=RandomForestClassifier(random_state=1), step=.05)
selector.fit(X_ohe, y)
train_cols = X_ohe.columns.values
X_fin = X_ohe.loc[:, train_cols[selector.support_]]





In [55]:
X_fin.head()

Unnamed: 0,Year,Engine HP,Engine Cylinders,Number of Doors,highway MPG,city mpg,Popularity,Duration Since Production,Make_Acura,Make_Alfa Romeo,...,Vehicle Style_Convertible,Vehicle Style_Convertible SUV,Vehicle Style_Coupe,Vehicle Style_Crew Cab Pickup,Vehicle Style_Extended Cab Pickup,Vehicle Style_Passenger Minivan,Vehicle Style_Passenger Van,Vehicle Style_Regular Cab Pickup,Vehicle Style_Sedan,Vehicle Style_Wagon
0,2011,335.0,6.0,2.0,26,19,3916,6,0,0,...,0,0,1,0,0,0,0,0,0,0
1,2011,300.0,6.0,2.0,28,19,3916,6,0,0,...,1,0,0,0,0,0,0,0,0,0
2,2011,300.0,6.0,2.0,28,20,3916,6,0,0,...,0,0,1,0,0,0,0,0,0,0
3,2011,230.0,6.0,2.0,28,18,3916,6,0,0,...,0,0,1,0,0,0,0,0,0,0
4,2011,230.0,6.0,2.0,28,18,3916,6,0,0,...,1,0,0,0,0,0,0,0,0,0


## 3. 学習器の作成と評価
- ホールドアウトによる学習用データ、検証用データの分割
- 線形回帰による予測モデル作成
- 作成したモデルによる予測値算出
- RMSEによるモデル評価

#### ●ホールドアウトによる学習用データ、検証用データの分割

In [57]:
from sklearn.preprocessing import StandardScaler
scl  = StandardScaler()
X_scl = scl.fit_transform(X_fin)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [62]:
X_train, X_test, y_train, y_test = train_test_split(X_scl,
                                                    y,
                                                    test_size=0.10,
                                                    random_state=0)

# 学習用データ、検証用データの分割の正常終了を確認する
display(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(8578, 956)

(954, 956)

(8578,)

(954,)

In [71]:
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=400)
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [73]:

y_pred_rf = rf.predict(X_test)
y_true = y_test

In [75]:
from sklearn.metrics import mean_squared_error
import numpy as np

print(np.sqrt(mean_squared_error(y_true, y_pred_rf)))

7634.937651128537


## 4. scoreデータの予測値をcsv出力
- scoreデータの読み込み
- 特徴量Xの設定
- データの加工
- 3.で作成した学習器に対してtestデータの特徴量から予測値算出
- 予測値のcsv出力

In [76]:
# csvファイルを読み込む
df_score = pd.read_csv('score.csv', header=0, quotechar='"', encoding='cp932')

#### ●特徴量Xの設定

In [77]:
# 特徴量Xの設定
X_score = df_score.iloc[:, :]


#### ●データの加工

In [86]:
# yearに関する新しい特徴量の作成
X_score['Duration Since Production'] = 2017 - X_score['Year']

# Engine HP, Engine Cylinders, Number of Doorsの欠損値を中央値で補完する
X_score_complement = X_score.fillna(X_score.median())
# modeの保管
X_score_complement['Market Category'] = X_score_complement['Market Category'].fillna(X_score_complement['Market Category'].mode().iloc[0])
X_score_complement['Engine Fuel Type'] = X_score_complement['Engine Fuel Type'].fillna(X_score_complement['Engine Fuel Type'].mode().iloc[0])

In [87]:
# ohe
X_score_ohe = pd.get_dummies(X_score_complement, dummy_na=False, columns=cat_cols)

In [89]:
df_cols_train = pd.DataFrame(None, columns=train_cols, dtype=float) # 空のdfを準備
X_score_ohe2 = pd.concat([df_cols_train, X_score_ohe])

In [93]:
# testデータセットにあって、full(train+val)データセットにないカラムを削除
X_score_ohe2 = X_score_ohe2.drop(list(set(X_score_ohe.columns.values)-set(X_ohe.columns.values)), axis=1)

# full(train+val)データセットにしかないカラムをtestデータセットに追加し、0埋め
X_score_ohe2.loc[:, list(set(X_ohe.columns.values)-set(X_score_ohe.columns.values))] = \
    X_score_ohe2.loc[:, list(set(X_ohe.columns.values)-set(X_score_ohe.columns.values))].fillna(0, axis=1)

# reindex scoring data to the same shape as the modeling dataset
X_score_ohe2 = X_score_ohe2.reindex(X_ohe.columns.values, axis=1)
X_score_fin = X_score_ohe2.loc[:, train_cols[selector.support_]]

In [95]:
X_score_scl = scl.transform(X_score_fin)

In [97]:
predict_X_score = rf.predict(X_score_scl)

#### ●scoreデータの特徴量からラベルの予測値算出

In [98]:
# score用データのラベルの予測値を表示する
display(predict_X_score, predict_X_score.shape)

array([36371.09166667, 37719.19270833,  2000.        , ...,
       65570.255     , 51394.0656746 , 53239.40997024])

(2382,)

#### ●予測値のcsv出力

In [100]:
np.savetxt("predict_X_score_mkinoshita.csv", predict_X_score, delimiter=",", fmt='%.5f')