## 1. データの準備
- データの読み込み
- 特徴量Xと正解ラベルyの設定

#### ●データの読み込み

In [48]:
# 必要なライブラリを読み込む
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from IPython.core.display import display

# Load the training CSV (cp932-encoded; the first row is used as the header).
df_data = pd.read_csv(
    'data.csv',
    encoding='cp932',
    quotechar='"',
    header=0,
)

# Sanity check that the load succeeded: preview the first rows, then the shape.
display(df_data.head(5))
display(df_data.shape)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


(9532, 16)

#### ●特徴量Xと正解ラベルyの設定

In [49]:
# Split the frame into the features X (every column except the last) and the
# target y (the last column).
# X is taken as an explicit copy: later cells add and overwrite columns on X,
# and doing that on a bare iloc slice of df_data triggers pandas'
# SettingWithCopyWarning because it is ambiguous whether df_data is modified.
X = df_data.iloc[:, 0:-1].copy()
y = df_data.iloc[:, -1]

# Sanity check: preview and shape of both pieces.
display(X.head(5), X.shape)
display(y.head(5), y.shape)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916


(9532, 15)

0    46135
1    40650
2    36350
3    29450
4    34500
Name: MSRP, dtype: int64

(9532,)

In [50]:
# Most frequent value of 'Engine Fuel Type'; as the cell's last expression it
# is shown as the cell output.
X['Engine Fuel Type'].mode()

0    regular unleaded
dtype: object

In [51]:
# Print a summary of every column of X: its position, name and dtype, then
# either the list of distinct values (object columns) or the number of
# distinct values, min, max and missing-value count (all other columns).
for i, (name, column) in enumerate(X.items()):
    print('%d : %s : %s' % (i, name, column.dtype))

    if column.dtype == object:
        print(list(column.unique()))
    else:
        # Count the missing values on this column only, instead of
        # recomputing X.isnull().sum() for the whole frame on every iteration
        # and then indexing that label-indexed Series by integer position.
        print('種類：%d、最小値：%f、最大値：%f、欠損データ有無：%s' % (
            len(column.unique()), column.min(), column.max(),
            column.isnull().sum()))

0 : Make : object
['BMW', 'Audi', 'FIAT', 'Mercedes-Benz', 'Chrysler', 'Nissan', 'Volvo', 'Mazda', 'Mitsubishi', 'Ferrari', 'Alfa Romeo', 'Toyota', 'McLaren', 'Maybach', 'Pontiac', 'Porsche', 'Saab', 'GMC', 'Hyundai', 'Plymouth', 'Honda', 'Oldsmobile', 'Suzuki', 'Ford', 'Cadillac', 'Kia', 'Bentley', 'Chevrolet', 'Dodge', 'Lamborghini', 'Lincoln', 'Subaru', 'Volkswagen', 'Spyker', 'Buick', 'Acura', 'Rolls-Royce', 'Maserati', 'Lexus', 'Aston Martin', 'Land Rover', 'Lotus', 'Infiniti', 'Scion', 'Genesis', 'HUMMER', 'Tesla', 'Bugatti']
1 : Model : object
['1 Series M', '1 Series', '100', '124 Spider', '190-Class', '2 Series', '200', '200SX', '240SX', '240', '2', '3 Series Gran Turismo', '3 Series', '300-Class', '3000GT', '300', '300M', '300ZX', '323', '350-Class', '350Z', '360', '370Z', '3', '4 Series Gran Coupe', '4 Series', '400-Class', '456M', '458 Italia', '4C', '4Runner', '5 Series Gran Turismo', '5 Series', '500-Class', '500e', '500', '500L', '500X', '560-Class', '570S', '575M', '57'

## 2. データの加工
- 欠損状況の確認
- 基本統計量の確認
- yearに関する新しい特徴量の作成
- 欠損値の補完
- 使用する特徴量の選択

#### ●欠損状況の確認

In [52]:
# Count the missing values in every column of the raw frame.
display(df_data.isna().sum())

Make                    0
Model                   0
Year                    0
Engine Fuel Type        3
Engine HP              53
Engine Cylinders       22
Transmission Type       0
Driven_Wheels           0
Number of Doors         6
Market Category      2999
Vehicle Size            0
Vehicle Style           0
highway MPG             0
city mpg                0
Popularity              0
MSRP                    0
dtype: int64

#### ●基本統計量の確認

In [53]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns; shown as the cell output.
df_data.describe()

Unnamed: 0,Year,Engine HP,Engine Cylinders,Number of Doors,highway MPG,city mpg,Popularity,MSRP
count,9532.0,9479.0,9510.0,9526.0,9532.0,9532.0,9532.0,9532.0
mean,2010.399077,249.478637,5.632387,3.435335,26.60512,19.720835,1556.40726,40783.78
std,7.549785,109.239858,1.786855,0.881758,8.306401,8.906915,1443.035732,62641.47
min,1990.0,55.0,0.0,2.0,12.0,7.0,2.0,2000.0
25%,2007.0,170.0,4.0,2.0,22.0,16.0,549.0,21143.75
50%,2015.0,227.0,6.0,4.0,26.0,18.0,1385.0,29995.0
75%,2016.0,300.0,6.0,4.0,30.0,22.0,2009.0,42220.0
max,2017.0,1001.0,16.0,4.0,111.0,137.0,5657.0,2065902.0


#### ●yearに関する特徴量の作成
- 特徴量作成の1つの例として、yearのデータをもとに車両が販売されてからの経過年数という新しい特徴量を生成する。

In [54]:
# Create a new feature: the number of years elapsed since the vehicle's
# model year, measured from the reference year 2017.
X['Duration Since Production'] = 2017 - X['Year']
display(X.head(5))

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,Duration Since Production
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,6
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,6
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,6
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,6
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,6


#### <font color=blue>・マーケットカテゴリをone-hot化</font>

In [55]:
# Market categories that appear, comma-separated, in 'Market Category'.
MC = ['Luxury','High-Performance','Performance','Flex Fuel','Hatchback','Hybrid','Diesel','Factory Tuner','Exotic','Crossover']
for i in MC:
    # One 0/1 indicator column per category.
    #   regex=False -> the category name is matched literally, not as a regex
    #   na=False    -> rows with a missing 'Market Category' become 0 directly
    #   astype(int) -> a real integer column; the previous `bool * 1` followed
    #                  by fillna(0) produced an object column whose conversion
    #                  to numbers relied on pandas' deprecated fillna
    #                  downcasting.
    # NOTE(review): this is a substring test, so 'Performance' is also set for
    # rows tagged 'High-Performance'. Kept as-is so the feature values do not
    # change - confirm whether exact matching was intended.
    X['Market Category '+i] = (
        X['Market Category'].str.contains(i, regex=False, na=False).astype(int)
    )

# The raw text column is no longer needed once the indicators exist.
X = X.drop('Market Category',axis=1)

#### <font color=blue>・Vehicle Sizeを順序付きの数値に変換</font>

In [56]:
# Encode the vehicle size as an integer code: Compact=1, Midsize=2, Large=3.
VS_dict = dict(Compact=1, Midsize=2, Large=3)
X['Vehicle Size'] = X['Vehicle Size'].map(VS_dict)

#### ●欠損値の補完

In [57]:
# Fill the missing values of the numeric columns (Engine HP, Engine Cylinders,
# Number of Doors) with each column's median.
# numeric_only=True restricts the median to numeric columns: X still contains
# string columns (Make, Model, ...), and without the flag pandas >= 2.0 raises
# a TypeError instead of silently skipping them.
X = X.fillna(X.median(numeric_only=True))

# Check the result. Only numeric columns are filled here; missing values in
# string columns such as 'Engine Fuel Type' remain.
display(X.isnull().sum())

Make                                0
Model                               0
Year                                0
Engine Fuel Type                    3
Engine HP                           0
Engine Cylinders                    0
Transmission Type                   0
Driven_Wheels                       0
Number of Doors                     0
Vehicle Size                        0
Vehicle Style                       0
highway MPG                         0
city mpg                            0
Popularity                          0
Duration Since Production           0
Market Category Luxury              0
Market Category High-Performance    0
Market Category Performance         0
Market Category Flex Fuel           0
Market Category Hatchback           0
Market Category Hybrid              0
Market Category Diesel              0
Market Category Factory Tuner       0
Market Category Exotic              0
Market Category Crossover           0
dtype: int64

#### <font color=blue>・YearとModelを削除する</font>

In [58]:
# Remove the raw 'Year' and 'Model' columns from the feature frame.
X = X.drop(columns=['Year', 'Model'])

#### <font color=blue>・One-hot-Vector化する</font>

In [59]:
# One-hot encode the remaining string (object) columns of X; numeric columns
# are passed through unchanged.
X_ohe = pd.get_dummies(X)

#### ・カテゴリをリスト化する

In [60]:
# from sklearn.preprocessing import LabelEncoder
# for i in range(X_complement.shape[1]):
#     if X_complement.iloc[:,i].dtypes == object:
#         lbl = LabelEncoder()
#         lbl.fit(list(X_complement.iloc[:,i].values)
#         lbl.transform(list(X_complement .iloc[:,i].values))

#### ●使用する特徴量の選択

In [61]:
# # 特徴量を選択する
# X_choice = X_complement[['Engine HP', 'Engine Cylinders',
#                          'highway MPG', 'city mpg',
#                          'Popularity', 'Duration Since Production']]

# # 使用する特徴量の選択の正常終了を確認する
# display(X_choice.head(5))

In [None]:
import matplotlib.pyplot as plt
# from sklearn.svm import SVC
# from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier  # existing import, no longer used in this cell
from sklearn.ensemble import RandomForestRegressor

# Recursive feature elimination with cross-validation (RFECV).
# The target y is a continuous price, so the estimator that ranks the features
# must be a regressor. A classifier would treat every distinct price as its
# own class, which makes both the ranking and the cross-validation score
# meaningless.
# step=.05 removes 5% of the features per elimination round.
selector = RFECV(estimator=RandomForestRegressor(random_state=1), step=.05)
selector.fit(X_ohe, y)

# Keep only the columns that RFECV marked as selected.
train_cols = X_ohe.columns.values
X_choice = X_ohe.loc[:, train_cols[selector.support_]]

# # SVMによる分類
# estimator = SVC(kernel='linear')

# # # 5分割交差検証
# # cv = StratifiedKFold(5)

# # 特徴量削減
# rfecv = RFECV(estimator, scoring='accuracy', step=1)

# # 学習
# rfecv.fit(X_choice, y)

# print('Feature ranking: \n{}'.format(rfecv.ranking_))



## 3. 学習器の作成と評価
- ホールドアウトによる学習用データ、検証用データの分割
- 線形回帰による予測モデル作成
- 作成したモデルによる予測値算出
- RMSEによるモデル評価

#### ●ホールドアウトによる学習用データ、検証用データの分割

In [29]:
from sklearn.preprocessing import StandardScaler

# Standardize every one-hot encoded feature to zero mean and unit variance.
scl  = StandardScaler()
# fit_transform returns a bare NumPy array by default. Wrap it back into a
# DataFrame with the original labels, because later cells read
# X_train.columns, which does not exist on an array.
X_choice = pd.DataFrame(scl.fit_transform(X_ohe),
                        columns=X_ohe.columns,
                        index=X_ohe.index)
# NOTE(review): this scales X_ohe, so it replaces any X_choice produced by the
# RFECV cell above, and the scaler is fit on all rows before the train/test
# split. Confirm that both are intended.

In [30]:
# Hold-out split: 10% of the rows are set aside for validation, with a fixed
# seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_choice, y, random_state=0, test_size=0.10)

# Sanity check: show the shape of each of the four pieces in turn.
display(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(8578, 101)

(954, 101)

(8578,)

(954,)

In [47]:
from sklearn.ensemble import RandomForestRegressor
# rf = RandomForestRegressor(n_estimators=200, max_features='auto',max_depth = 20)
# Random forest of 200 trees that considers 20 candidate features per split.
# random_state is fixed so the RMSE values below are reproducible between
# runs, in line with the seeded train/test split.
rf = RandomForestRegressor(n_estimators=200, max_features=20, random_state=0)
rf.fit(X_train, y_train)

# RMSE on the training data, then on the validation data.
display(np.sqrt(mean_squared_error(y_train, rf.predict(X_train))))
display(np.sqrt(mean_squared_error(y_test, rf.predict(X_test))))

8028.414434912795

8882.411008138602

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Importance of each input feature according to the fitted random forest.
feature = rf.feature_importances_

# X_train is a plain NumPy array when it comes straight from the scaler
# output, and an array has no .columns. In that case take the labels from
# X_ohe, the frame the scaled matrix was built from (same column order).
names = X_train.columns.values if hasattr(X_train, 'columns') else X_ohe.columns.values

# One row per feature (position, name, importance), most important first.
f = pd.DataFrame({'number': range(len(feature)),
                  'name': names,
                  'feature': feature}).sort_values('feature', ascending=False)

# Horizontal bar chart of the 20 most important features, labelled
# "<position> <name>".
top = f.head(20)
sns.barplot(x=top['feature'], y=top['number'].astype(str) + ' ' + top['name'])
plt.show()

In [None]:
import lightgbm as lgb

In [None]:
# LightGBM gradient-boosting regressor.
# The seed keyword was misspelled ('randam_state'); LGBMRegressor accepts
# arbitrary extra keyword arguments, so the typo was passed through and the
# seed was never applied.
model = lgb.LGBMRegressor(n_estimators=500, reg_lambda=0.1, random_state=100)
model.fit(X_train, y_train)

# RMSE of the fitted LightGBM model on the training data, then on the
# validation data.
display(np.sqrt(mean_squared_error(y_train, model.predict(X_train))))
display(np.sqrt(mean_squared_error(y_test, model.predict(X_test))))

# Show the predictions for the validation data.
predict_X_test = model.predict(X_test)
display(predict_X_test)

In [None]:
# 事前準備
from keras.models import Sequential
from keras.layers import Dense, Activation

# Create an empty Keras Sequential model; its layers are added in the next cell.
# Note: this rebinds `model`, which previously held the LightGBM regressor.
model = Sequential()

In [None]:
# Small fully connected network: two hidden ReLU layers of 3 units each and a
# single linear output unit for the predicted price.
# X_train.shape[1] gives the number of input features for both a DataFrame and
# a NumPy array; the previous len(X_train.columns) fails on an array.
model.add(Dense(3, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(3, activation='relu'))
model.add(Dense(1, activation='linear'))

# This is a regression task, so train on mean squared error and report mean
# absolute error. The previous binary cross-entropy loss and accuracy metric
# are only meaningful for 0/1 classification targets.
model.compile(optimizer='rmsprop',
              loss='mean_squared_error',
              metrics=['mean_absolute_error'])

# train
# `epochs` replaces the Keras 1 argument `nb_epoch`, which Keras 2+ rejects.
model.fit(X_train, y_train, epochs=10, batch_size=4)

#### ●線形回帰による予測モデル作成

In [None]:
# Fit a linear-regression model on the training split (fit returns the
# estimator itself, so construction and fitting are chained).
linear_regression = linear_model.LinearRegression().fit(X_train, y_train)

# RMSE of the fitted model: training data first, validation data second.
display(*(
    np.sqrt(mean_squared_error(part_y, linear_regression.predict(part_X)))
    for part_X, part_y in ((X_train, y_train), (X_test, y_test))
))

# Show the predictions for the validation data.
predict_X_test = linear_regression.predict(X_test)
display(predict_X_test)

## 4. scoreデータの予測値をcsv出力
- scoreデータの読み込み
- 特徴量Xの設定
- データの加工
- 3.で作成した学習器に対してscoreデータの特徴量から予測値算出
- 予測値のcsv出力

#### ●scoreデータの読み込み

In [None]:
# Load the score CSV (cp932-encoded; the first row is used as the header).
df_score = pd.read_csv('score.csv', header=0, quotechar='"', encoding='cp932')

#### ●特徴量Xの設定

In [None]:
# Use every column of the score data as features.
# An explicit copy is taken because the next cell assigns a new column to
# X_score; doing that on a bare iloc slice of df_score triggers pandas'
# SettingWithCopyWarning.
X_score = df_score.copy()

# Sanity check: preview and shape.
display(X_score.head(5), X_score.shape)

#### ●データの加工

In [None]:
# Prepare the score data with the SAME pipeline as the training data.
# The model was fit on the standardized one-hot matrix built from X_ohe; the
# previous selection of six raw columns does not match that input and makes
# linear_regression.predict fail on the feature count.

# Detach from df_score so the column assignment below is unambiguous.
X_score = X_score.copy()

# Years elapsed since the model year, measured from 2017 (as for training).
X_score['Duration Since Production'] = 2017 - X_score['Year']

# Market-category indicator columns, built exactly like the training ones.
X_score_complement = X_score.copy()
for category in MC:
    X_score_complement['Market Category ' + category] = (
        X_score_complement['Market Category']
        .str.contains(category, regex=False, na=False)
        .astype(int)
    )

# Drop the columns the training pipeline dropped and encode the vehicle size.
X_score_complement = X_score_complement.drop(
    ['Market Category', 'Year', 'Model'], axis=1)
X_score_complement['Vehicle Size'] = X_score_complement['Vehicle Size'].map(VS_dict)

# Fill missing numeric values with the medians of the TRAINING features X, so
# the score rows are imputed with the same values the model was trained on.
# numeric_only=True keeps pandas >= 2.0 from raising on the string columns.
X_score_complement = X_score_complement.fillna(X.median(numeric_only=True))

# One-hot encode, then align to the training columns: categories unseen in
# training are dropped, and training columns absent here are added as 0.
X_score_ohe = pd.get_dummies(X_score_complement).reindex(
    columns=X_ohe.columns, fill_value=0)

# Apply the scaler that was fit on the training data (transform, not fit).
X_score_choice = pd.DataFrame(scl.transform(X_score_ohe),
                              columns=X_ohe.columns,
                              index=X_score_ohe.index)

# Sanity check of the processed score features.
display(X_score_choice.head(5))

#### ●scoreデータの特徴量からラベルの予測値算出

In [None]:
# Predict the label for every row of the score data with the fitted
# linear-regression model.
predict_X_score = linear_regression.predict(X_score_choice)

# Show the predictions, then how many there are.
display(predict_X_score)
display(predict_X_score.shape)

#### ●予測値のcsv出力

In [None]:
# Write the score predictions to predict_X_score.csv, formatted with five
# decimal places.
np.savetxt("predict_X_score.csv", predict_X_score, delimiter=",", fmt='%.5f')