In [None]:
# Mount Google Drive first: choose the account, grant access, then confirm that
# "Mounted at /content/drive" is shown.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 訓練データの読み込み
import csv
import numpy  as np
import pandas as pd
import glob


# Directory on the mounted Drive that holds the training CSV files.
path = '/content/drive/My Drive/2023_Nishika_aki_train'

# glob returns matches in arbitrary order; sorting fixes the row order of the
# concatenated frame, so the later seeded train/validation split is the same
# on every run.
all_files = sorted(glob.glob(path + "/*.csv"))
if not all_files:
    # Fail with an explicit message instead of the generic error pd.concat
    # raises when it is handed an empty list.
    raise FileNotFoundError("No training CSV files found in " + path)

# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the warnings recorded for the read_csv line in the saved output
# are presumably mixed-type DtypeWarnings, which this option avoids -- confirm.
li = [pd.read_csv(filename, low_memory=False) for filename in all_files]

# Stack all files into one frame with a fresh 0..n-1 index.
df_train_org = pd.concat(li, axis=0, ignore_index=True)


# Select the columns used for modelling by position: seven feature columns
# followed by the target column (position 27).
selected_positions = [3, 7, 8, 9, 10, 14, 24, 27]
df_train_X_y = df_train_org.iloc[:, selected_positions]


# Snapshot 1: the raw selection, before any cleaning
df_train_X_y.to_csv('/content/drive/My Drive/2023_Nishika_aki_train/all/all_train_X_y_1.csv')


# Replace the text values that cannot be used as numbers by one representative
# number each (still stored as text at this point). Only cells whose whole
# value equals a key are replaced.
ng_to_ok = {
    '1H?1H30': '75',
    '1H30?2H': '105',
    '2H?': '120',
    '30分?60分': '45',
    '2000㎡以上': '2000',
}
df_train_X_y = df_train_X_y.replace(ng_to_ok)


# Short working names: seven features followed by the target ('price').
df_train_X_y.columns = ['post','eki','minute','mad','area','born','bai','price']

# Missing values in the categorical columns become their own '空白' category,
# so they receive a target-encoding value like any other category.
for col in ['eki', 'mad', 'born']:
    df_train_X_y[col] = df_train_X_y[col].fillna('空白')

# Fill missing 'minute' values with the column mean.
# The column is object dtype (numbers mixed with text such as '75' from the
# replacement above), which is why calling .mean() on it directly did not work
# and a hard-coded constant (11.61216821) was used before. Converting to a
# numeric dtype first makes the mean computable. pd.to_numeric raises on any
# value that is not a number, so unexpected text is not silently dropped.
df_train_X_y['minute'] = pd.to_numeric(df_train_X_y['minute'])
df_train_X_y['minute'] = df_train_X_y['minute'].fillna(df_train_X_y['minute'].mean())


# Snapshot 2: after cleaning and imputation
df_train_X_y.to_csv('/content/drive/My Drive/2023_Nishika_aki_train/all/all_train_X_y_2.csv')

# Confirm that no rows were dropped (row counts must match)
print(df_train_org.shape)
print(df_train_X_y.shape)


# Target encoding: for each categorical feature, build a table
# "category -> mean price" and then replace the category by that mean.
# NOTE(review): the tables are computed on the full training frame, before the
# train/validation split further down, so validation rows contribute to their
# own encoding and the validation score is likely optimistic.
encoded_columns = ['post', 'eki', 'mad', 'born', 'bai']

# Build every table first, from the not-yet-encoded columns.
price_means = {
    col: df_train_X_y.groupby(col)['price'].mean()
    for col in encoded_columns
}

# Keep the individual tables under their own names; the test-data section
# applies the same tables.
post_means = price_means['post']
eki_means  = price_means['eki']
mad_means  = price_means['mad']
born_means = price_means['born']
bai_means  = price_means['bai']


# Swap each category for its mean price.
for col in encoded_columns:
    df_train_X_y[col] = df_train_X_y[col].map(price_means[col])


# Snapshot 3: after target encoding
df_train_X_y.to_csv('/content/drive/My Drive/2023_Nishika_aki_train/all/all_train_X_y_3.csv')


# Separate the learning data into X (the seven features) and y (the target).
feature_names = ['post', 'eki', 'minute', 'mad', 'area', 'born', 'bai']
df_train_X = df_train_X_y[feature_names]
df_train_y = df_train_X_y['price']

# (Disabled) Evaluate feature importances with a random forest -- everything below in this section is commented out
#import numpy as np
#import matplotlib.pyplot as plt
#from sklearn.ensemble import RandomForestRegressor

#feat_labels = df_train_X.columns[:7]

#forest = RandomForestRegressor(n_estimators=500,
#                                random_state=1)

#forest.fit(df_train_X, df_train_y)
#importances = forest.feature_importances_

#indices = np.argsort(importances)[::-1]

#for f in range(df_train_X.shape[1]):
#    print("%2d) %-*s %f" % (f + 1, 30,
#                            feat_labels[indices[f]],
#                            importances[indices[f]]))

#plt.title('Feature importance')
#plt.bar(range(df_train_X.shape[1]),
#        importances[indices],
#        align='center')

#plt.xticks(range(df_train_X.shape[1]),
#           feat_labels[indices], rotation=90)
#plt.xlim([-1, df_train_X.shape[1]])
#plt.tight_layout()
#plt.show()


# Hold out 10% of the learning data for validation; the fixed random_state
# makes the split repeatable.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(df_train_X, df_train_y, test_size=0.1, random_state=0)
X_train, X_valid, y_train, y_valid = split_parts


# The evaluation metric is the coefficient of determination (R^2);
# r2_score is used after fitting.
from sklearn.metrics import r2_score


# Model: standardisation -> PCA -> ridge regression, chained in one pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline

pipeline_steps = [StandardScaler(), PCA(), Ridge(random_state=0)]
pipe_ridge = make_pipeline(*pipeline_steps)


# Grid search over the pipeline's hyper-parameters
from sklearn.model_selection import GridSearchCV

# Candidate values: PCA component count and ridge regularisation strength.
pca_param_range = [7]
ridge_param_range = [ 100.0, 1000.0, 10000.0 ]

# Keys follow the "<step name>__<parameter>" convention of the pipeline steps.
param_grid = [dict(pca__n_components=pca_param_range,
                   ridge__alpha=ridge_param_range)]

# 10-fold cross-validation scored by R^2, on all cores; refit=True retrains
# the best configuration on the whole of X_train afterwards.
search_settings = dict(scoring='r2', refit=True, cv=10, n_jobs=-1)
gs = GridSearchCV(estimator=pipe_ridge, param_grid=param_grid, **search_settings)

# Run the search (fit returns the same estimator object, so no re-assignment is needed)
gs.fit(X_train, y_train)

# Best mean cross-validated R^2, then the parameters that achieved it
print(gs.best_score_)
print(gs.best_params_)


# Predict on both the training part and the held-out validation part
y_train_pred, y_valid_pred = (gs.predict(part) for part in (X_train, X_valid))


# Score each part with R^2 and print training first, validation second
train_r2 = r2_score(y_true=y_train, y_pred=y_train_pred)
valid_r2 = r2_score(y_true=y_valid, y_pred=y_valid_pred)

for score in (train_r2, valid_r2):
    print(score)

# Load the test data
test_csv = '/content/drive/My Drive/2023_Nishika_aki_test/test.csv'
df_test_org = pd.read_csv(test_csv)

# Select the seven feature columns by position (same positions as for the
# training data; no target column is selected here).
feature_positions = [3, 7, 8, 9, 10, 14, 24]
df_test_X = df_test_org.iloc[:, feature_positions]


# Snapshot 1: the raw selection, before any cleaning
df_test_X.to_csv('/content/drive/My Drive/2023_Nishika_aki_test/all/all_test_X_1.csv')


# Replace the text values that cannot be used as numbers by one representative
# number each (same substitutions as for the training data). Only cells whose
# whole value equals a key are replaced.
test_value_fixes = {
    '1H?1H30': '75',
    '1H30?2H': '105',
    '2H?': '120',
    '30分?60分': '45',
    '2000㎡以上': '2000',
}
df_test_X = df_test_X.replace(test_value_fixes)


# Categories that occur only in the test data have no target-encoding value.
# Map them onto a category that exists in training: unknown names go to
# '空白', and the later quarters go to '2021年第4四半期'.
unseen_to_known = {
    '伊賀屋': '空白',
    '栗東': '空白',
    '志布志': '空白',
    '小泉町(富山)': '空白',
    '和歌山大学前': '空白',
    '2022年第4四半期': '2021年第4四半期',
    '2023年第1四半期': '2021年第4四半期',
}
df_test_X = df_test_X.replace(unseen_to_known)


# Short working names for the seven features.
df_test_X.columns = ['post','eki','minute','mad','area','born','bai']


# Missing values in the categorical columns become the '空白' category.
for col in ['eki', 'mad', 'born']:
    df_test_X[col] = df_test_X[col].fillna('空白')

# Fill missing 'minute' values with the mean of the TRAINING 'minute' column
# rather than a hard-coded constant (previously 11.61216821) or the test set's
# own mean, so training and test rows are imputed from the same source.
# pd.to_numeric is needed because the columns are object dtype (numbers mixed
# with text such as '75'), on which .mean() cannot be computed directly.
train_minute_mean = pd.to_numeric(df_train_X['minute']).mean()
df_test_X['minute'] = pd.to_numeric(df_test_X['minute']).fillna(train_minute_mean)


# Snapshot 2: after cleaning and imputation
df_test_X.to_csv('/content/drive/My Drive/2023_Nishika_aki_test/all/all_test_X_2.csv')


# Apply the target-encoding tables learned on the training data to the test data.
# A category that never occurred in training has no entry in its table and
# maps to NaN, which would make the prediction step fail. Such values fall
# back to the overall mean of the training target, and the count is reported
# so the substitution is visible. When every value is known, the result is
# exactly the plain mapping.
fallback_price = df_train_y.mean()
encoding_tables = {
    'post': post_means,
    'eki': eki_means,
    'mad': mad_means,
    'born': born_means,
    'bai': bai_means,
}
for col, means in encoding_tables.items():
    encoded = df_test_X[col].map(means)
    n_unmapped = int(encoded.isna().sum())
    if n_unmapped:
        print(f"{col}: {n_unmapped} value(s) not seen in training; using the overall training mean price")
        encoded = encoded.fillna(fallback_price)
    df_test_X[col] = encoded


# Snapshot 3: after target encoding
df_test_X.to_csv('/content/drive/My Drive/2023_Nishika_aki_test/all/all_test_X_3.csv')


# Confirm that no rows were dropped (row counts must match)
print(df_test_org.shape)
print(df_test_X.shape)


# Predict the target for the test data with the refitted best model
y_test_pred = gs.predict(df_test_X)

# Wrap the predictions in a single-column frame and save them to Drive
df_y_test_pred = pd.DataFrame(data=y_test_pred)
df_y_test_pred.to_csv('/content/drive/My Drive/2023_Nishika_aki_test/all/y_test_pred_pipe.csv')




  df = pd.read_csv(filename)
  df = pd.read_csv(filename)
  df = pd.read_csv(filename)
  df = pd.read_csv(filename)


(793377, 28)
(793377, 8)
0.7199814508328675
{'pca__n_components': 7, 'ridge__alpha': 1000.0}
0.7201521404169526
0.7215470524720058
(19271, 27)
(19271, 7)
