# 重要な特徴量のリスト作成と数値型変数のスケーリング

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib

# Load the encoded training data.
data_path = '/Users/hayakawakazue/Downloads/house_price/train/train_encoded.csv'
data_cleaned = pd.read_csv(data_path)

# Replace infinite values with NaN so they are imputed together with the
# other missing values below.
data_cleaned = data_cleaned.replace([np.inf, -np.inf], np.nan)

# Fill NaN values with the median of each affected column.
# The filled column is assigned back explicitly instead of calling
# fillna(..., inplace=True) on data_cleaned[column]: that chained in-place
# call operates on an intermediate object, triggers a FutureWarning in
# pandas 2.x, and does not modify data_cleaned once Copy-on-Write is active.
for column in data_cleaned.columns:
    if data_cleaned[column].isnull().any():
        median_value = data_cleaned[column].median()
        data_cleaned[column] = data_cleaned[column].fillna(median_value)

# Names of the features kept for modelling.
# The order matters: it fixes the column order of the scaled matrix and of
# the saved CSV.
important_features = [
    'TotalArea',
    'QualityScore',
    'OverallQual',
    'GrLivArea',
    'GarageScore',
    'ExterQual',
    'KitchenQual',
    'GarageCars',
    'BsmtQual',
    'GarageArea',
    'TotalBsmtSF',
    'BsmtQualityIndex',
    '1stFlrSF',
    'AvgQualityCondition',
    'QualityCondition',
    'FullBath',
    'YearBuilt',
    'YearRemodAdd',
    'TotalRooms',
    'TotRmsAbvGrd',
]

# Separate the target column from the selected feature columns.
y = data_cleaned['SalePrice']
X = data_cleaned[important_features]

# Fit a StandardScaler on the selected features, then transform those same
# features with the fitted scaler.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Persist the fitted scaler to disk with joblib.
scaler_path = '/Users/hayakawakazue/Downloads/house_price/model/train_scaler.joblib'
joblib.dump(scaler, scaler_path)

# Wrap the scaled array in a DataFrame labelled with the feature names and
# append the target as a 'SalePrice' column.
X_scaled_df = pd.DataFrame(X_scaled, columns=important_features).assign(SalePrice=y)

# Write the preprocessed data to CSV without the index column.
data_preprocessed_path = '/Users/hayakawakazue/Downloads/house_price/train/train_preprocessed.csv'
X_scaled_df.to_csv(data_preprocessed_path, index=False)

print(f"データを保存しました: {data_preprocessed_path}")
print(X_scaled_df.head())

# Report the number of rows and columns of the preprocessed frame.
row_count, column_count = X_scaled_df.shape
print(f"データの行数: {row_count}")
print(f"データの列数: {column_count}")

# Summary statistics and column names of the full cleaned dataset.
print(data_cleaned.describe())
print(data_cleaned.columns)


データを保存しました: /Users/hayakawakazue/Downloads/house_price/train/train_preprocessed.csv
   TotalArea  QualityScore  OverallQual  GrLivArea  GarageScore  ExterQual  \
0   0.102384      0.938196     0.651479   0.428636     0.187084   1.052302   
1  -0.046599      0.080842    -0.071836  -0.502349    -0.071047  -0.689604   
2   0.331588      0.938196     0.651479   0.586571     0.363083   1.052302   
3   0.103530      0.080842     0.651479   0.443182     1.404407  -0.689604   
4   1.322894      1.223981     1.374795   1.442744     2.258000   1.052302   

   KitchenQual  GarageCars  BsmtQual  GarageArea  ...  BsmtQualityIndex  \
0     0.735994    0.315946  0.641645    0.373509  ...          0.502577   
1    -0.771091    0.315946  0.641645   -0.051541  ...          0.502577   
2     0.735994    0.315946  0.641645    0.663315  ...          0.502577   
3     0.735994    1.662750 -0.833633    0.827539  ...          0.502577   
4     0.735994    1.662750  0.641645    1.764579  ...          0.502577 