# 1. データ型の確認をする

In [1]:
import pandas as pd
import numpy as np

# Read the CSV at data_path into a DataFrame.
data_path = '/Users/hayakawakazue/Downloads/house_price/test/test18.csv'
test_df = pd.read_csv(filepath_or_buffer=data_path)

# Kept as the cell's final expression so the notebook renders each column's dtype.
test_df.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea          float64
Street            object
LotShape          object
LandContour       object
Utilities         object
LotConfig         object
LandSlope         object
Neighborhood      object
Condition1        object
Condition2        object
BldgType          object
HouseStyle        object
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
RoofStyle         object
RoofMatl          object
Exterior1st       object
Exterior2nd       object
ExterQual          int64
Foundation        object
BsmtQual           int64
BsmtCond           int64
BsmtExposure      object
BsmtFinType1      object
BsmtFinType2      object
BsmtUnfSF        float64
TotalBsmtSF      float64
Heating           object
HeatingQC          int64
CentralAir        object
Electrical        object
1stFlrSF           int64
GrLivArea        float64
FullBath         float64


# 2. 新しい特徴量を作成する

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Plot settings: keep the minus sign renderable and select a Japanese font (macOS).
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.sans-serif'] = ['Hiragino Maru Gothic Pro']

# Load the data.
data_path = '/Users/hayakawakazue/Downloads/house_price/test/test18.csv'
test_df = pd.read_csv(data_path)

# Clean the numeric columns in a single pass: +/-inf becomes NaN, then every
# NaN (including the ones just introduced) becomes 0.
numeric_cols = test_df.select_dtypes(include=[np.number]).columns
test_df[numeric_cols] = (
    test_df[numeric_cols]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# Derived features, appended in the same column order as a sequence of
# column assignments would produce. Entries are callables because several of
# them read columns created by an earlier entry in the same call
# (e.g. AreaPerYearRatio reads HouseAge).
# NOTE(review): denominators are floored at 1e-5 with np.maximum, so a row
# whose denominator is <= 0 and whose numerator is non-zero yields a very
# large value — confirm this is the intended encoding.
test_df = test_df.assign(
    TotalArea=lambda d: d['GrLivArea'] + d['TotalBsmtSF'] + d['GarageArea'],
    YearsSinceRemodel=lambda d: d['YrSold'] - d['YearRemodAdd'],
    HouseAge=lambda d: d['YrSold'] - d['YearBuilt'],
    AreaPerYearRatio=lambda d: d['GrLivArea'] / np.maximum(d['HouseAge'], 1e-5),
    LotAreaSquared=lambda d: d['LotArea'] ** 2,
    AreaRatio=lambda d: d['GrLivArea'] / np.maximum(d['LotArea'], 1e-5),
    InverseYearsSinceRemodel=lambda d: 1 / np.maximum(d['YearsSinceRemodel'], 1e-5),
    # Years between the sale and the later of construction / remodel.
    Age=lambda d: d['YrSold'] - d[['YearBuilt', 'YearRemodAdd']].max(axis=1),
    GarageAreaPerCar=lambda d: d['GarageArea'] / np.maximum(d['GarageCars'], 1e-5),
    GarageScore=lambda d: d['GarageCars'] * d['GarageArea'],
    LotShapeQuality=lambda d: d['LotArea'] * d['LotFrontage'],
    HasBsmt=lambda d: (d['TotalBsmtSF'] > 0).astype(int),
    # A basement (BsmtQual > 0) is counted as two extra rooms.
    TotalRooms=lambda d: d['TotRmsAbvGrd'] + (d['BsmtQual'] > 0).astype(int) * 2,
    QualityScore=lambda d: (
        d['OverallQual']
        + d['ExterQual']
        + d['KitchenQual']
        + d['BsmtQual']
        + d['HeatingQC']
    ),
)

# BsmtQual and BsmtCond are already numeric at this point, so their product
# is used directly as a combined basement quality index.
test_df['BsmtQualityIndex'] = test_df['BsmtQual'] * test_df['BsmtCond']

# pd.read_csv parses the literal text 'NA' (and empty cells) as NaN by
# default, so the 'NA' keys in the mappings below would never match a loaded
# value and missing basements would score NaN instead of 0. Missing entries
# are therefore turned back into the 'NA' label before mapping. Any other
# label that is absent from a mapping still maps to NaN, so unexpected
# categories stay visible.

# Basement exposure score.
exposure_mapping = {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'NA': 0}
test_df['BsmtExposure_Score'] = test_df['BsmtExposure'].fillna('NA').map(exposure_mapping)

# Basement finish type 1 and 2 scores (shared mapping).
fin_type_mapping = {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'NA': 0}
test_df['BsmtFinType1_Score'] = test_df['BsmtFinType1'].fillna('NA').map(fin_type_mapping)
test_df['BsmtFinType2_Score'] = test_df['BsmtFinType2'].fillna('NA').map(fin_type_mapping)

# Overall basement finish level: sum of the two finish-type scores.
test_df['TotalBsmtFin'] = test_df['BsmtFinType1_Score'] + test_df['BsmtFinType2_Score']

# Two combined views of overall quality and overall condition:
# their product and their arithmetic mean.
test_df = test_df.assign(
    QualityCondition=lambda d: d['OverallQual'] * d['OverallCond'],
    AvgQualityCondition=lambda d: (d['OverallQual'] + d['OverallCond']) / 2,
)

# Make sure the output directory exists. exist_ok=True makes this a no-op
# when the directory is already there and avoids the check-then-create race
# of calling os.path.exists() before os.makedirs().
save_dir = '/Users/hayakawakazue/Downloads/house_price/test'
os.makedirs(save_dir, exist_ok=True)

# Write the feature-engineered frame to CSV without the index column.
data_cleaned_path = os.path.join(save_dir, 'test_new_feature_engineered.csv')
test_df.to_csv(data_cleaned_path, index=False)

print(f"新しい特徴量を追加してデータを保存しました: {data_cleaned_path}")


新しい特徴量を追加してデータを保存しました: /Users/hayakawakazue/Downloads/house_price/test/test_new_feature_engineered.csv
