# 1. データ型の確認をする

In [2]:
import pandas as pd
import numpy as np

# Load the test dataset from a local CSV file into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; the cell will fail
# on any machine where this file does not exist.
data_path = '/Users/hayakawakazue/Downloads/house_price/test/test18.csv'
test_df = pd.read_csv(data_path)

# Last expression of the cell: displays the dtype of each column of test_df.
test_df.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea          float64
Street            object
LotShape           int64
LandContour       object
Utilities          int64
LotConfig         object
LandSlope          int64
Neighborhood      object
Condition1        object
Condition2        object
BldgType          object
HouseStyle        object
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
RoofStyle         object
RoofMatl          object
Exterior1st       object
Exterior2nd       object
ExterQual          int64
Foundation        object
BsmtQual           int64
BsmtCond           int64
BsmtExposure       int64
BsmtFinType1       int64
BsmtFinType2       int64
BsmtUnfSF        float64
TotalBsmtSF      float64
Heating           object
HeatingQC          int64
CentralAir        object
Electrical         int64
1stFlrSF           int64
GrLivArea        float64
FullBath         float64


# 2. 新しい特徴量を作成する

In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Font configuration so matplotlib can render Japanese text (macOS font name).
plt.rcParams['font.sans-serif'] = ['Hiragino Maru Gothic Pro']
# Draw minus signs with the ASCII hyphen; the selected font may lack the
# Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

# Load the data.
data_path = '/Users/hayakawakazue/Downloads/house_price/test/test18.csv'
test_df = pd.read_csv(data_path)

# Fail fast, before mutating anything, if a column used below is absent.
# A missing column would raise KeyError part-way through the cell anyway;
# checking up front reports every missing name at once.
feature_input_cols = [
    'GrLivArea', 'OverallQual', 'TotalBsmtSF', 'YearBuilt', 'YearRemodAdd',
    'GarageArea', 'ExterQual', 'KitchenQual', 'BsmtQual', 'GarageQual',
    'YrSold', 'OverallCond', 'TotRmsAbvGrd', 'Functional', 'LotArea',
    '1stFlrSF',
]
missing_input_cols = [col for col in feature_input_cols if col not in test_df.columns]
if missing_input_cols:
    raise KeyError(
        f"Columns required for feature engineering are missing from {data_path}: "
        f"{missing_input_cols}"
    )

# Numeric clean-up: turn +/-inf into NaN, then fill every NaN with the
# per-column median (computed after the inf replacement, over this file only).
numeric_cols = test_df.select_dtypes(include=[np.number]).columns
numeric_clean = test_df[numeric_cols].replace([np.inf, -np.inf], np.nan)
test_df[numeric_cols] = numeric_clean.fillna(numeric_clean.median())

# --- Interaction / aggregate features -------------------------------------
# Columns are created in a fixed order because it determines the column
# order of the saved CSV.
test_df['GrLivArea_OverallQual'] = test_df['GrLivArea'] * test_df['OverallQual']
# test_df['GarageArea_OverallQual'] = test_df['GarageArea'] * test_df['OverallQual']
test_df['TotalBsmtSF_OverallQual'] = test_df['TotalBsmtSF'] * test_df['OverallQual']
# test_df['GarageArea_TotalBsmtSF'] = test_df['GarageArea'] + test_df['TotalBsmtSF']
# YearBuilt minus YearRemodAdd (note the direction of the subtraction).
test_df['YearBuilt_YearRemodAdd'] = test_df['YearBuilt'] - test_df['YearRemodAdd']
test_df['TotalArea'] = test_df['GrLivArea'] + test_df['TotalBsmtSF'] + test_df['GarageArea']
# Product of five quality columns.
# NOTE(review): assumes all five are numerically encoded — confirm for
# KitchenQual and GarageQual, which are not in the dtype listing shown earlier.
test_df['QualityScore'] = (
    test_df['OverallQual']
    * test_df['ExterQual']
    * test_df['KitchenQual']
    * test_df['BsmtQual']
    * test_df['GarageQual']
)
test_df['Age'] = test_df['YrSold'] - test_df['YearBuilt']
test_df['RemodelAge'] = test_df['YrSold'] - test_df['YearRemodAdd']
test_df['OverallConditionArea'] = test_df['OverallCond'] * test_df['TotalArea']

# --- Additional features (aimed at low-price predictions) ------------------
test_df['TotalRmsAbvGrd_OverallCond'] = test_df['TotRmsAbvGrd'] * test_df['OverallCond']
# test_df['Neighborhood_LotArea'] = test_df['Neighborhood'] * test_df['LotArea']
# NOTE(review): assumes 'Functional' is numerically encoded. If it were an
# object (string) column, multiplying by an integer column would repeat the
# strings instead of raising — confirm the dtype.
test_df['Functional_OverallQual'] = test_df['Functional'] * test_df['OverallQual']

# --- Additional features (aimed at high-price predictions) -----------------
# NOTE(review): log1p / sqrt yield NaN for inputs below -1 / below 0
# respectively; presumably these columns are non-negative — confirm.
test_df['log_LotArea'] = np.log1p(test_df['LotArea'])
test_df['sqrt_GrLivArea'] = np.sqrt(test_df['GrLivArea'])
# Same values as 'GrLivArea_OverallQual' above; kept so the set of columns in
# the saved file is unchanged.
test_df['OverallQual_GrLivArea'] = test_df['OverallQual'] * test_df['GrLivArea']

# --- More examples: log transform, square root, interaction terms ----------
test_df['log_1stFlrSF'] = np.log1p(test_df['1stFlrSF'])
test_df['sqrt_TotalBsmtSF'] = np.sqrt(test_df['TotalBsmtSF'])
test_df['OverallQual_TotalArea'] = test_df['OverallQual'] * test_df['TotalArea']
test_df['BsmtQual_ExterQual'] = test_df['BsmtQual'] * test_df['ExterQual']

# Ensure the output directory exists. exist_ok=True makes this a no-op when
# the directory is already there and avoids the check-then-create race.
save_dir = '/Users/hayakawakazue/Downloads/house_price/test'
os.makedirs(save_dir, exist_ok=True)

# Save the engineered dataset (without the index column).
data_cleaned_path = os.path.join(save_dir, 'test_new_feature_engineered_draft.csv')
test_df.to_csv(data_cleaned_path, index=False)

print(f"新しい特徴量を追加してデータを保存しました: {data_cleaned_path}")


新しい特徴量を追加してデータを保存しました: /Users/hayakawakazue/Downloads/house_price/test/test_new_feature_engineered_draft.csv
