# ユニークな値の確認

In [2]:
import pandas as pd

# Load the feature-engineered test set from the local draft CSV.
data_path = '/Users/hayakawakazue/Downloads/house_price/test/test_new_feature_engineered_draft.csv'
test_df = pd.read_csv(data_path)

# Collect the names of all columns whose dtype is `category` or `object`.
target_dtypes = ['category', 'object']
encoding_columns = test_df.select_dtypes(include=target_dtypes).columns

# For each selected column, report its distinct values and how many there are.
# A single print joined with newlines emits the same text as three separate
# prints; the trailing "\n" in the last field keeps the blank separator line.
for col in encoding_columns:
    unique_values = test_df[col].unique()
    print(
        f"Column: {col}",
        f"Unique Values: {unique_values}",
        f"Number of Unique Values: {len(unique_values)}\n",
        sep="\n",
    )


Column: MSZoning
Unique Values: ['RH' 'RL' 'RM' 'FV' 'C (all)']
Number of Unique Values: 5

Column: Street
Unique Values: ['Pave' 'Grvl']
Number of Unique Values: 2

Column: LandContour
Unique Values: ['Lvl' 'HLS' 'Bnk' 'Low']
Number of Unique Values: 4

Column: LotConfig
Unique Values: ['Inside' 'Corner' 'FR2' 'CulDSac' 'FR3']
Number of Unique Values: 5

Column: Neighborhood
Unique Values: ['NAmes' 'Gilbert' 'StoneBr' 'BrDale' 'NPkVill' 'NridgHt' 'Blmngtn'
 'NoRidge' 'Somerst' 'SawyerW' 'Sawyer' 'NWAmes' 'OldTown' 'BrkSide'
 'ClearCr' 'SWISU' 'Edwards' 'CollgCr' 'Crawfor' 'Blueste' 'IDOTRR'
 'Mitchel' 'Timber' 'MeadowV' 'Veenker']
Number of Unique Values: 25

Column: Condition1
Unique Values: ['Feedr' 'Norm' 'PosN' 'RRNe' 'Artery' 'RRNn' 'PosA' 'RRAn' 'RRAe']
Number of Unique Values: 9

Column: Condition2
Unique Values: ['Norm' 'Feedr' 'PosA' 'PosN' 'Artery']
Number of Unique Values: 5

Column: BldgType
Unique Values: ['1Fam' 'TwnhsE' 'Twnhs' 'Duplex' '2fmCon']
Number of Unique Values

In [2]:
import pandas as pd

# Load the feature-engineered test set from the local draft CSV.
data_path = '/Users/hayakawakazue/Downloads/house_price/test/test_new_feature_engineered_draft.csv'
data = pd.read_csv(data_path)

# Count missing entries per column: flag NaNs first, then sum each column.
is_missing = data.isna()
missing_values = is_missing.sum()

# Keep only the columns that actually contain missing values (count > 0),
# so an empty Series is printed when the data set is complete.
has_missing = missing_values > 0
missing_values = missing_values[has_missing]
print(missing_values)


Series([], dtype: int64)


# ワンホットエンコーダを適用する

In [17]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import joblib  # エンコーダーの保存に必要

# Load the feature-engineered test set from the local draft CSV.
data_path = '/Users/hayakawakazue/Downloads/house_price/test/test_new_feature_engineered_draft.csv'
test_df = pd.read_csv(data_path)

# Load the saved one-hot encoder (presumably fitted on the training data —
# TODO confirm; this cell only calls `transform`, never `fit`).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code. Only load encoder artifacts from a trusted location.
onehot_encoder_path = '/Users/hayakawakazue/Downloads/house_price/model/onehot_encoder.joblib'
onehot_encoder = joblib.load(onehot_encoder_path)

# Categorical columns to encode; passed to the encoder in this order.
categorical_columns = ['MSZoning', 'Street', 'LandContour', 'LotConfig', 'Neighborhood', 
                       'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 
                       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'Foundation', 'Heating', 
                       'CentralAir', 'GarageType', 'PavedDrive', 'SaleType', 'SaleCondition']

# Apply the one-hot encoding.
encoded_categorical_data = onehot_encoder.transform(test_df[categorical_columns])

# An encoder configured for sparse output returns a SciPy sparse matrix, which
# the DataFrame constructor does not expand into numeric columns. Densify it
# first; dense arrays have no `toarray` attribute and pass through unchanged.
if hasattr(encoded_categorical_data, "toarray"):
    encoded_categorical_data = encoded_categorical_data.toarray()

# Reuse test_df's index so the column-wise concat below aligns row-for-row.
# Without it the encoded frame gets a fresh RangeIndex, and concat (which
# aligns on index labels) would introduce NaN rows if test_df's index ever
# differs from 0..n-1.
encoded_categorical_df = pd.DataFrame(
    encoded_categorical_data,
    columns=onehot_encoder.get_feature_names_out(categorical_columns),
    index=test_df.index,
)

# Replace the original categorical columns with their encoded counterparts.
test_df = test_df.drop(columns=categorical_columns)
test_df = pd.concat([test_df, encoded_categorical_df], axis=1)

# Preview the result.
print(test_df.head())

# Persist the encoded data set.
test_encoded_path = '/Users/hayakawakazue/Downloads/house_price/test/test_encoded.csv'
test_df.to_csv(test_encoded_path, index=False)
print(f"エンコードされたデータを保存しました: {test_encoded_path}")


     Id  MSSubClass  LotFrontage  LotArea  LotShape  Utilities  LandSlope  \
0  1461          20         80.0  11622.0         3          1          2   
1  1462          20         81.0  14267.0         2          1          2   
2  1463          60         74.0  13830.0         2          1          2   
3  1464          60         78.0   9978.0         2          1          2   
4  1465         120         43.0   5005.0         2          1          2   

   OverallQual  OverallCond  YearBuilt  ...  SaleType_ConLw  SaleType_New  \
0            5            6       1961  ...             0.0           0.0   
1            6            6       1958  ...             0.0           0.0   
2            5            5       1997  ...             0.0           0.0   
3            6            6       1998  ...             0.0           0.0   
4            8            5       1992  ...             0.0           0.0   

   SaleType_Oth  SaleType_WD  SaleCondition_Abnorml  SaleCondition_AdjLand