# ユニークな値の確認

In [1]:
import pandas as pd

# Load the feature-engineered training data
data_path = '/Users/hayakawakazue/Downloads/house_price/train/train_new_feature_engineered_draft.csv'
train_df = pd.read_csv(data_path)

# Names of every column whose dtype is category or object
encoding_columns = train_df.select_dtypes(include=['category', 'object']).columns

# For each of those columns, report its distinct values and how many there are.
# The three fields are emitted by a single print call, one per line; the
# trailing "\n" in the last field leaves a blank line between columns.
for col in encoding_columns:
    unique_values = train_df[col].unique()
    print(
        f"Column: {col}",
        f"Unique Values: {unique_values}",
        f"Number of Unique Values: {len(unique_values)}\n",
        sep="\n",
    )


Column: MSZoning
Unique Values: ['RL' 'RM' 'C (all)' 'FV' 'RH']
Number of Unique Values: 5

Column: Street
Unique Values: ['Pave' 'Grvl']
Number of Unique Values: 2

Column: LandContour
Unique Values: ['Lvl' 'Bnk' 'Low' 'HLS']
Number of Unique Values: 4

Column: LotConfig
Unique Values: ['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']
Number of Unique Values: 5

Column: Neighborhood
Unique Values: ['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'
 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'
 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill'
 'Blmngtn' 'BrDale' 'SWISU' 'Blueste']
Number of Unique Values: 25

Column: Condition1
Unique Values: ['Norm' 'Feedr' 'PosN' 'Artery' 'RRAe' 'RRNn' 'RRAn' 'PosA' 'RRNe']
Number of Unique Values: 9

Column: Condition2
Unique Values: ['Norm' 'Artery' 'RRNn' 'Feedr' 'PosN' 'PosA' 'RRAn' 'RRAe']
Number of Unique Values: 8

Column: BldgType
Unique Values: ['1Fam' '2fmCon' 'Duplex' 'TwnhsE' 'Twnhs']
Nu

In [3]:
import pandas as pd

# Load the feature-engineered training data
data_path = '/Users/hayakawakazue/Downloads/house_price/train/train_new_feature_engineered_draft.csv'
data = pd.read_csv(data_path)

# Count the missing entries in every column
missing_values = data.isnull().sum()

# Keep only the columns that have at least one missing entry
# (columns with a count of 0 are dropped from the report)
missing_values = missing_values.loc[missing_values.gt(0)]
print(missing_values)


Series([], dtype: int64)


# ワンホットエンコーダを適用

In [2]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import joblib
import numpy as np

# Load the feature-engineered training data
data_path = '/Users/hayakawakazue/Downloads/house_price/train/train_new_feature_engineered_draft.csv'
train_df = pd.read_csv(data_path)

# Initialise the one-hot encoder.
# sparse_output=False returns a dense array that can be wrapped in a DataFrame
# directly; handle_unknown='ignore' lets the saved encoder transform data that
# contains categories it did not see during fit instead of raising.
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Categorical columns to encode
categorical_columns = [
    'MSZoning', 'Street', 'LandContour', 'LotConfig', 'Neighborhood',
    'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
    'RoofMatl', 'Exterior1st', 'Exterior2nd', 'Foundation', 'Heating',
    'CentralAir', 'GarageType', 'PavedDrive', 'SaleType', 'SaleCondition',
]

# Fit the encoder on the training data and encode the categorical columns
encoded_categorical_data = onehot_encoder.fit_transform(train_df[categorical_columns])
encoded_categorical_df = pd.DataFrame(
    encoded_categorical_data,
    columns=onehot_encoder.get_feature_names_out(categorical_columns),
    # Reuse the source index: pd.concat(axis=1) aligns rows on index labels,
    # so without this the join is only correct while train_df still carries
    # a default RangeIndex (any prior filtering would introduce NaN rows).
    index=train_df.index,
)

# Replace the original categorical columns with their encoded counterparts
train_df = train_df.drop(columns=categorical_columns)
train_df = pd.concat([train_df, encoded_categorical_df], axis=1)

# Persist the fitted encoder so the same encoding can be applied to other data
joblib.dump(onehot_encoder, '/Users/hayakawakazue/Downloads/house_price/model/onehot_encoder.joblib')

# Save the encoded data
output_path = '/Users/hayakawakazue/Downloads/house_price/train/train_encoded.csv'
train_df.to_csv(output_path, index=False)

print(f"カテゴリ変数をエンコードしたデータを保存しました: {output_path}")


カテゴリ変数をエンコードしたデータを保存しました: /Users/hayakawakazue/Downloads/house_price/train/train_encoded.csv
