# 1.列を確認する

In [1]:
import pandas as pd
import numpy as np
import os

# Load the raw training data.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
data_path = '/Users/hayakawakazue/Downloads/house_price/train/train.csv'
train_df = pd.read_csv(data_path, low_memory=False)

# Preview the first five rows.
print(train_df.head())

# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
train_df.info()



   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [2]:
# Lift the row limit on printed output so the per-column summary is not elided.
pd.set_option('display.max_rows', None)

# For every column, count the cells that compare equal to 0.
zero_counts = train_df.eq(0).sum()
print(zero_counts)


Id                  0
MSSubClass          0
MSZoning            0
LotFrontage         0
LotArea             0
Street              0
Alley               0
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          0
MasVnrArea        861
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual            0
BsmtCond            0
BsmtExposure        0
BsmtFinType1        0
BsmtFinSF1        467
BsmtFinType2        0
BsmtFinSF2       1293
BsmtUnfSF         118
TotalBsmtSF        37
Heating             0
HeatingQC           0
CentralAir          0
Electrical          0
1stFlrSF            0
2ndFlrSF          829
LowQualFin

# 2.不要な列を削除する

In [3]:
import pandas as pd

# Read the raw training data.
data_path = '/Users/hayakawakazue/Downloads/house_price/train/train.csv'
train_data = pd.read_csv(data_path)

# Columns to remove from the dataset.
drop_columns = [
    'Id', 'MasVnrArea', 'MiscVal', 'PoolArea',
    '3SsnPorch', 'ScreenPorch', 'HalfBath', 'Fireplaces',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
    'BsmtFinSF1', 'BsmtFinSF2', '2ndFlrSF', 'LowQualFinSF',
    'BsmtFullBath', 'BsmtHalfBath',
    'Alley', 'MasVnrType', 'ExterCond', 'FireplaceQu',
    'PoolQC', 'Fence', 'MiscFeature',
]

# Build a new frame without those columns; train_data itself is left untouched.
train_data_reduced = train_data.drop(drop_columns, axis=1)

# Write the reduced frame to disk (no index column) for the next step.
output_path = '/Users/hayakawakazue/Downloads/house_price/train/train1.csv'
train_data_reduced.to_csv(output_path, index=False)

print(f"Processed data saved to {output_path}")


Processed data saved to /Users/hayakawakazue/Downloads/house_price/train/train1.csv


# 3.データ型を確認する

In [4]:
import pandas as pd

# Reload the column-reduced dataset written by the previous step.
data_path = '/Users/hayakawakazue/Downloads/house_price/train/train1.csv'
train_df = pd.read_csv(data_path)

# Preview the first five rows.
print("データの先頭5行:")
print(train_df.head())

# Report the dtype pandas inferred for each column.
print("\nデータ型:")
print(train_df.dtypes)

# Lift the row limit before printing the per-column zero counts.
pd.set_option('display.max_rows', None)

# For every column, count the cells that compare equal to 0.
zero_counts = train_df.eq(0).sum()
print(zero_counts)

データの先頭5行:
   MSSubClass MSZoning  LotFrontage  LotArea Street LotShape LandContour  \
0          60       RL         65.0     8450   Pave      Reg         Lvl   
1          20       RL         80.0     9600   Pave      Reg         Lvl   
2          60       RL         68.0    11250   Pave      IR1         Lvl   
3          70       RL         60.0     9550   Pave      IR1         Lvl   
4          60       RL         84.0    14260   Pave      IR1         Lvl   

  Utilities LotConfig LandSlope  ... GarageCars GarageArea GarageQual  \
0    AllPub    Inside       Gtl  ...          2        548         TA   
1    AllPub       FR2       Gtl  ...          2        460         TA   
2    AllPub    Inside       Gtl  ...          2        608         TA   
3    AllPub    Corner       Gtl  ...          3        642         TA   
4    AllPub       FR2       Gtl  ...          3        836         TA   

  GarageCond PavedDrive  MoSold  YrSold  SaleType  SaleCondition SalePrice  
0         TA     

# 4.欠損値の数と割合を確認する

In [7]:
import pandas as pd

# Reload the column-reduced dataset.
data_path = '/Users/hayakawakazue/Downloads/house_price/train/train1.csv'
train_data = pd.read_csv(data_path)

# Per-column number of missing cells and its share of all rows, in percent.
missing_values = train_data.isna().sum()
missing_values_percent = missing_values.div(len(train_data)).mul(100)

# Combine both measures, keep only columns that have at least one missing
# cell, and list the worst offenders first.
missing_data = (
    pd.DataFrame({'Missing Values': missing_values, 'Percentage': missing_values_percent})
    .loc[lambda frame: frame['Missing Values'] > 0]
    .sort_values(by='Missing Values', ascending=False)
)
print(missing_data)

# Write the frame out for the next step. train_data is not modified anywhere
# in this cell, so the saved frame is the one loaded above.
output_path = '/Users/hayakawakazue/Downloads/house_price/train/train2.csv'
train_data.to_csv(output_path, index=False)

print(f"Processed data saved to {output_path}")

# Final check of the column dtypes.
print(train_data.dtypes)

              Missing Values  Percentage
LotFrontage              259   17.739726
GarageFinish              81    5.547945
GarageQual                81    5.547945
GarageType                81    5.547945
GarageCond                81    5.547945
GarageYrBlt               81    5.547945
BsmtExposure              38    2.602740
BsmtFinType2              38    2.602740
BsmtCond                  37    2.534247
BsmtQual                  37    2.534247
BsmtFinType1              37    2.534247
Electrical                 1    0.068493
Processed data saved to /Users/hayakawakazue/Downloads/house_price/train/train2.csv
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
LotShape          object
LandContour       object
Utilities         object
LotConfig         object
LandSlope         object
Neighborhood      object
Condition1        object
Condition2        object
BldgType          object
HouseStyle        object
OverallQ

# 5.欠損値を中央値と最頻値で補完する

In [8]:
import pandas as pd

# Load the dataset produced by the missing-value inspection step.
data_path = '/Users/hayakawakazue/Downloads/house_price/train/train2.csv'
train_df = pd.read_csv(data_path)

# Numeric columns that contain missing values.
numerical_cols = ['LotFrontage', 'GarageYrBlt']

# Categorical columns that contain missing values.
# NOTE(review): for the Garage*/Bsmt* columns a missing value may mean
# "no garage / no basement" rather than "unknown"; if so, mode imputation
# relabels those houses — confirm against the dataset's data description.
categorical_cols = ['GarageQual', 'GarageCond', 'GarageFinish', 'GarageType',
                    'BsmtFinType2', 'BsmtExposure', 'BsmtCond', 'BsmtQual',
                    'BsmtFinType1', 'Electrical']

# Fill numeric gaps with the column median.
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on train_df[col]: that chained in-place form is
# deprecated in pandas 2.x and does not update train_df under Copy-on-Write.
for col in numerical_cols:
    median_value = train_df[col].median()
    train_df[col] = train_df[col].fillna(median_value)

# Fill categorical gaps with the most frequent value
# (mode() may return several values; the first one is used).
for col in categorical_cols:
    mode_value = train_df[col].mode()[0]
    train_df[col] = train_df[col].fillna(mode_value)

# Confirm the imputed columns no longer contain missing values.
print(train_df[numerical_cols + categorical_cols].isnull().sum())

# Count the missing values remaining in every column of the frame.
missing_values = train_df.isnull().sum()
print( "欠損値の数:",missing_values)

# Save the imputed frame for the next step.
output_path = '/Users/hayakawakazue/Downloads/house_price/train/train3.csv'
train_df.to_csv(output_path, index=False)

print(f"Processed data saved to {output_path}")


LotFrontage     0
GarageYrBlt     0
GarageQual      0
GarageCond      0
GarageFinish    0
GarageType      0
BsmtFinType2    0
BsmtExposure    0
BsmtCond        0
BsmtQual        0
BsmtFinType1    0
Electrical      0
dtype: int64
欠損値の数: MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
ExterQual        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinType2     0
BsmtUnfSF        0
TotalBsmtSF      0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
1stFlrSF         0
GrLivArea        0
FullBath         0
BedroomAbvGr     0
Kitc

# 6.ユニークな値の確認

In [6]:
import pandas as pd
import numpy as np

# Load the imputed dataset.
data_path = '/Users/hayakawakazue/Downloads/house_price/train/train3.csv'
train_df = pd.read_csv(data_path)

# Disable numpy's summarisation threshold so printed arrays are never abbreviated.
np.set_printoptions(threshold=np.inf)

# Distinct values found in the 'Street' column, followed by how many there are.
unique_values = train_df['Street'].unique()
print(unique_values)
print("ユニークな値の総数:", len(unique_values))

# Column dtypes of the loaded frame.
print(train_df.dtypes)

# Overall (rows, columns) shape of the frame.
print("Full train dataset shape is {}".format(train_df.shape))

['Pave' 'Grvl']
ユニークな値の総数: 2
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
LotShape          object
LandContour       object
Utilities         object
LotConfig         object
LandSlope         object
Neighborhood      object
Condition1        object
Condition2        object
BldgType          object
HouseStyle        object
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
RoofStyle         object
RoofMatl          object
Exterior1st       object
Exterior2nd       object
ExterQual         object
Foundation        object
BsmtQual          object
BsmtCond          object
BsmtExposure      object
BsmtFinType1      object
BsmtFinType2      object
BsmtUnfSF          int64
TotalBsmtSF        int64
Heating           object
HeatingQC         object
CentralAir        object
Electrical        object
1stFlrSF           int64
GrLivArea          int64
FullBath           in

# 7.カテゴリ変数のうち数値に変換できるものは変換する

In [10]:
import pandas as pd
import numpy as np

# Load the imputed dataset.
data_path = '/Users/hayakawakazue/Downloads/house_price/train/train3.csv'
train_df = pd.read_csv(data_path)

# Quality label -> numeric score.
# With pd.read_csv's default settings the literal string 'NA' is parsed as a
# missing value (NaN), so the 'NA' key can only match if the file is read
# with keep_default_na=False; it is kept here for that case.
qual_mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}

# Categorical quality columns to convert to numeric scores.
train_df_columns = ['ExterQual', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'GarageQual', 'GarageCond']

# Series.map turns every value that is not a key of the mapping (including
# NaN) into NaN without warning. Map all columns first and verify nothing
# was lost, so a bad label fails loudly instead of being saved as a new
# missing value.
mapped_columns = {col: train_df[col].map(qual_mapping) for col in train_df_columns}
unmapped_values = {
    col: sorted(train_df.loc[mapped.isna(), col].astype(str).unique())
    for col, mapped in mapped_columns.items()
    if mapped.isna().any()
}
if unmapped_values:
    raise ValueError(f"Values without an entry in qual_mapping: {unmapped_values}")

# Every value was mapped; replace the original columns with their scores.
for col in train_df_columns:
    train_df[col] = mapped_columns[col]

# Save the encoded frame for the next step.
output_path = '/Users/hayakawakazue/Downloads/house_price/train/train4.csv'
train_df.to_csv(output_path, index=False)

print(train_df.dtypes)

print(f"Processed data saved to {output_path}")


MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
LotShape          object
LandContour       object
Utilities         object
LotConfig         object
LandSlope         object
Neighborhood      object
Condition1        object
Condition2        object
BldgType          object
HouseStyle        object
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
RoofStyle         object
RoofMatl          object
Exterior1st       object
Exterior2nd       object
ExterQual          int64
Foundation        object
BsmtQual           int64
BsmtCond           int64
BsmtExposure      object
BsmtFinType1      object
BsmtFinType2      object
BsmtUnfSF          int64
TotalBsmtSF        int64
Heating           object
HeatingQC          int64
CentralAir        object
Electrical        object
1stFlrSF           int64
GrLivArea          int64
FullBath           int64
BedroomAbvGr       int64
