In [1]:
import pandas as pd

# 列方向を省略しない
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

In [2]:
def convert_yyyymm_to_datetime(df_train, df_test, col_name):
    df_train_yyyymm = pd.to_datetime(df_train[col_name], format='%Y%m')
    df_test_yyyymm = pd.to_datetime(df_test[col_name], format='%Y%m')
    return df_train_yyyymm, df_test_yyyymm

def calc_month_from_now(df_train, df_test, df_train_yyyymm, df_test_yyyymm, col_name):
    df_train[col_name + '_month'] = -(df_train_yyyymm - pd.Timestamp('2025-01-01')).dt.days // 30
    df_test[col_name + '_month'] = -(df_test_yyyymm - pd.Timestamp('2025-01-01')).dt.days // 30
    return df_train, df_test

def drop_column(df_train, df_test, col_name):
    df_train = df_train.drop(columns=[col_name],axis='columns')
    df_test = df_test.drop(columns=[col_name],axis='columns')
    return df_train, df_test

def generate_month_from_now_features(df_train, df_test,col_name):
    df_train_yyyymm, df_test_yyyymm = convert_yyyymm_to_datetime(df_train, df_test, col_name)
    df_train, df_test = calc_month_from_now(df_train, df_test, df_train_yyyymm, df_test_yyyymm, col_name)
    df_train, df_test = drop_column(df_train, df_test, col_name)
    return df_train, df_test

In [41]:
# categoryデータ列をone-hot-encoding
def one_hot_encoding(df_train, df_test, col_name,feature_columns):
    df = pd.concat([df_train, df_test])
    df = pd.get_dummies(df, columns=[col_name])
    df_train = df.iloc[:len(df_train)]
    df_test = df.iloc[len(df_train):].reset_index(drop=True)
    feature_columns = list(df_train.columns)
    return df_train, df_test, feature_columns

def one_hot_encoding_tag_id(df_train, df_test, col_name,feature_columns):
    df = pd.concat([df_train, df_test])
    tag_id_split = df[col_name].str.get_dummies(sep='/')
    tag_id_split = tag_id_split.add_prefix(col_name + '_')
    df = pd.concat([df, tag_id_split], axis=1)
    df = df.drop(columns=[col_name],axis='columns')
    df_train = df.iloc[:len(df_train)]
    df_test = df.iloc[len(df_train):].reset_index(drop=True)
    feature_columns = list(df.columns)
    return df_train, df_test, feature_columns

In [71]:
target_col = 'money_room'
index_col = 'index'
feature_columns = [
    #'building_id',
    'building_status',
    'building_type',
    'unit_count',
    'building_structure',
    'total_floor_area',
    'building_area',
    'floor_count',
    'basement_floor_count',
    'year_built',
    'building_land_area',
    'land_area_all',
    'unit_area_min',
    'unit_area_max',
    'building_land_chimoku',
    'land_youto',
    'land_toshi',
    'land_chisei',
    'land_area_kind',
    'land_setback_flg',
    'land_setback',
    'land_kenpei',
    'land_youseki',
    'land_road_cond',
    #'land_seigen',
    'building_area_kind',
    'management_form',
    'management_association_flg',
    #'reform_exterior',
    #'reform_exterior_other',
    'reform_exterior_date',
    #'reform_common_area',
    'reform_common_area_date',
    'building_tag_id',
    'room_floor',
    'balcony_area',
    'dwelling_unit_window_angle',
    'room_count',
    'unit_area',
    'floor_plan_code',
    #'reform_date',
    #'reform_place',
    #'reform_place_other',
    #'reform_wet_area',
    #'reform_wet_area_other',
    'reform_wet_area_date',
    #'reform_interior',
    #'reform_interior_other',
    'reform_interior_date',
    #'reform_etc',
    #'renovation_date',
    #'renovation_etc',
    'flg_open',
    'flg_own',
    'bukken_type',
    'flg_investment',
    'empty_number',
    #'empty_contents',
    #'addr1_1',
    #'addr1_2',
    #'eki_name1',
    #'bus_stop1',
    #'bus_time1',
    #'walk_distance1',
    #'rosen_name2',
    #'eki_name2',
    #'bus_stop2',
    #'bus_time2',
    #'walk_distance2',
    #'traffic_other',
    #'traffic_car',
    'snapshot_land_area',
    'snapshot_land_shidou',
    'land_shidou_a',
    'land_shidou_b',
    'land_mochibun_a',
    'land_mochibun_b',
    'house_area',
    'flg_new',
    'house_kanrinin',
    'room_kaisuu',
    'snapshot_window_angle',
    'madori_number_all',
    'madori_kind_all',
    'money_kyoueki',
    'money_kyoueki_tax',
    'money_rimawari_now',
    'money_shuuzen',
    'money_shuuzenkikin',
    #'money_sonota_str1',
    'money_sonota1',
    #'money_sonota_str2',
    'money_sonota2',
    #'money_sonota_str3',
    'money_sonota3',
    'parking_money',
    'parking_money_tax',
    'parking_kubun',
    'parking_distance',
    'parking_number',
    #'parking_memo',
    'genkyo_code',
    'usable_status',
    'usable_date',
    #'school_ele_name',
    'school_ele_distance',
    #'school_ele_code',
    #'school_jun_name',
    'school_jun_distance',
    #'school_jun_code',
    'convenience_distance',
    'super_distance',
    'hospital_distance',
    'park_distance',
    'drugstore_distance',
    'bank_distance',
    'shopping_street_distance',
    #'est_other_name',
    'est_other_distance',
    'statuses',
    'parking_keiyaku',
    #'money_hoshou_company',
    'free_rent_duration',
    'free_rent_gen_timing']

category_columns = ['building_status',
                    'building_type',
                    'building_structure',
                    'building_land_chimoku',
                    'land_youto',
                    'land_toshi',
                    'land_chisei',
                    'land_area_kind',
                    'land_setback_flg',
                    'land_road_cond',
                    'building_area_kind',
                    'management_form',
                    'management_association_flg',
                    'flg_open',
                    'flg_own',
                    'bukken_type',
                    'flg_investment',
                    'flg_new',
                    'house_kanrinin',
                    'snapshot_window_angle',
                    'madori_kind_all',
                    'money_kyoueki_tax',
                    'parking_money_tax',
                    'parking_kubun',
                    'genkyo_code',
                    'usable_status',
                    'parking_keiyaku']

tag_id_columns = ['building_tag_id','statuses']

yyyymm_columns = ['year_built',
                  'reform_exterior_date',
                  'reform_common_area_date',
                  'reform_wet_area_date',
                  'reform_interior_date',
                  'free_rent_gen_timing']

In [72]:
# fearure_columnsのみ読み込む
train = pd.read_csv('../data/train.csv', usecols=feature_columns + [target_col])
test = pd.read_csv('../data/test.csv', usecols=feature_columns + [index_col])

In [73]:
for col_name in yyyymm_columns:
    train, test = generate_month_from_now_features(train, test, col_name)
print(train.shape)
print(test.shape)

(584507, 86)
(384540, 86)


In [74]:
for col_name in category_columns:
    train, test, feature_columns = one_hot_encoding(train, test, col_name,feature_columns)
print(train.shape)
print(test.shape)

(584507, 199)
(384540, 199)


In [75]:
for col_name in tag_id_columns:
    train, test, feature_columns = one_hot_encoding_tag_id(train, test, col_name,feature_columns)
print(train.shape)
print(test.shape)

(584507, 430)
(384540, 430)


In [76]:
train.columns

Index(['money_room', 'unit_count', 'total_floor_area', 'building_area',
       'floor_count', 'basement_floor_count', 'building_land_area',
       'land_area_all', 'unit_area_min', 'unit_area_max',
       ...
       'statuses_335301', 'statuses_335401', 'statuses_335501',
       'statuses_336401', 'statuses_336501', 'statuses_340101',
       'statuses_340102', 'statuses_340201', 'statuses_340401',
       'statuses_350201'],
      dtype='object', length=430)

In [77]:
# lgihtGBMでroom_moneyを予測する
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 予測対象の設定
# yyyymm_columnsを削除して、_monthを追加
feature_columns = feature_columns + [col + '_month' for col in yyyymm_columns]
feature_columns = list(set(feature_columns) - set(yyyymm_columns))
train = train.drop(columns=index_col,axis='columns')
test = test.drop(columns=target_col,axis='columns')
feature_columns = list(set(feature_columns) - set([target_col]))
feature_columns = list(set(feature_columns) - set([index_col]))

# 特徴量とターゲットの設定
X = train[feature_columns]
y = train[target_col]

# データの分割
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# LightGBMのデータセット作成
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

# ハイパーパラメータの設定
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,
    'num_leaves': 31,
    'verbose': -1
}

# モデルの学習
model = lgb.train(params, train_data, valid_sets=[val_data], num_boost_round=1000)

# バリデーションデータでの予測
y_pred = model.predict(X_val)

# RMSEの計算
rmse = mean_squared_error(y_val, y_pred, squared=False)
print(f'Validation RMSE: {rmse}')

Validation RMSE: 18327.43480934778




In [78]:
test[target_col] = model.predict(test[feature_columns])

In [79]:
submission = test[['index', 'money_room']]
# indexを6桁でpadding
submission['index'] = submission['index'].astype(str).str.zfill(6)
submission.to_csv('../data/submission/submission.csv', index=False,header=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['index'] = submission['index'].astype(str).str.zfill(6)


In [80]:
!signate submit --competition-id=1512 '../data/submission/submission.csv'

[32mYou have successfully submitted your predictions.We will send you the submission result to your email address.[0m
