# Baseline

## 데이터 불러오기

In [2]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Read the training set from train.csv (path is relative to the working directory).
train = pd.read_csv('train.csv')

In [4]:
# Preview the first five rows of the training data.
train.head()

Unnamed: 0,ID,매물확인방식,보증금,월세,전용면적,해당층,총층,방향,방수,욕실수,주차가능여부,총주차대수,관리비,중개사무소,제공플랫폼,게재일,허위매물여부
0,TRAIN_0000,현장확인,402500000.0,470000,,,15.0,서향,1.0,1.0,가능,40.0,96,t93Nt6I2I0,B플랫폼,2024-10-09,0
1,TRAIN_0001,현장확인,170500000.0,200000,,3.0,4.0,남동향,2.0,1.0,불가능,,0,q39iV5J4E6,D플랫폼,2024-12-26,0
2,TRAIN_0002,전화확인,114000000.0,380000,,2.0,3.0,동향,1.0,1.0,불가능,,0,b03oE4G3F6,A플랫폼,2024-11-28,0
3,TRAIN_0003,현장확인,163500000.0,30000,36.3,3.0,9.0,남동향,2.0,1.0,가능,13.0,10,G52Iz8V2B9,A플랫폼,2024-11-26,0
4,TRAIN_0004,현장확인,346000000.0,530000,,3.0,3.0,동향,2.0,1.0,불가능,,0,N45gM0M7R0,B플랫폼,2024-06-25,1


In [5]:
# Summarise each column's non-null count and dtype to spot missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2452 entries, 0 to 2451
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      2452 non-null   object 
 1   매물확인방식  2452 non-null   object 
 2   보증금     2452 non-null   float64
 3   월세      2452 non-null   int64  
 4   전용면적    1665 non-null   float64
 5   해당층     2223 non-null   float64
 6   총층      2436 non-null   float64
 7   방향      2452 non-null   object 
 8   방수      2436 non-null   float64
 9   욕실수     2434 non-null   float64
 10  주차가능여부  2452 non-null   object 
 11  총주차대수   1756 non-null   float64
 12  관리비     2452 non-null   int64  
 13  중개사무소   2452 non-null   object 
 14  제공플랫폼   2452 non-null   object 
 15  게재일     2452 non-null   object 
 16  허위매물여부  2452 non-null   int64  
dtypes: float64(7), int64(3), object(7)
memory usage: 325.8+ KB


## 데이터 전처리

In [6]:
# Target vector: the label column.
y = train['허위매물여부']
# Feature frame: every column except the row identifier and the label.
x = train.drop(columns=['ID', '허위매물여부'])

In [7]:
# Numeric columns whose missing values are replaced by the column mean.
columns_fill_mean = ['해당층', '총층','전용면적','방수', '욕실수','총주차대수']

# Mean-value imputer. The fitted instance is kept so that the means learned
# from the training data can be applied to other data later.
mean_imputer = SimpleImputer(strategy='mean')

# Learn the per-column means from the training features, then fill the gaps.
mean_imputer.fit(x[columns_fill_mean])
x[columns_fill_mean] = mean_imputer.transform(x[columns_fill_mean])

In [8]:
# Columns that are converted to integer codes with a LabelEncoder.
label_encode_cols = ['중개사무소','게재일','제공플랫폼','방향']

# One encoder per column, kept in a dict so each fitted encoder can be reused.
label_encoders = {col: LabelEncoder() for col in label_encode_cols}

# Values are cast to str before fitting, then replaced by their integer codes.
for col, le in label_encoders.items():
    x[col] = le.fit_transform(x[col].astype(str))

In [9]:
# Columns to one-hot encode.
one_hot_cols = ['매물확인방식', '주차가능여부']

# Build a dense-output encoder. scikit-learn 1.2 renamed `sparse` to
# `sparse_output` and 1.4 removed the old name, so try the new keyword first
# and fall back to the old one on older installations.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Fit on the training features and wrap the result in a DataFrame that shares x's index.
x_encoded = one_hot_encoder.fit_transform(x[one_hot_cols])
x_encoded_df = pd.DataFrame(x_encoded, columns=one_hot_encoder.get_feature_names_out(one_hot_cols), index=x.index)

# Replace the original categorical columns with their one-hot columns.
x = pd.concat([x.drop(columns=one_hot_cols), x_encoded_df], axis=1)



## 학습하기

In [10]:
# Random forest hyperparameters, collected in one place; random_state seeds the fit.
rf_params = {
    'n_estimators': 100,
    'criterion': 'gini',
    'max_depth': None,
    'random_state': 42,
}
model = RandomForestClassifier(**rf_params)
model.fit(x, y)

## 예측하기

In [11]:
# Load the test data from test.csv (path is relative to the working directory).
test = pd.read_csv('test.csv')

In [12]:
# Fill missing test values with the imputer already fitted on the training
# features (transform only, so no statistics are learned from the test set).
test[columns_fill_mean] = mean_imputer.transform(test[columns_fill_mean])

In [13]:
# Label-encode the test columns with the encoders fitted on the training data.
for col in label_encode_cols:
    if col in test.columns:
        le = label_encoders[col]
        test[col] = test[col].astype(str)
        # Categories that occur only in the test data get new codes appended
        # after the training classes. Sorting makes those codes reproducible:
        # iterating a set of strings directly yields an order that can change
        # between Python processes because of string hash randomisation.
        unseen = sorted(set(test[col].unique()) - set(le.classes_))
        if unseen:
            le.classes_ = np.append(le.classes_, unseen)
        test[col] = le.transform(test[col])

In [14]:
# One-hot encode the test columns with the encoder fitted on the training data.
one_hot_names = one_hot_encoder.get_feature_names_out(one_hot_cols)
test_encoded = one_hot_encoder.transform(test[one_hot_cols])
test_encoded_df = pd.DataFrame(
    data=test_encoded,
    index=test.index,
    columns=one_hot_names,
)

# Replace the original categorical columns with their one-hot columns.
test_without_categoricals = test.drop(columns=one_hot_cols)
test = pd.concat([test_without_categoricals, test_encoded_df], axis=1)

In [15]:
# Remove the identifier column, which is not a model feature. Re-binding the
# result (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run once 'ID' is already gone.
test = test.drop(columns=['ID'], errors='ignore')

In [16]:
# Select the feature columns in the exact order used to fit the model, so the
# prediction does not depend on the column order of test.csv and any column
# that is not a training feature is left out.
pred = pd.Series(model.predict(test[x.columns]))

## 제출하기

In [17]:
# Load the sample submission file; its label column is filled in below.
submit = pd.read_csv('sample_submission.csv')

In [18]:
# Assign positionally: a row-count mismatch between the predictions and the
# submission template then raises an error, instead of being index-aligned
# into NaN values or silently truncated.
submit['허위매물여부'] = pred.to_numpy()
submit.head()

Unnamed: 0,ID,허위매물여부
0,TEST_000,0
1,TEST_001,0
2,TEST_002,1
3,TEST_003,0
4,TEST_004,0


In [19]:
# Write the submission to CSV; index=False omits the DataFrame index column.
submit.to_csv('baseline_submission.csv',index=False)