# **Random Forest**

---

- **목표**
    - 중고차 판매 이력 데이터셋을 이용해 중고차 가격을 예측하라.
- **알고리즘** : Random Forest
- **문제유형** : 회귀
- **종속변수** : selling_price(판매 가격)
- **사용한 모델** : RandomForestRegressor
- **데이터셋**
    - 파일명 : car.csv
    - 소개
        - 중고차 판매 이력을 다룬 데이터이다. 종속변수는 판매 가격이며, 독립변수로는 생산년도, 주행거리, 변속기, 연비(mileage), 배기량 등이 있다.
- **평가지표** : RMSE(Root Mean Square Error, 평균 제곱근 오차)

In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Remote location of car.csv; the file is fetched over the network each time this cell runs.
file_url = 'https://media.githubusercontent.com/media/musthave-ML10/data_source/main/car.csv'
# Parse the CSV into a DataFrame with pandas' default options.
data = pd.read_csv(file_url)

In [45]:
# Preview the first rows to see the raw format of each column.
data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [46]:
# List each column's dtype and non-null count to spot missing values and text-typed numerics.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8128 non-null   object 
 1   year           8128 non-null   int64  
 2   selling_price  8128 non-null   int64  
 3   km_driven      8128 non-null   int64  
 4   fuel           8128 non-null   object 
 5   seller_type    8128 non-null   object 
 6   transmission   8128 non-null   object 
 7   owner          8128 non-null   object 
 8   mileage        7907 non-null   object 
 9   engine         7907 non-null   object 
 10  max_power      7913 non-null   object 
 11  torque         7906 non-null   object 
 12  seats          7907 non-null   float64
dtypes: float64(1), int64(3), object(9)
memory usage: 825.6+ KB


In [47]:
round(data.describe(), 2) # summary statistics of the numeric columns, rounded to two decimals

Unnamed: 0,year,selling_price,km_driven,seats
count,8128.0,8128.0,8128.0,7907.0
mean,2013.8,638271.81,69819.51,5.42
std,4.04,806253.4,56550.55,0.96
min,1983.0,29999.0,1.0,2.0
25%,2011.0,254999.0,35000.0,5.0
50%,2015.0,450000.0,60000.0,5.0
75%,2017.0,675000.0,98000.0,5.0
max,2020.0,10000000.0,2360457.0,14.0


In [48]:
# Dry run: split each engine string on whitespace into separate columns (nothing is assigned here).
data['engine'].str.split(expand=True)

Unnamed: 0,0,1
0,1248,CC
1,1498,CC
2,1497,CC
3,1396,CC
4,1298,CC
...,...,...
8123,1197,CC
8124,1493,CC
8125,1248,CC
8126,1396,CC


In [49]:
# Overwrite 'engine' with the first split part and keep the second part in a temporary 'engine_unit' column.
data[['engine', 'engine_unit']] = data['engine'].str.split(expand=True)

In [50]:
# Check the result of the split; the values are still strings at this point.
data['engine'].head()

0    1248
1    1498
2    1497
3    1396
4    1298
Name: engine, dtype: object

In [51]:
data['engine'] = data['engine'].astype('float32') # convert to a numeric dtype
data['engine'].head() # check the converted engine column

0    1248.0
1    1498.0
2    1497.0
3    1396.0
4    1298.0
Name: engine, dtype: float32

In [52]:
data['engine_unit'].unique() # check the distinct unit values

array(['CC', nan], dtype=object)

In [53]:
# The helper unit column is no longer needed once it has been inspected.
data.drop(columns='engine_unit', inplace=True)

In [54]:
# Split max_power on whitespace: first part back into 'max_power', second part into a temporary unit column.
data[['max_power', 'max_power_unit']] = data['max_power'].str.split(expand=True)

In [55]:
# Check the split result; the values are still strings here.
data['max_power'].head()

0        74
1    103.52
2        78
3        90
4      88.2
Name: max_power, dtype: object

In [56]:
def isFloat(value):
    """Convert *value* to a float, or return NaN when it cannot be converted.

    Despite the name, this returns the converted number rather than a
    boolean.

    Args:
        value: Anything accepted by ``float()`` (number, numeric string, ...).

    Returns:
        ``float(value)`` on success, otherwise ``np.nan``.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # ValueError: non-numeric text such as 'bhp'.
        # TypeError: objects float() rejects outright, e.g. None.
        # `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
        return np.nan

In [57]:
data['max_power'] = data['max_power'].apply(isFloat) # values that cannot be parsed become NaN
data['max_power_unit'].unique() # check the distinct unit values

array(['bhp', nan, None], dtype=object)

In [58]:
# The helper unit column has been inspected and is not used afterwards.
data.drop(columns='max_power_unit', inplace=True)

In [59]:
# Split mileage on whitespace: first part back into 'mileage', second part into a temporary unit column.
data[['mileage', 'mileage_unit']] = data['mileage'].str.split(expand=True)
# Cast the first part from string to a numeric dtype.
data['mileage'] = data['mileage'].astype('float32')
# List the distinct units found in the temporary column.
data['mileage_unit'].unique()

array(['kmpl', 'km/kg', nan], dtype=object)

In [60]:
# List the fuel categories that the mileage adjustment has to handle.
data['fuel'].unique()

array(['Diesel', 'Petrol', 'LPG', 'CNG'], dtype=object)

In [61]:
def mile(x):
    """Scale a row's mileage by a fuel-specific divisor.

    Args:
        x: Mapping-like row providing ``'fuel'`` and ``'mileage'``.

    Returns:
        ``x['mileage']`` divided by the divisor for ``x['fuel']``; any fuel
        other than Petrol, Diesel, or LPG uses 44.23.
    """
    # NOTE(review): the divisors look like per-unit fuel prices, which would
    # make the result a distance-per-cost figure -- confirm the source.
    divisors = {'Petrol': 80.43, 'Diesel': 73.56, 'LPG': 40.85}
    divisor = divisors.get(x['fuel'], 44.23)
    return x['mileage'] / divisor

In [62]:
# Apply mile() row by row (axis=1) because it needs both the 'fuel' and 'mileage' of each row.
data['mileage'] = data.apply(mile, axis=1)

In [63]:
# The helper unit column is not used after the mileage adjustment.
data.drop(columns='mileage_unit', inplace=True)

In [64]:
# Inspect the raw torque strings, which mix units and layouts.
data['torque'].head()

0              190Nm@ 2000rpm
1         250Nm@ 1500-2500rpm
2       12.7@ 2,700(kgm@ rpm)
3    22.4 kgm at 1750-2750rpm
4       11.5@ 4,500(kgm@ rpm)
Name: torque, dtype: object

In [65]:
# Upper-case the text so the unit checks in torque_unit() only need to match 'NM' and 'KGM'.
data['torque'] = data['torque'].str.upper()

In [66]:
def torque_unit(x):
    """Return the torque unit named in *x*.

    The match is case-sensitive and expects upper-case text.

    Args:
        x: Torque description; it is converted with ``str()`` before matching.

    Returns:
        ``'Nm'`` if the text contains ``'NM'``, otherwise ``'kgm'`` if it
        contains ``'KGM'``, otherwise ``None``.
    """
    text = str(x)
    # 'NM' is tested first, so it wins when both markers are present.
    for marker, label in (('NM', 'Nm'), ('KGM', 'kgm')):
        if marker in text:
            return label
    return None

In [67]:
# Derive a unit label per row; rows with no recognizable unit get None.
data['torque_unit'] = data['torque'].apply(torque_unit)

In [68]:
# Inspect rows where no unit was detected. All three expressions are evaluated,
# but a notebook cell displays only the last one.
data['torque_unit'].isna()
data[data['torque_unit'].isna()]
data[data['torque_unit'].isna()]['torque'].unique()

array([nan, '250@ 1250-5000RPM', '510@ 1600-2400', '110(11.2)@ 4800',
       '210 / 1900'], dtype=object)

In [69]:
# Rows without a recognizable unit are treated as Nm.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the chained in-place form acts on an intermediate object, triggers a
# pandas warning, and does not update `data` when Copy-on-Write is enabled.
data['torque_unit'] = data['torque_unit'].fillna('Nm')

In [70]:
def split_num(x):
    """Return the leading run of digits and dots from ``str(x)``.

    Args:
        x: Any value; it is converted with ``str()`` first.

    Returns:
        The prefix of the text up to (not including) the first character
        that is not a digit or ``'.'``. This is the whole text when every
        character qualifies, and ``''`` when the first character does not
        (for example ``'NAN'``).
    """
    text = str(x)
    # Default to the full length so purely numeric (or empty) text is returned
    # unchanged instead of leaving `cut` unbound.
    cut = len(text)
    for pos, char in enumerate(text):
        if char not in '0123456789.':
            cut = pos
            break
    return text[:cut]

In [71]:
# Keep only the leading numeric text of each torque string; values with no leading number become ''.
data['torque'] = data['torque'].apply(split_num)

In [72]:
# Empty strings (no leading number found) become missing values so the cast below succeeds.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
data['torque'] = data['torque'].replace('', np.nan)
data['torque'] = data['torque'].astype('float64')

In [73]:
def torque_trans(x):
    """Return the row's torque, scaled when its unit is ``'kgm'``.

    Args:
        x: Mapping-like row providing ``'torque_unit'`` and ``'torque'``.

    Returns:
        ``x['torque'] * 9.8066`` when the unit is ``'kgm'`` (9.8066 is
        roughly the number of N*m in one kgf*m); otherwise ``x['torque']``
        unchanged.
    """
    is_kgm = x['torque_unit'] == 'kgm'
    return x['torque'] * 9.8066 if is_kgm else x['torque']

In [74]:
# Convert row by row (axis=1) because the conversion needs both the value and its unit.
data['torque'] = data.apply(torque_trans, axis=1)
# The unit column is no longer needed after the conversion.
data.drop('torque_unit', axis=1, inplace=True)

In [75]:
# Keep only the first whitespace-separated word of the car name.
data['name'] = data['name'].str.split(expand=True)[0]

In [76]:
# List the distinct first words to check for names that were cut mid-phrase.
data['name'].unique()

array(['Maruti', 'Skoda', 'Honda', 'Hyundai', 'Toyota', 'Ford', 'Renault',
       'Mahindra', 'Tata', 'Chevrolet', 'Fiat', 'Datsun', 'Jeep',
       'Mercedes-Benz', 'Mitsubishi', 'Audi', 'Volkswagen', 'BMW',
       'Nissan', 'Lexus', 'Jaguar', 'Land', 'MG', 'Volvo', 'Daewoo',
       'Kia', 'Force', 'Ambassador', 'Ashok', 'Isuzu', 'Opel', 'Peugeot'],
      dtype=object)

In [77]:
# The first-word split left 'Land' on its own; map it to 'Land Rover'.
data['name'] = data['name'].replace('Land', 'Land Rover')

In [78]:
# Drop every row that still has a missing value in any column, then show how many rows remain.
data.dropna(inplace=True)
len(data)

7906

In [79]:
# One-hot encode the categorical columns; drop_first omits the first level of each.
data = pd.get_dummies(
    data,
    columns=['name', 'fuel', 'seller_type', 'transmission', 'owner'],
    drop_first=True,
)

In [80]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('selling_price', axis=1),
    data['selling_price'],
    test_size=0.2,
    random_state=100,
)

In [81]:
from sklearn.ensemble import RandomForestRegressor

# Default hyperparameters; only the seed is fixed.
model = RandomForestRegressor(random_state=100)
model.fit(X_train, y_train)
# Predict on both splits so training and test error can be compared.
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

In [82]:
from sklearn.metrics import mean_squared_error

# RMSE is computed as the square root of the mean squared error.
print(f"Train rmse : {mean_squared_error(y_train, train_pred)**0.5}, Test rmse : {mean_squared_error(y_test, test_pred)**0.5}")

Train rmse : 53531.41548125947, Test rmse : 131855.18391308116


In [83]:
from sklearn.model_selection import KFold

data.reset_index(drop=True, inplace=True) # renumber rows from 0 after dropna; drop=True discards the old index instead of adding it as a new column
kf = KFold(n_splits=5)
X = data.drop('selling_price', axis=1) # features: every column except the target
y = data['selling_price'] # target variable

# Print the (train, test) index arrays generated for each fold.
for i, j in kf.split(X):
    print(i, j)

[1582 1583 1584 ... 7903 7904 7905] [   0    1    2 ... 1579 1580 1581]
[   0    1    2 ... 7903 7904 7905] [1582 1583 1584 ... 3160 3161 3162]
[   0    1    2 ... 7903 7904 7905] [3163 3164 3165 ... 4741 4742 4743]
[   0    1    2 ... 7903 7904 7905] [4744 4745 4746 ... 6322 6323 6324]
[   0    1    2 ... 6322 6323 6324] [6325 6326 6327 ... 7903 7904 7905]


In [84]:
# Dry run of the fold loop: when it ends, only the last fold's split remains bound.
# KFold.split yields positional indices, so rows are selected with .iloc; this
# stays correct even if the index of X / y is not a clean 0..n-1 range.
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [86]:
# Per-fold RMSE values, collected in fold order.
train_rmse_total = []
test_rmse_total = []

for train_index, test_index in kf.split(X):
    # KFold.split yields positional indices, so select rows with .iloc rather
    # than by label; this does not depend on the index having been reset.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # A fresh model per fold, with the same seed each time.
    model = RandomForestRegressor(random_state=100)
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    # RMSE is the square root of the mean squared error.
    train_rmse = mean_squared_error(y_train, train_pred) ** 0.5
    test_rmse = mean_squared_error(y_test, test_pred) ** 0.5

    train_rmse_total.append(train_rmse)
    test_rmse_total.append(test_rmse)

In [87]:
# Show the training RMSE of each fold.
train_rmse_total

[50825.5556350298,
 58854.04054344074,
 57904.19615940739,
 56218.23740006373,
 58967.150857632456]

In [88]:
# Average the per-fold RMSEs. Dividing by the list length instead of a literal 5
# keeps the mean correct if the number of folds is changed.
print(f"Train rmse : {sum(train_rmse_total)/len(train_rmse_total)}, Test rmse : {sum(test_rmse_total)/len(test_rmse_total)}")

Train rmse : 56553.836119114814, Test rmse : 142936.58918244042
