In [205]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score


# -----------------------------
# 1) 데이터 준비
# -----------------------------

df = pd.read_csv("/content/drive/MyDrive/기계학습프로그래밍/실습 파일/weather.csv").dropna()

# RISK_MM 열과 사용하지 않을 Date, Location 열을 제거
df = df.drop(columns=["RISK_MM", "Date", "Location"])
df

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,8.0,24.3,0.0,3.4,6.3,NW,30.0,SW,NW,6.0,...,68,29,1019.7,1015.0,7,7,14.4,23.6,No,Yes
1,14.0,26.9,3.6,4.4,9.7,ENE,39.0,E,W,4.0,...,80,36,1012.4,1008.4,5,3,17.5,25.7,Yes,Yes
2,13.7,23.4,3.6,5.8,3.3,NW,85.0,N,NNE,6.0,...,82,69,1009.5,1007.2,8,7,15.4,20.2,Yes,Yes
3,13.3,15.5,39.8,7.2,9.1,NW,54.0,WNW,W,30.0,...,62,56,1005.5,1007.0,2,7,13.5,14.1,Yes,Yes
4,7.6,16.1,2.8,5.6,10.6,SSE,50.0,SSE,ESE,20.0,...,68,49,1018.3,1018.5,7,7,11.1,15.4,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,9.0,30.7,0.0,7.6,12.1,NNW,76.0,SSE,NW,7.0,...,38,15,1016.1,1010.8,1,3,20.4,30.0,No,No
362,7.1,28.4,0.0,11.6,12.7,N,48.0,NNW,NNW,2.0,...,45,22,1020.0,1016.9,0,1,17.2,28.2,No,No
363,12.5,19.9,0.0,8.4,5.3,ESE,43.0,ENE,ENE,11.0,...,63,47,1024.0,1022.8,3,2,14.5,18.3,No,No
364,12.5,26.9,0.0,5.0,7.1,NW,46.0,SSW,WNW,6.0,...,69,39,1021.0,1016.2,6,7,15.8,25.9,No,No


In [206]:
# 결측치 확인
df.isnull().sum()

Unnamed: 0,0
MinTemp,0
MaxTemp,0
Rainfall,0
Evaporation,0
Sunshine,0
WindGustDir,0
WindGustSpeed,0
WindDir9am,0
WindDir3pm,0
WindSpeed9am,0


In [207]:
# 컬럼명 확인
df.columns

Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RainTomorrow'],
      dtype='object')

In [208]:
# 모든 레이블을 숫자로 변형
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for column in df.columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

In [209]:
# 모든 컬럼 값이 숫자로 변경되었는지 확인
df

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,90,111,0,15,43,7,8,12,7,2,...,29,15,93,82,7,7,95,129,0,1
1,138,130,16,20,75,1,12,0,13,1,...,41,22,37,27,5,3,123,145,1,1
2,135,106,16,27,24,7,33,3,5,2,...,43,55,20,22,8,7,103,99,1,1
3,133,42,43,34,69,7,20,14,13,15,...,23,42,7,21,2,7,88,50,1,1
4,86,46,13,26,84,10,18,10,2,10,...,29,35,84,113,7,7,68,60,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,100,152,0,36,99,6,29,10,7,3,...,1,2,64,46,1,3,148,167,0,0
362,81,141,0,51,104,3,17,6,6,0,...,6,8,95,98,0,1,120,160,0,0
363,125,78,0,40,36,2,14,1,1,5,...,24,33,131,147,3,2,96,81,0,0
364,125,130,0,23,50,7,16,11,14,2,...,30,25,103,91,6,7,107,146,0,0


In [210]:
# 입력과 출력 분리
X = df.drop(columns=["Rainfall", "RainTomorrow"]) # RainTomorrow 사용 x
y = df["Rainfall"]

In [211]:
# 분리 후 확인
X

Unnamed: 0,MinTemp,MaxTemp,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,90,111,15,43,7,8,12,7,2,9,29,15,93,82,7,7,95,129,0
1,138,130,20,75,1,12,0,13,1,7,41,22,37,27,5,3,123,145,1
2,135,106,27,24,7,33,3,5,2,1,43,55,20,22,8,7,103,99,1
3,133,42,34,69,7,20,14,13,15,11,23,42,7,21,2,7,88,50,1
4,86,46,26,84,10,18,10,2,10,13,29,35,84,113,7,7,68,60,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,100,152,36,99,6,29,10,7,3,22,1,2,64,46,1,3,148,167,0
362,81,141,51,104,3,17,6,6,0,8,6,8,95,98,0,1,120,160,0
363,125,78,40,36,2,14,1,1,5,3,24,33,131,147,3,2,96,81,0
364,125,130,23,50,7,16,11,14,2,13,30,25,103,91,6,7,107,146,0


In [212]:
# 분리 후 확인
y


Unnamed: 0,Rainfall
0,0
1,16
2,16
3,43
4,13
...,...
361,0
362,0
363,0
364,0


In [213]:
# 훈련과 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [214]:
# 각 데이터의 shape 확인 (전체데이터 수, 컬럼숫자)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((262, 19), (66, 19), (262,), (66,))

In [215]:
# 의사결정나무 회귀 모델 객체 생성
dt = DecisionTreeRegressor(random_state=42)

# 랜덤포레스트 회귀 모델 객체 생성
rf = RandomForestRegressor(n_estimators=200, random_state=42)

# 선형 회귀 모델 객체 생성
lr = LinearRegression()

In [216]:
# -----------------------------
# 모델 학습
# -----------------------------

dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
lr.fit(X_train, y_train)

In [217]:
# 각 모델 예측
dt_y_pred = dt.predict(X_test)
rf_y_pred = rf.predict(X_test)
lr_y_pred = lr.predict(X_test)

In [218]:
# 각 모델 성능 평가
dt_mse = mean_squared_error(y_test, dt_y_pred)
dt_r2 = r2_score(y_test, dt_y_pred)

rf_mse = mean_squared_error(y_test, rf_y_pred)
rf_r2 = r2_score(y_test, rf_y_pred)

lr_mse = mean_squared_error(y_test, lr_y_pred)
lr_r2 = r2_score(y_test, lr_y_pred)

In [219]:
# MSE(제곱근 평균 제곱 오차) 계산
# R-squared(결정 계수) 계산

print("=== 의사 결정 나무 회귀 모델 ===")
print(f"Mean Squared Error (MSE): {dt_mse:.4f}")
print(f"R-squared (R2): {dt_r2:.4f}")
print("\n")

print("=== 랜덤 포레스트 회귀 모델 ===")
print(f"Mean Squared Error (MSE): {rf_mse:.4f}")
print(f"R-squared (R2): {rf_r2:.4f}")
print("\n")

print("=== 선형 회귀 모델 ===")
print(f"Mean Squared Error (MSE): {lr_mse:.4f}")
print(f"R-squared (R2): {lr_r2:.4f}")

=== 의사 결정 나무 회귀 모델 ===
Mean Squared Error (MSE): 37.8636
R-squared (R2): 0.6995


=== 랜덤 포레스트 회귀 모델 ===
Mean Squared Error (MSE): 35.3266
R-squared (R2): 0.7196


=== 선형 회귀 모델 ===
Mean Squared Error (MSE): 33.5511
R-squared (R2): 0.7337
