# PART2-1 Modeling: xgboost.XGBRFRegressor
wather 정보를 바탕으로 해당 날짜의 날씨를 예측하는 모델입니다.
- 예측할 날씨: 기온, 강수 유무, 눈 유무

### Library

In [1]:
# 기본 library
import warnings
warnings.filterwarnings(action='ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn library
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss

# preprocessing
# from category_encoders import OrdinalEncoder, OneHotEncoder, CountEncoder, TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, KBinsDiscretizer, QuantileTransformer

# Model library
from xgboost import XGBRegressor, XGBClassifier
from sklearn.ensemble import RandomForestRegressor

# tuning
from scipy.stats.distributions import uniform
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# feature 해석 library
from scipy.stats import spearmanr
from sklearn.feature_selection import SelectKBest, mutual_info_classif

### PreProcessing

### 필요한 Column
- tm: 시간
- ta: 기온
- dsnw: 적설
- rn: 강수

In [2]:
df = pd.read_csv('/Users/yerin/AIB/section4/project/PART1_DB/weather.csv')
df = df[['tm', 'ta', 'rn', 'dsnw']]
df

Unnamed: 0,tm,ta,rn,dsnw
0,2012-01-01 00:00,0.4,,
1,2012-01-01 01:00,0.3,,
2,2012-01-01 02:00,-0.1,,
3,2012-01-01 03:00,-0.5,,
4,2012-01-01 04:00,-1.2,,
...,...,...,...,...
184690,2003-12-26 19:00,-3.2,,
184691,2003-12-26 20:00,-3.9,,
184692,2003-12-26 21:00,-4.5,,
184693,2003-12-26 22:00,-4.9,,


In [3]:
df_original = df.copy()

In [4]:
df = df_original

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184695 entries, 0 to 184694
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   tm      184695 non-null  object 
 1   ta      184687 non-null  float64
 2   rn      18925 non-null   float64
 3   dsnw    8168 non-null    float64
dtypes: float64(3), object(1)
memory usage: 5.6+ MB


In [6]:
def feature_engineering(df):
    
    # split tm
    df['tm'] = pd.to_datetime(df['tm'])
    df['year'] = df['tm'].dt.year
    df['month'] = df['tm'].dt.month
    df['day'] = df['tm'].dt.day
    df['time'] = df['tm'].dt.hour

    df = df.drop('tm', axis=1)
    df = df.sort_values(['year', 'month']).reset_index(drop=True)
    
    # Missing Value
    df.rn = df.rn.fillna(0)
    df.dsnw = df.dsnw.fillna(0)
    df['ta'] = df.groupby(['month', 'day'])['ta'].transform(lambda x: x.fillna(x.mean()))
    
    # rain 분류
    idx0 = df[df.rn == 0].index
    idx1 = df[df.rn > 0].index
    idx2 = df[df.rn > 20].index
    
    df.rn.loc[idx0] = "맑음"
    df.rn.loc[idx1] = "비"
    df.rn.loc[idx2] = "많은 비"
    
    # snow
    idx_snow = df[df.dsnw > 0].index
    idx_no = df[df.dsnw == 0].index
    
    df.dsnw.loc[idx_snow] = "눈"
    df.dsnw.loc[idx_no] = "눈 없음"

    
    return df

In [7]:
df = feature_engineering(df)

In [8]:
"""
def get_time_range(hour):
    if 0 <= hour <= 6:
        return '새벽'
    elif 6 <= hour <= 11:
        return '오전'
    elif 12 <= hour <= 17:
        return '오후'
    else:
        return '저녁'

# 시간대 및 날짜별 강수량 평균 계산
df['time_range'] = df['time'].apply(get_time_range)
df_avg_rain = df.groupby(['year', 'month', 'day', 'time_range'])['rn'].mean().reset_index()

# 시간대별 평균값으로 채워넣기
df = df.merge(df_avg_rain, on=['year', 'month', 'day', 'time_range'], how='left', suffixes=('', '_avg'))
df['rn'] = df['rn_avg'].fillna(df['rn'])
df.drop(columns=['time_range', 'rn_avg'], inplace=True)

df['rn'] = df['rn'].round(2)

print(df)
"""

"\ndef get_time_range(hour):\n    if 0 <= hour <= 6:\n        return '새벽'\n    elif 6 <= hour <= 11:\n        return '오전'\n    elif 12 <= hour <= 17:\n        return '오후'\n    else:\n        return '저녁'\n\n# 시간대 및 날짜별 강수량 평균 계산\ndf['time_range'] = df['time'].apply(get_time_range)\ndf_avg_rain = df.groupby(['year', 'month', 'day', 'time_range'])['rn'].mean().reset_index()\n\n# 시간대별 평균값으로 채워넣기\ndf = df.merge(df_avg_rain, on=['year', 'month', 'day', 'time_range'], how='left', suffixes=('', '_avg'))\ndf['rn'] = df['rn_avg'].fillna(df['rn'])\ndf.drop(columns=['time_range', 'rn_avg'], inplace=True)\n\ndf['rn'] = df['rn'].round(2)\n\nprint(df)\n"

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184695 entries, 0 to 184694
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   ta      184695 non-null  float64
 1   rn      184695 non-null  object 
 2   dsnw    184695 non-null  object 
 3   year    184695 non-null  int32  
 4   month   184695 non-null  int32  
 5   day     184695 non-null  int32  
 6   time    184695 non-null  int32  
dtypes: float64(1), int32(4), object(2)
memory usage: 7.0+ MB


## Modeling

---

### 기온모델
기온 모델은 날짜만으로 기온을 예측하는 선형 회귀 모델이다.

In [10]:
X = df[['month', 'day', 'time']]
y = df['ta']

X.shape, y.shape

((184695, 3), (184695,))

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [12]:
model_ta = XGBRegressor(random_state = 42, max_depth = 15, learning_rate=0.1)
model_ta.fit(X_train, y_train)
y_val_pred = model_ta.predict(X_val)

rmse = mean_squared_error(y_val, y_val_pred, squared=False)
print(r2_score(y_val, y_val_pred))

0.8815918611566825


In [13]:
y_test_pred = model_ta.predict(X_test)
print(r2_score(y_test, y_test_pred))

0.8824712033183947


In [14]:
import pickle

with open('model_ta.pkl','wb') as pickle_file:
    pickle.dump(model_ta, pickle_file)

---

### 강수량 모델
강수량 모델은 기온 모델이 예측한 기온, 날짜를 이용하여 비를 예측하는 모델이다.

In [15]:
df_rn = df.copy()

In [16]:
df_rn = df_rn[df_rn.month <12]
df_rn = df_rn[df_rn.month >2]
df_rn = df_rn.reset_index(drop=True)

In [17]:
X = df_rn[['month', 'day', 'time', 'ta']]
y = df_rn['rn']

X.shape, y.shape

((139029, 4), (139029,))

In [18]:
y.value_counts()

rn
맑음      128855
비        10015
많은 비       159
Name: count, dtype: int64

In [19]:
from sklearn.preprocessing import LabelEncoder

# 레이블 인코더 생성
label_encoder = LabelEncoder()

# 문자열 클래스 레이블을 정수로 변환
y_encoded = label_encoder.fit_transform(y)

y_encoded

array([1, 1, 1, ..., 1, 1, 1])

In [20]:
np.unique(y_encoded, return_counts = True)

# 0 = 많은 비
# 1 = 비 안옴
# 2 = 비

(array([0, 1, 2]), array([   159, 128855,  10015]))

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [22]:
model_rn = XGBClassifier(random_state = 42, 
                         objective='multi:softmax', 
                         num_class=3, 
                         max_depth=15,
                         learning_rate = 0.25,
                         min_child_weight = 0.6)

In [23]:
model_rn.fit(X_train, y_train)
y_val_pred = model_rn.predict(X_val)
y_train_pred = model_rn.predict(X_train)

In [24]:
accuracy = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred, average='macro')
recall = recall_score(y_val, y_val_pred, average='macro')
f1 = f1_score(y_val, y_val_pred, average='macro')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.9236682400539447
Precision: 0.5246702447077837
Recall: 0.4238666318430633
F1 Score: 0.45015754894793414


In [25]:
y_test_pred = model_rn.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred, average='macro')

accuracy_test, f1_test

(0.925555635474358, 0.4990679876738579)

In [26]:
with open('model_rn.pkl','wb') as pickle_file:
    pickle.dump(model_rn, pickle_file)

---
눈

In [27]:
df_sn = df.copy()

In [28]:
df_sn = df_sn.query('(month < 3) or (month > 11)')

In [29]:
X = df_sn[['month', 'day', 'time', 'ta']]
y = df_sn['dsnw']

X.shape, y.shape

((45666, 4), (45666,))

In [30]:
y.value_counts(normalize=True)

dsnw
눈 없음    0.832107
눈       0.167893
Name: proportion, dtype: float64

In [31]:
# 레이블 인코더 생성
label_encoder = LabelEncoder()

# 문자열 클래스 레이블을 정수로 변환
y_encoded = label_encoder.fit_transform(y)

y_encoded

array([1, 1, 1, ..., 1, 1, 1])

In [40]:
np.unique(y_encoded, return_counts = True)

# 0= 눈
# 1=눈 없음

(array([0, 1]), array([ 7667, 37999]))

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [33]:
model_sn = XGBClassifier(random_state = 42, 
                         objective='binary:logistic',
                         max_depth=15,
                         learning_rate = 0.2,
                         min_child_weight = 0.6)

In [34]:
model_sn.fit(X_train, y_train)
y_val_pred = model_sn.predict(X_val)
y_train_pred = model_sn.predict(X_train)

In [35]:
accuracy = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred, average='macro')
recall = recall_score(y_val, y_val_pred, average='macro')
f1 = f1_score(y_val, y_val_pred, average='macro')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.8482277268372793
Precision: 0.7198713147008773
Recall: 0.6825295645004492
F1 Score: 0.6979535556416745


In [36]:
y_test_pred = model_sn.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred, average='macro')

accuracy_test, f1_test

(0.8481497700897744, 0.7025047390429628)

In [37]:
with open('model_sn.pkl','wb') as pickle_file:
    pickle.dump(model_sn, pickle_file)

---

기온 정보가 주어지지 않았을 때

In [10]:
df_rn = df.copy()

In [11]:
df_rn = df_rn[df_rn.month <12]
df_rn = df_rn[df_rn.month >2]
df_rn = df_rn.reset_index(drop=True)

In [12]:
X = df_rn[['month', 'day', 'time']]
y = df_rn['rn']

X.shape, y.shape

((139029, 3), (139029,))

In [13]:
from sklearn.preprocessing import LabelEncoder

# 레이블 인코더 생성
label_encoder = LabelEncoder()

# 문자열 클래스 레이블을 정수로 변환
y_encoded = label_encoder.fit_transform(y)

y_encoded

array([1, 1, 1, ..., 1, 1, 1])

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [15]:
model_rn = XGBClassifier(random_state = 42, 
                         objective='multi:softmax', 
                         num_class=3, 
                         max_depth=15,
                         learning_rate = 0.25,
                         min_child_weight = 0.6)

In [16]:
model_rn.fit(X_train, y_train)
y_val_pred = model_rn.predict(X_val)
y_train_pred = model_rn.predict(X_train)

In [17]:
accuracy = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred, average='macro')
recall = recall_score(y_val, y_val_pred, average='macro')
f1 = f1_score(y_val, y_val_pred, average='macro')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.9257810743987412
Precision: 0.34801539834937034
Recall: 0.3335050333623743
F1 Score: 0.3213055924185949


In [18]:
y_test_pred = model_rn.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred, average='macro')

accuracy_test, f1_test

(0.9274616989139035, 0.32112418413855603)