In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### 1. Load Dataset

In [None]:
import os
from glob import glob

# glob() returns matches in arbitrary, OS-dependent order, while later cells pick
# files by position. Sorting makes those positions stable across machines.
# This notebook also writes its "Score_*_submission.csv" output into the same
# directory, so those generated files are skipped; otherwise a re-run would
# shift every index.
# NOTE(review): later cells read index 0 as the sample submission, 1 as the test
# set and 2 as the training set — confirm the raw file names sort in that order.
file_list = sorted(
    path
    for path in glob("./data/bike/*.csv")
    if not os.path.basename(path).startswith("Score_")
)
file_list

In [None]:
# Read one CSV into `train`, parsing its "datetime" column into timestamps.
# NOTE(review): the index 2 is positional — it assumes the training CSV is the
# third entry of file_list; verify against the listing shown by the previous cell.
train = pd.read_csv(file_list[2], parse_dates=["datetime"])
train.shape

In [None]:
train.info()

In [None]:
train.head()

In [None]:
# Read one CSV into `test`, parsing its "datetime" column into timestamps.
# NOTE(review): the index 1 is positional — it assumes the test CSV is the
# second entry of file_list; verify against the file listing shown earlier.
test = pd.read_csv(file_list[1], parse_dates=["datetime"])
test.shape

In [None]:
test.info()

In [None]:
display(test.head())

### 2. Feature Engineering

In [None]:
# Expand the parsed timestamp into one column per calendar / clock component.
train_timestamp = train["datetime"].dt
for part in ("year", "month", "day", "hour", "minute", "second", "dayofweek"):
    train[part] = getattr(train_timestamp, part)
train.shape

In [None]:
# Same timestamp expansion as for the training frame, applied to the test frame.
test_timestamp = test["datetime"].dt
for part in ("year", "month", "day", "hour", "minute", "second", "dayofweek"):
    test[part] = getattr(test_timestamp, part)
display(test.shape)

In [None]:
# Show the five most frequent windspeed values of each frame, train first.
train_top_windspeed = train["windspeed"].value_counts().head()
test_top_windspeed = test["windspeed"].value_counts().head()
print(
    f"train windspeed column :\n{train_top_windspeed}"
    "\n\n"
    f"test windspeed column :\n{test_top_windspeed}"
)

In [None]:
# Two stacked panels: frequency of each windspeed value in train (top) and test (bottom).
fig, axes = plt.subplots(nrows=2)
fig.set_size_inches(12, 14)
train_ax, test_ax = axes

plt.sca(train_ax)
plt.xticks(rotation=30)
train_ax.set_title("Train's Windspeed")
sns.countplot(data=train, x="windspeed", ax=train_ax)

plt.sca(test_ax)
plt.xticks(rotation=30)
test_ax.set_title("Test's Windspeed")
sns.countplot(data=test, x="windspeed", ax=test_ax)

In [None]:
# Compare how many training rows report a windspeed of exactly 0 with how many do not.
train_windspeed_zero = train.loc[train["windspeed"] == 0]
train_windspeed_non_zero = train.loc[train["windspeed"] != 0]
zero_rows = len(train_windspeed_zero)
non_zero_rows = len(train_windspeed_non_zero)
print(f"train\nwindspeed = 0: {zero_rows}",
      f"windspeed > 0: {non_zero_rows}",
      sep="\n")

In [None]:
# Independent copies of the zero / non-zero windspeed rows, safe to modify below.
is_zero_windspeed = train["windspeed"] == 0
data_windspeed_zero = train[is_zero_windspeed].copy()
data_windspeed_non_zero = train[~is_zero_windspeed].copy()

In [None]:
data_windspeed_zero.head()

In [None]:
data_windspeed_non_zero.head()

In [None]:
feature_column_list = ["season", "weather", "humidity", "month", "temp", "year", "atemp"]
feature_column_list

In [None]:
# Turn the windspeed values into strings so that each distinct speed can act as
# a class label for the classifier fitted in a following cell.
# NOTE(review): presumably required because the classifier rejects continuous
# float targets — confirm.
data_windspeed_non_zero["windspeed"] = data_windspeed_non_zero["windspeed"].astype("str")

In [None]:
data_windspeed_non_zero.info()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# The forest is randomized; pin the seed so re-running the notebook imputes the
# same windspeed values (0 is the seed the notebook also uses for KFold and the
# final regressor).
rf_model_windspeed = RandomForestClassifier(random_state=0)

# Learn windspeed (as string class labels) from the rows that have a non-zero reading...
rf_model_windspeed.fit(data_windspeed_non_zero[feature_column_list], data_windspeed_non_zero["windspeed"])

# ...and predict a label for every row whose reading is 0.
windspeed_predict = rf_model_windspeed.predict(data_windspeed_zero[feature_column_list])

In [None]:
len(windspeed_predict)

In [None]:
# These are plain aliases, not copies: the "windspeed" assignments made through
# predict_windspeed_zero in the following cells also change data_windspeed_zero.
predict_windspeed_zero = data_windspeed_zero
predict_windspeed_non_zero = data_windspeed_non_zero

In [None]:
predict_windspeed_zero.head()

In [None]:
predict_windspeed_non_zero.head()

In [None]:
predict_windspeed_zero["windspeed"] = windspeed_predict
predict_windspeed_zero.head()

In [None]:
predict_windspeed_zero["windspeed"] = predict_windspeed_zero["windspeed"].astype("float")

In [None]:
plt.figure(figsize=(12, 5))
plt.xticks(rotation=30)
sns.countplot(predict_windspeed_zero,
              x="windspeed")

In [None]:
data = pd.concat([predict_windspeed_non_zero, predict_windspeed_zero]).sort_index()
data.head()


In [None]:
data["windspeed"] = data["windspeed"].astype("float")
data["windspeed"].describe()

In [None]:
def predict_windspeed(data: pd.DataFrame, random_state: int = 0) -> pd.DataFrame:
    """Replace windspeed readings of 0 with values predicted by a random forest.

    A windspeed of 0 is assumed to mean "not observed". Rows with a non-zero
    windspeed are used to fit a ``RandomForestClassifier`` in which every
    distinct speed is one class; the fitted model then predicts a speed for
    each zero row.

    Args:
        data: Frame with a numeric ``windspeed`` column plus the feature
            columns ``season``, ``weather``, ``humidity``, ``month``, ``temp``,
            ``year`` and ``atemp``. It is not modified.
        random_state: Seed passed to the classifier so that repeated calls on
            the same frame impute the same values.

    Returns:
        A new frame with the same rows, sorted by index, whose ``windspeed``
        column is float. When there is nothing to impute (no zero rows) or
        nothing to learn from (only zero rows), the values are left unchanged.
    """
    # Columns used as model inputs.
    feature_column_list = ["season", "weather", "humidity", "month", "temp", "year", "atemp"]

    # Split on the windspeed value: non-zero rows train the model, zero rows
    # are the ones to predict.
    data_windspeed_zero = data[data["windspeed"] == 0].copy()
    data_windspeed_non_zero = data[data["windspeed"] != 0].copy()

    if data_windspeed_zero.empty or data_windspeed_non_zero.empty:
        # Either every reading is already usable (e.g. the frame went through
        # this function before) or there is no known reading to learn from.
        # Fitting or predicting on an empty frame would raise, so return an
        # unchanged copy instead.
        unchanged = data.sort_index()
        unchanged["windspeed"] = unchanged["windspeed"].astype("float")
        return unchanged

    # Imported locally so the helper does not depend on an earlier exploratory
    # cell having been executed.
    from sklearn.ensemble import RandomForestClassifier

    # The classifier needs discrete labels, so each distinct speed becomes a
    # string class; the frame's own column stays numeric.
    windspeed_labels = data_windspeed_non_zero["windspeed"].astype("str")

    # Fit the random forest on the rows with a known windspeed.
    rf_model_windspeed = RandomForestClassifier(random_state=random_state)
    rf_model_windspeed.fit(data_windspeed_non_zero[feature_column_list], windspeed_labels)

    # Predict a windspeed label for every row whose reading is 0.
    windspeed_predict = rf_model_windspeed.predict(data_windspeed_zero[feature_column_list])

    # Store the predictions, converted from string labels back to numbers.
    data_windspeed_zero["windspeed"] = windspeed_predict.astype("float")

    # Re-assemble the two parts and restore the index order.
    result = pd.concat([data_windspeed_non_zero, data_windspeed_zero]).sort_index()
    result["windspeed"] = result["windspeed"].astype("float")

    return result



In [None]:
train = predict_windspeed(train)

In [None]:
test = predict_windspeed(test)

In [None]:
train.head()

In [None]:
test["windspeed"].describe()

In [None]:
# Two stacked panels: windspeed value frequencies in train (top) and test (bottom).
fig, axes = plt.subplots(nrows=2)
train_ax, test_ax = axes

fig.set_size_inches(12, 14)

plt.sca(train_ax)
train_ax.set_title("Train's Windspeed")
plt.xticks(rotation=30)
sns.countplot(data=train, x="windspeed", ax=train_ax)

plt.sca(test_ax)
test_ax.set_title("Test's Windspeed")
plt.xticks(rotation=30)
sns.countplot(data=test, x="windspeed", ax=test_ax)

### 3. Feature Selection

In [None]:
categorical_feature_list = ["season", 
                            "holiday", 
                            "workingday", 
                            "weather", 
                            "dayofweek",
                            "month", 
                            "year", 
                            "hour"]
categorical_feature_list

In [None]:
# Cast every listed column to pandas' "category" dtype in both frames.
for frame in (train, test):
    for category in categorical_feature_list:
        frame[category] = frame[category].astype("category")

In [None]:
test.info()

In [None]:
train.info()

In [None]:
train.columns

In [None]:
feature_list = ["season",
                "weather",
                "temp",
                "atemp",
                "humidity",
                "windspeed",
                "year",
                "hour",
                "dayofweek",
                "holiday",
                "workingday"]
feature_list

In [None]:
X_train = train[feature_list]
X_train.shape

In [None]:
X_train.head()

In [None]:
X_test = test[feature_list]
X_test.shape

In [None]:
X_test.head()

In [None]:
label_name = "count"

y_train = train[label_name]
y_train.shape

In [None]:
y_train.head()

### 4. Score

* Root Mean Squared Logarithmic Error

$ \sqrt{ \frac{1}{n} \sum_{i=1}^{n}{ (\log( p_i + 1) - \log(a_i + 1))^2} } $
$ = \sqrt{ \frac{1}{n} \sum_{i=1}^{n}{ (\log \frac {p_i + 1} {a_i + 1} )^2} } $

In [None]:
def rmsle(predicted_values, actual_values):
    """Return the Root Mean Squared Logarithmic Error of two value sequences.

    Computes ``sqrt(mean((log(p + 1) - log(a + 1)) ** 2))`` over all elements.

    Args:
        predicted_values: Array-like of predicted values ``p``.
        actual_values: Array-like of observed values ``a``; must be
            broadcastable against ``predicted_values``.

    Returns:
        The score as a NumPy float; 0 means the two inputs are identical.
    """
    predicted_values = np.asarray(predicted_values, dtype=float)
    actual_values = np.asarray(actual_values, dtype=float)

    # log1p(x) evaluates log(1 + x) without first rounding 1 + x, so values
    # close to zero keep their precision.
    log_difference = np.log1p(predicted_values) - np.log1p(actual_values)

    return np.sqrt(np.mean(np.square(log_difference)))



In [None]:
from sklearn.metrics import make_scorer

# Wrap rmsle so that cross_val_score can use it as a scoring callable.
# NOTE(review): make_scorer defaults to greater_is_better=True, so this scorer
# yields the raw positive RMSLE. That suits reporting a mean CV score, but a
# model-selection search would maximise it — pass greater_is_better=False
# before using it for tuning.
# scikit-learn invokes the metric as (y_true, y_pred), the reverse of rmsle's
# (predicted, actual) parameter order; the result is the same because the
# squared log difference is symmetric.
rmsle_scorer = make_scorer(rmsle)
rmsle_scorer

### 5. Cross Validation

In [None]:
from sklearn.model_selection import KFold

k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

### 6. Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# NOTE(review): max_depth_list is never read in the visible notebook — likely a
# leftover from an abandoned depth search.
max_depth_list = []

# Regressor used for both cross-validation and the final fit: 200 trees,
# n_jobs=-1 to use all available cores, fixed seed for repeatable results.
model = RandomForestRegressor(n_estimators=200,
                              n_jobs=-1,
                              random_state=0                           
                              )

model

In [None]:
from sklearn.model_selection import cross_val_score

# One RMSLE per fold, averaged into the single number reported for the model.
fold_scores = cross_val_score(model, X_train, y_train, cv=k_fold, scoring=rmsle_scorer)
score = fold_scores.mean()

print(f"Score= {score:.5f}")

### 7. Train

In [None]:
model.fit(X_train, y_train)

In [None]:
prediction = model.predict(X_test)
prediction.shape

In [None]:
prediction[:20]

In [None]:
# Side-by-side histograms: observed training counts (left) and predicted test counts (right).
fig, (ax1, ax2) = plt.subplots(ncols=2)

fig.set_size_inches(12, 6)

hist_options = dict(kde=True, bins=50)
sns.histplot(x=y_train.values, ax=ax1, **hist_options)
sns.histplot(x=prediction, ax=ax2, **hist_options)

### 8. Submit

In [None]:
# Load the frame whose "count" column is overwritten with the predictions and
# then saved as the submission file.
# NOTE(review): the index 0 is positional — it assumes the sample-submission CSV
# is the first entry of file_list; verify against the file listing shown earlier.
submission = pd.read_csv(file_list[0])
submission.head()

In [None]:
submission["count"] = prediction
submission.head()

In [None]:
submission.to_csv(f"./data/bike/Score_{score:.5f}_submission.csv", index=False)

In [None]:
# Read back the submission written by the previous cell. The path is rebuilt
# from `score` with the same format string used for writing, so the check
# follows the current run rather than a hard-coded result, and forward slashes
# keep it working on every OS.
pd.read_csv(f"./data/bike/Score_{score:.5f}_submission.csv")