In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Global matplotlib text settings: draw all figure text with the "Malgun Gothic"
# font family and render minus signs as a plain ASCII hyphen instead of the
# Unicode minus glyph.
plt.rcParams.update({
    "font.family": "Malgun Gothic",
    "axes.unicode_minus": False,
})

In [8]:
from glob import glob

# glob() returns matches in arbitrary, OS-dependent order, but the loading
# cells pick files by position (file_list[1], file_list[2]). Sorting makes
# that positional lookup deterministic across platforms and re-runs.
file_list = sorted(glob("./data/bike/*.csv"))
file_list

['./data/bike\\sampleSubmission.csv',
 './data/bike\\test.csv',
 './data/bike\\train.csv']

In [11]:
# Load the training split (file_list[2]) and parse the "datetime" column as timestamps.
train = pd.read_csv(file_list[2], parse_dates=["datetime"])
train.shape

(10886, 12)

In [12]:
# Column dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    10886 non-null  datetime64[ns]
 1   season      10886 non-null  int64         
 2   holiday     10886 non-null  int64         
 3   workingday  10886 non-null  int64         
 4   weather     10886 non-null  int64         
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   casual      10886 non-null  int64         
 10  registered  10886 non-null  int64         
 11  count       10886 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(8)
memory usage: 1020.7 KB


In [14]:
# Load the test split (file_list[1]) and parse the "datetime" column as timestamps.
test = pd.read_csv(file_list[1], parse_dates=["datetime"])
test.shape

(6493, 9)

In [16]:
# Column dtypes and non-null counts of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6493 entries, 0 to 6492
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    6493 non-null   datetime64[ns]
 1   season      6493 non-null   int64         
 2   holiday     6493 non-null   int64         
 3   workingday  6493 non-null   int64         
 4   weather     6493 non-null   int64         
 5   temp        6493 non-null   float64       
 6   atemp       6493 non-null   float64       
 7   humidity    6493 non-null   int64         
 8   windspeed   6493 non-null   float64       
dtypes: datetime64[ns](1), float64(3), int64(5)
memory usage: 456.7 KB


In [19]:
# Peek at the first parsed timestamps of the training frame.
train.datetime.head()

0   2011-01-01 00:00:00
1   2011-01-01 01:00:00
2   2011-01-01 02:00:00
3   2011-01-01 03:00:00
4   2011-01-01 04:00:00
Name: datetime, dtype: datetime64[ns]

In [22]:
# Split the timestamp into calendar parts; each part is stored in a new
# column named after the corresponding `.dt` accessor attribute.
for part in ("year", "month", "day", "hour", "dayofweek"):
    train[part] = getattr(train["datetime"].dt, part)
train.shape

(10886, 17)

In [23]:
# Split the timestamp into calendar parts; each part is stored in a new
# column named after the corresponding `.dt` accessor attribute.
for part in ("year", "month", "day", "hour", "dayofweek"):
    test[part] = getattr(test["datetime"].dt, part)
test.shape

(6493, 14)

In [29]:
# Columns that are converted to the pandas "category" dtype in a later cell.
categorical_feature_list = [
    "season", "holiday", "workingday", "weather",
    "year", "month", "day", "hour", "dayofweek",
]
len(categorical_feature_list)

9

In [35]:
# Apply the same categorical conversion to both splits so their dtypes match.
for frame in (train, test):
    frame[categorical_feature_list] = frame[categorical_feature_list].astype("category")

In [36]:
# Re-inspect the training frame's dtypes after the categorical conversion.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    10886 non-null  datetime64[ns]
 1   season      10886 non-null  category      
 2   holiday     10886 non-null  category      
 3   workingday  10886 non-null  category      
 4   weather     10886 non-null  category      
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   casual      10886 non-null  int64         
 10  registered  10886 non-null  int64         
 11  count       10886 non-null  int64         
 12  year        10886 non-null  category      
 13  month       10886 non-null  category      
 14  day         10886 non-null  category      
 15  hour        10886 non-null  category      
 16  dayofweek   10886 non-

In [37]:
# Re-inspect the test frame's dtypes after the categorical conversion.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6493 entries, 0 to 6492
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    6493 non-null   datetime64[ns]
 1   season      6493 non-null   category      
 2   holiday     6493 non-null   category      
 3   workingday  6493 non-null   category      
 4   weather     6493 non-null   category      
 5   temp        6493 non-null   float64       
 6   atemp       6493 non-null   float64       
 7   humidity    6493 non-null   int64         
 8   windspeed   6493 non-null   float64       
 9   year        6493 non-null   category      
 10  month       6493 non-null   category      
 11  day         6493 non-null   category      
 12  hour        6493 non-null   category      
 13  dayofweek   6493 non-null   category      
dtypes: category(9), datetime64[ns](1), float64(3), int64(1)
memory usage: 313.4 KB


In [28]:
# Columns used as model inputs.
# NOTE(review): "hour" (derived from the timestamp) and "windspeed" exist in
# both frames but are not listed here — confirm the omission is intentional.
feature_name = [
    "season", "holiday", "workingday", "weather",
    "temp", "atemp", "humidity",
    "year", "month", "day", "dayofweek",
]
len(feature_name)

11

In [50]:
# Training feature matrix: only the columns named in feature_name.
X_train = train[feature_name]
print(f"{X_train.shape}")
display(X_train.head())


(10886, 11)


Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,year,month,day,dayofweek
0,1,0,0,1,9.84,14.395,81,2011,1,1,5
1,1,0,0,1,9.02,13.635,80,2011,1,1,5
2,1,0,0,1,9.02,13.635,80,2011,1,1,5
3,1,0,0,1,9.84,14.395,75,2011,1,1,5
4,1,0,0,1,9.84,14.395,75,2011,1,1,5


In [49]:
# Test feature matrix: the same feature_name columns, taken from the test frame.
X_test = test[feature_name]
print(f"{X_test.shape}")
display(X_test.head())

(6493, 11)


Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,year,month,day,dayofweek
0,1,0,1,1,10.66,11.365,56,2011,1,20,3
1,1,0,1,1,10.66,13.635,56,2011,1,20,3
2,1,0,1,1,10.66,13.635,56,2011,1,20,3
3,1,0,1,1,10.66,12.88,56,2011,1,20,3
4,1,0,1,1,10.66,12.88,56,2011,1,20,3


In [52]:
# Name of the target column.
label_name = "count"

# Target vector: the label column of the training frame (a pandas Series).
y_train = train[label_name]
print(f"{y_train.shape[0]}, type:{type(y_train)}")
y_train.head()

10886, type:<class 'pandas.core.series.Series'>


0    16
1    40
2    32
3    13
4     1
Name: count, dtype: int64

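* Root Mean Squared Logarithmic Error (RMSLE), where $p_i$ is the predicted value, $a_i$ is the actual value, and $n$ is the number of samples: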

$ \sqrt{ \frac{1}{n} \sum_{i=1}^{n}{ (\log( p_i + 1) - \log(a_i + 1))^2} } $
$ = \sqrt{ \frac{1}{n} \sum_{i=1}^{n}{ (\log \frac {p_i + 1} {a_i + 1} )^2} } $

In [53]:
from sklearn.metrics import make_scorer


def rmsle(predicted_values, actual_values):
    """Return the Root Mean Squared Logarithmic Error of two value sequences.

    Both inputs are converted to NumPy arrays and mapped through
    ``log(1 + x)``; the result is the square root of the mean squared
    difference between the two log-transformed arrays.

    Parameters
    ----------
    predicted_values : array-like
        Predicted values.
    actual_values : array-like
        Observed values, combined element-wise with ``predicted_values``.

    Returns
    -------
    NumPy floating scalar
        ``sqrt(mean((log1p(predicted) - log1p(actual)) ** 2))``.
    """
    log_pred = np.log1p(np.array(predicted_values))
    log_true = np.log1p(np.array(actual_values))

    squared_log_error = np.square(log_pred - log_true)
    return np.sqrt(squared_log_error.mean())



In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

# Linear regression model constructed with its default arguments.
lr_model = LinearRegression()



