In [151]:
import pandas as pd
train = pd.read_csv("train_bike.csv",parse_dates=["datetime"])
test = pd.read_csv("test_bike.csv",parse_dates=["datetime"])
print(train.shape)
print(train.head())
print(train.dtypes)

(10886, 12)
             datetime  season  holiday  workingday  weather  temp   atemp  \
0 2011-01-01 00:00:00       1        0           0        1  9.84  14.395   
1 2011-01-01 01:00:00       1        0           0        1  9.02  13.635   
2 2011-01-01 02:00:00       1        0           0        1  9.02  13.635   
3 2011-01-01 03:00:00       1        0           0        1  9.84  14.395   
4 2011-01-01 04:00:00       1        0           0        1  9.84  14.395   

   humidity  windspeed  casual  registered  count  
0        81        0.0       3          13     16  
1        80        0.0       8          32     40  
2        80        0.0       5          27     32  
3        75        0.0       3          10     13  
4        75        0.0       0           1      1  
datetime      datetime64[ns]
season                 int64
holiday                int64
workingday             int64
weather                int64
temp                 float64
atemp                float64
humidity  

## Preprocessing

### parse datetime

In [163]:
train["datetime-year"] = train["datetime"].dt.year
train["datetime-month"] = train["datetime"].dt.month
train["datetime-day"] = train["datetime"].dt.day
train["datetime-hour"] = train["datetime"].dt.hour
train["datetime-minute"] = train["datetime"].dt.minute
train["datetime-second"] = train["datetime"].dt.second

In [165]:
train["commute_time_morning"] = (train["datetime-hour"]>6)&(test["datetime-hour"]<9)
train["commute_time_evening"] = (train["datetime-hour"]>16)&(test["datetime-hour"]<21)
train["cold_atemp_strong"] = (train["atemp"] <=10)
train["cold_atemp_weak"] = (train["atemp"]>10)&(test["atemp"]<=20)
train["warm_atemp_weak"] = (train["atemp"]>20)&(test["atemp"]<=30)
train["warm_atemp_strong"]= (train["atemp"]>30)&(test["atemp"]<=40)
train["hot_atemp"] = (train["atemp"] >40)

In [166]:
test["datetime-year"] = test["datetime"].dt.year
test["datetime-month"] = test["datetime"].dt.month
test["datetime-day"] = test["datetime"].dt.day
test["datetime-hour"] = test["datetime"].dt.hour
test["datetime-minute"] = test["datetime"].dt.minute
test["datetime-second"] = test["datetime"].dt.second
test["commute_time_morning"] = (test["datetime-hour"]>6)&(test["datetime-hour"]<9)
test["commute_time_evening"] = (test["datetime-hour"]>16)&(test["datetime-hour"]<21)
test["cold_atemp_strong"] = (test["atemp"] <=10)
test["cold_atemp_weak"] = (test["atemp"]>10)&(test["atemp"]<=20)
test["warm_atemp_weak"] = (test["atemp"]>20)&(test["atemp"]<=30)
test["warm_atemp_strong"]= (test["atemp"]>30)&(test["atemp"]<=40)
test["hot_atemp"] = (test["atemp"] >40)

In [168]:
feature_names = ["season", "holiday", "workingday", "weather", 
                 "atemp", "humidity",
                 "datetime-year","datetime-hour",
                 "commute_time_morning","commute_time_evening",
                 "cold_atemp_strong","cold_atemp_weak","warm_atemp_weak",
                 "warm_atemp_strong","hot_atemp"]

In [169]:
X_test = test[feature_names]
print(X_test.shape)
X_test.head()

(6493, 15)


Unnamed: 0,season,holiday,workingday,weather,atemp,humidity,datetime-year,datetime-hour,commute_time_morning,commute_time_evening,cold_atemp_strong,cold_atemp_weak,warm_atemp_weak,warm_atemp_strong,hot_atemp
0,1,0,1,1,11.365,56,2011,0,False,False,False,True,False,False,False
1,1,0,1,1,13.635,56,2011,1,False,False,False,True,False,False,False
2,1,0,1,1,13.635,56,2011,2,False,False,False,True,False,False,False
3,1,0,1,1,12.88,56,2011,3,False,False,False,True,False,False,False
4,1,0,1,1,12.88,56,2011,4,False,False,False,True,False,False,False


In [170]:
'''datetime - hourly date + timestamp  
season -  1 = spring, 2 = summer, 3 = fall, 4 = winter 
holiday - whether the day is considered a holiday
workingday - whether the day is neither a weekend nor holiday
weather - 1: Clear, Few clouds, Partly cloudy, Partly cloudy 
2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist 
3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds 
4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog 
temp - temperature in Celsius
atemp - "feels like" temperature in Celsius
humidity - relative humidity
windspeed - wind speed
casual - number of non-registered user rentals initiated
registered - number of registered user rentals initiated
count - number of total rentals'''

'datetime - hourly date + timestamp  \nseason -  1 = spring, 2 = summer, 3 = fall, 4 = winter \nholiday - whether the day is considered a holiday\nworkingday - whether the day is neither a weekend nor holiday\nweather - 1: Clear, Few clouds, Partly cloudy, Partly cloudy \n2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist \n3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds \n4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog \ntemp - temperature in Celsius\natemp - "feels like" temperature in Celsius\nhumidity - relative humidity\nwindspeed - wind speed\ncasual - number of non-registered user rentals initiated\nregistered - number of registered user rentals initiated\ncount - number of total rentals'

In [171]:
label_name_c = "casual"

c_train = train[label_name_c]

print(c_train.shape)

(10886,)


In [172]:
label_name_r = "registered"

r_train = train[label_name_r]
print(r_train.shape)

(10886,)


In [173]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(random_state=38)
model

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=38, verbose=0, warm_start=False)

## Score

Evaluation Metric = **Root Mean Squared Logarithmic Error** (RMSLE)


$$ \sqrt{\frac{1}{n} \sum_{i=1}^n (\log(p_i + 1) - \log(a_i+1))^2 } $$

In [174]:
import numpy as np
from sklearn.metrics import make_scorer

def rmsle(predict, actual):
    predict = np.array(predict)  #predict값을 metrics로 변환
    actual = np.array(actual) #actual값을 metrics로 변환
    
    log_predict = np.log(predict + 1)
    log_actual = np.log(actual + 1)
    
    difference = log_predict - log_actual
    difference = np.square(difference)
    
    mean_difference = difference.mean()
    score = np.sqrt(mean_difference)
    
    return score

rmsle_scorer = make_scorer(rmsle)
rmsle_scorer

make_scorer(rmsle)

In [175]:
from sklearn.model_selection import cross_val_score  #cross validation 방법을 통해 scoring하는 방법

#cv : cross validation generator
     # cross_val_score(estimator, X ,y=None, score_func = None, cv = None, n_jobs=1, verbose=0)

'''
estimator: estimator object implementing ‘fit’ :
The object to use to fit the data
X: array-like of shape at least 2D :
The data to fit.
y: array-like, optional :
The target variable to try to predict in the case of supervised learning.
score_func: callable, optional :
callable, has priority over the score function in the estimator. In a non-supervised setting, where y is None, it takes the test data (X_test) as its only argument. In a supervised setting it takes the test target (y_true) and the test prediction (y_pred) as arguments.
cv: cross-validation generator, optional :
A cross-validation generator. If None, a 3-fold cross validation is used or 3-fold stratified cross-validation when y is supplied and estimator is a classifier.
n_jobs: integer, optional :
The number of CPUs to use to do the computation. -1 means ‘all CPUs’.
verbose: integer, optional :
The verbosity level
'''

score_c = cross_val_score(model, X_train, c_train, cv = 20, scoring=rmsle_scorer).mean()
print("Score_c = {0:.5f}".format(score_c))
model.fit(X_train,c_train)
predictions_c = model.predict(X_test)
print(predictions_c)

score_r = cross_val_score(model, X_train, r_train, cv = 20, scoring=rmsle_scorer).mean()
print("Score_r = {0:.5f}".format(score_r))
model.fit(X_train,r_train)
predictions_r = model.predict(X_test)
print(predictions_r)

predictions = predictions_c + predictions_r

print(predictions.shape)
predictions[0:10]

Score_c = 0.81241


ValueError: Number of features of the model must match the input. Model n_features is 9 and input n_features is 15 

In [None]:
submit = pd.read_csv("sampleSubmission.csv")
print(submit.shape)
submit.head()

In [None]:
submit["count"] = predictions
print(submit.shape)
submit.head()

In [None]:
submit.to_csv("baseline-script.csv", index=False)