In [1]:
#goo.gl/zbVRKa

import pandas as pd


## Load Dataset

In [2]:
# Load the training file; the "datetime" column is parsed into timestamps on read.
train = pd.read_csv(
    "data/train.csv",
    parse_dates=["datetime"],
)

# Print (rows, columns), then preview the first rows.
print(train.shape)
train.head()

(10886, 12)


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [3]:
# Load the test file; the "datetime" column is parsed into timestamps on read.
test = pd.read_csv(
    "data/test.csv",
    parse_dates=["datetime"],
)

# Print (rows, columns), then preview the first rows.
print(test.shape)
test.head()

(6493, 9)


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


## Preprocessing

### Parse Datetime

In [4]:
# Expand the parsed timestamp into one "datetime-<part>" column per component.
# The list order fixes the order in which the new columns are appended.
datetime_parts = ["year", "month", "day", "hour", "minute", "second", "dayofweek"]
for part in datetime_parts:
    train["datetime-" + part] = getattr(train["datetime"].dt, part)

# Shape after adding the columns, then the new columns next to their source.
print(train.shape)
train[["datetime"] + ["datetime-" + part for part in datetime_parts]].head()

(10886, 19)


Unnamed: 0,datetime,datetime-year,datetime-month,datetime-day,datetime-hour,datetime-minute,datetime-second,datetime-dayofweek
0,2011-01-01 00:00:00,2011,1,1,0,0,0,5
1,2011-01-01 01:00:00,2011,1,1,1,0,0,5
2,2011-01-01 02:00:00,2011,1,1,2,0,0,5
3,2011-01-01 03:00:00,2011,1,1,3,0,0,5
4,2011-01-01 04:00:00,2011,1,1,4,0,0,5


In [5]:
# Expand the parsed timestamp into one "datetime-<part>" column per component.
# The list order fixes the order in which the new columns are appended.
datetime_parts = ["year", "month", "day", "hour", "minute", "second", "dayofweek"]
for part in datetime_parts:
    test["datetime-" + part] = getattr(test["datetime"].dt, part)

# Shape after adding the columns, then the new columns next to their source.
print(test.shape)
test[["datetime"] + ["datetime-" + part for part in datetime_parts]].head()

(6493, 16)


Unnamed: 0,datetime,datetime-year,datetime-month,datetime-day,datetime-hour,datetime-minute,datetime-second,datetime-dayofweek
0,2011-01-20 00:00:00,2011,1,20,0,0,0,3
1,2011-01-20 01:00:00,2011,1,20,1,0,0,3
2,2011-01-20 02:00:00,2011,1,20,2,0,0,3
3,2011-01-20 03:00:00,2011,1,20,3,0,0,3
4,2011-01-20 04:00:00,2011,1,20,4,0,0,3


### Concatenate year and month

In [6]:
def concatenate_year_month(datetime):
    """Build a "<year>-<month>" label from an object with year/month attributes.

    The month is not zero-padded, e.g. January 2011 becomes "2011-1".
    """
    pieces = [str(datetime.year), str(datetime.month)]
    return "-".join(pieces)

# Attach the "year-month" label to both frames using the same helper.
for frame in (train, test):
    frame["datetime-year_month"] = frame["datetime"].apply(concatenate_year_month)

print(train.shape)
# NOTE(review): a cell renders only its last expression, so the train preview
# below is evaluated but never shown; only the test preview appears.
train[["datetime", "datetime-year_month"]].head()
test[["datetime", "datetime-year_month"]].head()


(10886, 20)


Unnamed: 0,datetime,datetime-year_month
0,2011-01-20 00:00:00,2011-1
1,2011-01-20 01:00:00,2011-1
2,2011-01-20 02:00:00,2011-1
3,2011-01-20 03:00:00,2011-1
4,2011-01-20 04:00:00,2011-1


### Working Friday

In [7]:
# Flag rows whose day-of-week is 4 (Friday, with Monday counted as 0) and
# that are also marked as a working day.
for frame in (train, test):
    is_friday = frame["datetime-dayofweek"].eq(4)
    is_working = frame["workingday"].eq(1)
    frame["work_fri"] = is_friday & is_working


### workingday&holiday 

In [8]:
# wh10: working day and not a holiday; wh01: holiday and not a working day.
train["wh10"] = train["workingday"].eq(1) & train["holiday"].eq(0)
train["wh01"] = train["workingday"].eq(0) & train["holiday"].eq(1)


In [9]:
# wh10: working day and not a holiday; wh01: holiday and not a working day.
test["wh10"] = test["workingday"].eq(1) & test["holiday"].eq(0)
test["wh01"] = test["workingday"].eq(0) & test["holiday"].eq(1)


### temp+humid/humid

In [10]:
# Ratio feature (temp + h) / h with h = humidity + 1.
# Presumably the +1 keeps the denominator non-zero at 0 humidity - TODO confirm.
for frame in (train, test):
    humidity_plus_one = frame["humidity"] + 1
    frame["temp+humid/humid"] = (frame["temp"] + humidity_plus_one) / humidity_plus_one

## Train

In [11]:
# Columns selected as model inputs, one per line for easy toggling.
feature_names = [
    "season",
    "weather",
    "wh01",
    "wh10",
    "work_fri",
    "temp",
    "atemp",
    "humidity",
    "datetime-year",
    "datetime-hour",
    "datetime-dayofweek",
]
# Open question: does the humidity computation have any effect?
feature_names

['season',
 'weather',
 'wh01',
 'wh10',
 'work_fri',
 'temp',
 'atemp',
 'humidity',
 'datetime-year',
 'datetime-hour',
 'datetime-dayofweek']

In [12]:
# Training inputs: every row, restricted to the selected feature columns.
X_train = train.loc[:, feature_names]

print(X_train.shape)
X_train.head()

(10886, 11)


Unnamed: 0,season,weather,wh01,wh10,work_fri,temp,atemp,humidity,datetime-year,datetime-hour,datetime-dayofweek
0,1,1,False,False,False,9.84,14.395,81,2011,0,5
1,1,1,False,False,False,9.02,13.635,80,2011,1,5
2,1,1,False,False,False,9.02,13.635,80,2011,2,5
3,1,1,False,False,False,9.84,14.395,75,2011,3,5
4,1,1,False,False,False,9.84,14.395,75,2011,4,5


In [13]:
# Test inputs: every row, restricted to the same feature columns as X_train.
X_test = test.loc[:, feature_names]

print(X_test.shape)
X_test.head()

(6493, 11)


Unnamed: 0,season,weather,wh01,wh10,work_fri,temp,atemp,humidity,datetime-year,datetime-hour,datetime-dayofweek
0,1,1,False,True,False,10.66,11.365,56,2011,0,3
1,1,1,False,True,False,10.66,13.635,56,2011,1,3
2,1,1,False,True,False,10.66,13.635,56,2011,2,3
3,1,1,False,True,False,10.66,12.88,56,2011,3,3
4,1,1,False,True,False,10.66,12.88,56,2011,4,3


In [14]:
# Target for the first model: the "registered" column.
label_name1 = "registered"
y_train1 = train[label_name1]

print(y_train1.shape)
# NOTE(review): not the last expression of the cell, so this preview is not rendered.
y_train1.head()
# Shape of the subset of rows whose label is missing.
missing_label_mask = y_train1.isnull()
print(train[missing_label_mask].shape)

(10886,)
(0, 24)


In [15]:
# Target for the second model: the "casual" column.
label_name2 = "casual"
y_train2 = train.loc[:, label_name2]

print(y_train2.shape)
y_train2.head()

(10886,)


0    3
1    8
2    5
3    3
4    0
Name: casual, dtype: int64

In [16]:
from sklearn.ensemble import RandomForestRegressor

# Reference: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
# Parameters of interest: n_estimators, max_depth, max_features

# Both forests are built from the same settings. model1 is later fit on the
# "registered" target and model2 on the "casual" target.
forest_params = {
    "n_estimators": 500,
    "random_state": 37,
    "max_depth": 45,
    "n_jobs": -1,
}
model1 = RandomForestRegressor(**forest_params)
model2 = RandomForestRegressor(**forest_params)


## Score

$$ \sqrt{\frac{1}{n} \sum_{i=1}^n (\log(p_i + 1) - \log(a_i+1))^2 } $$

In [17]:
import numpy as np
from sklearn.metrics import make_scorer

def rmsle(predict, actual):
    """Return the Root Mean Squared Logarithmic Error of two value sequences.

    Computes sqrt(mean((log(predict + 1) - log(actual + 1)) ** 2)).
    The formula is symmetric, so the order of the arguments does not
    change the result.

    Parameters
    ----------
    predict : array-like of numbers
        Predicted values.
    actual : array-like of numbers
        Observed values; must broadcast against ``predict``.

    Returns
    -------
    numpy.float64
        The error; 0.0 when both inputs are equal element-wise.

    Notes
    -----
    Inputs are expected to be non-negative. Values below -1 make the
    logarithm undefined, and the result is NaN.
    """
    # Coerce to float arrays so lists, Series and integer arrays all take
    # the same numeric path.
    predict = np.asarray(predict, dtype=float)
    actual = np.asarray(actual, dtype=float)

    # log1p(x) evaluates log(1 + x) without rounding 1 + x first, so small
    # values are not collapsed to 0 as they are with np.log(x + 1).
    log_difference = np.log1p(predict) - np.log1p(actual)

    return np.sqrt(np.mean(np.square(log_difference)))

# Wrap rmsle so it can be passed as a `scoring` argument.
# NOTE(review): no greater_is_better flag is passed, and rmsle is an error
# (lower is better); confirm the default before using this scorer for
# model selection rather than plain reporting.
rmsle_scorer = make_scorer(score_func=rmsle)
rmsle_scorer

make_scorer(rmsle)

# cross_val_score lives in sklearn.model_selection; the former
# sklearn.cross_validation module was deprecated in 0.18 and removed in 0.20.
from sklearn.model_selection import cross_val_score


# Mean RMSLE over 3 folds for the "registered" model.
score = cross_val_score(model1, X_train, y_train1, cv=3,
                        scoring=rmsle_scorer).mean()

print("Score = {0:.5f}".format(score))

# Mean RMSLE over 20 folds for the "casual" model.
# NOTE(review): the fold count differs from the model above (3 vs 20) - confirm this is intended.
score = cross_val_score(model2, X_train, y_train2, cv=20,
                        scoring=rmsle_scorer).mean()

print("Score = {0:.5f}".format(score))

## Train

In [18]:
# Fit each forest on the full training set against its own target.
# The second call stays last so the fitted estimator's repr is rendered.
model1.fit(X=X_train, y=y_train1)
model2.fit(X=X_train, y=y_train2)


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=45,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
           oob_score=False, random_state=37, verbose=0, warm_start=False)

In [19]:
# Predictions of the model fit on the "registered" target, one per test row.
predictions1 = model1.predict(X=X_test)

print(predictions1.shape)
predictions1

(6493,)


array([ 11.036     ,   4.604     ,   1.7915    , ...,  95.992     ,
        97.01666667,  43.826     ])

In [20]:
# Predictions of the model fit on the "casual" target, one per test row.
predictions2 = model2.predict(X=X_test)

print(predictions2.shape)
predictions2

(6493,)


array([ 2.018,  0.684,  1.33 , ...,  4.09 ,  3.462,  2.218])

## registered + casual

In [21]:
# Total prediction per row: element-wise sum of the two models' outputs.
predictions = predictions1 + predictions2

## Submit

In [22]:
# Start from the sample submission file and set its "count" column to the
# combined predictions.
submission = pd.read_csv("data/sampleSubmission.csv").assign(count=predictions)

print(submission.shape)
submission.head()

(6493, 2)


Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,13.054
1,2011-01-20 01:00:00,5.288
2,2011-01-20 02:00:00,3.1215
3,2011-01-20 03:00:00,2.982667
4,2011-01-20 04:00:00,2.352


In [23]:
# Write the submission without the row index column.
submission.to_csv(
    "beat-the-top-25.csv",
    index=False,
)

# NOTE(review): the figures below look like recorded scores - TODO confirm what each one refers to.
#.42293 .41088(24) .40850(25)