## Bike Sharing Demand

https://www.kaggle.com/c/bike-sharing-demand

数据描述
```
       (1) datetime：代表数据日期，以年-月-日 小时的形式给出。 
       (2) season：数据记录时的季节。1 为春季, 2为夏季,3 为秋季,4 为冬季。 
       (3) holiday：当日是否为假期。1代表是，0代表不是。 
       (4) workingday：当日是否为工作日，即既不是周末也不是假期。1代表是，0代表不是。 
       (5) weather:当日天气： 
              1: 天气晴朗或者少云/部分有云。 
              2: 有雾和云/风等。 
              3: 小雪/小雨，闪电及多云。 
              4: 大雨/冰雹/闪电和大雾/大雪。 
       (6) temp - 当日摄氏温度。 
       (7) atemp - 当日人们感觉的温度。
       (8) humidity - 当日湿度。 
       (9) windspeed - 风速。 
       (10) casual -非预定自行车的人数 
       (11) registered - 登记预定自行车的人数。 
       (12) count - 总租车数，我们需要预测的值。即casual+registered数目。 
```

直觉上的分析：

```
1 从datetime中提取出月份、日期、workday 、小时等信息，一般来说：不同季节、不同日期、是否是周末、上下班时间的租车数量是不同的
2 天气特征：temp、atemp需要分析其相关性，是否可以保留一个就好了（去除另外一个）
3 casual 和registered之和是 count总数，这两列可以移除
4  datetime抽取特征后也可以移除
```


In [None]:

import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn import linear_model, metrics, preprocessing, pipeline, grid_search
from sklearn.model_selection import train_test_split


In [54]:
# Cost function: root mean squared logarithmic error (RMSLE).
def rmsle(y, y_):
    """Return the root mean squared logarithmic error of two value arrays.

    Computes ``sqrt(mean((log(y + 1) - log(y_ + 1)) ** 2))``.

    Args:
        y: Array-like of numbers (list, ndarray, pandas Series, ...).
        y_: Array-like of numbers, broadcast-compatible with ``y``.

    Returns:
        The RMSLE as a NumPy float.

    Note:
        ``log(v + 1)`` is NaN for ``v < -1`` and ``-inf`` for ``v == -1``.
        ``np.nan_to_num`` maps NaN to 0 and infinities to large finite
        numbers, so such entries do not turn the whole score into NaN.
    """
    # nan_to_num below deals with invalid logs explicitly, so the matching
    # NumPy RuntimeWarnings would only be noise.
    with np.errstate(divide="ignore", invalid="ignore"):
        log_y = np.nan_to_num(np.log1p(np.asarray(y, dtype=float)))
        log_y_ = np.nan_to_num(np.log1p(np.asarray(y_, dtype=float)))
    return np.sqrt(np.mean(np.square(log_y - log_y_)))

In [55]:

# Load the training set from a path relative to the working directory.
data =  pd.read_csv('../data/train.csv')

# Preview the first three rows.
data.head(3)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32


In [56]:
# True if at least one cell in the frame is missing.
data.isnull().values.any()

False

In [57]:
# Parse the whole 'datetime' column in one vectorized call instead of
# invoking pd.to_datetime once per row.
data.datetime = pd.to_datetime(data.datetime)
data.head(3)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32


In [58]:
# Month of each timestamp, read through the vectorized .dt accessor
# (requires the 'datetime' column to hold datetimes, not strings).
data['month'] = data.datetime.dt.month
data.head(3)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,month
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,1
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,1


In [59]:
# Hour of day of each timestamp, read through the vectorized .dt accessor
# (requires the 'datetime' column to hold datetimes, not strings).
data['hour'] = data.datetime.dt.hour
data.head(3)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,month,hour
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,1,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,1,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,1,2


In [60]:
# Target vector: the 'count' column.
labels = data['count']
labels.values

array([ 16,  40,  32, ..., 168, 129,  88])

In [61]:
# Hold out 23.5% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
train_data, test_data, train_labels, test_labels = train_test_split(
    data,
    labels,
    test_size=0.235,
    random_state=42,
)

In [62]:
# (rows, columns) of the training split.
train_data.shape

(8327, 14)

In [63]:
# (rows, columns) of the held-out split.
test_data.shape

(2559, 14)

In [64]:
# Remove the raw timestamp and the target ('count') together with its two
# components ('casual', 'registered') so that only features remain.
train_data = train_data.drop(['datetime', 'count', 'casual', 'registered'], axis=1)

In [65]:
# Drop the timestamp and the target-related columns from the held-out split.
test_data = test_data.drop(['datetime', 'count','casual', 'registered'], axis=1)

In [66]:
# Preview the first three rows of the training features.
train_data.head(3)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,month,hour
10082,4,0,1,1,13.94,15.91,46,15.0013,11,11
7203,2,0,1,3,18.86,22.725,51,7.0015,4,12
3006,3,0,1,1,25.42,31.06,53,16.9979,7,4


In [67]:
# One boolean mask per feature group, aligned with train_data.columns:
# True where the column belongs to the group.
binary_data_columns = ['holiday', 'workingday']
binary_data_indices = train_data.columns.isin(binary_data_columns)

categorical_data_columns = ['season', 'weather', 'month']
categorical_data_indices = train_data.columns.isin(categorical_data_columns)

numeric_data_columns = ['temp', 'atemp', 'humidity', 'windspeed', 'hour']
numeric_data_indices = train_data.columns.isin(numeric_data_columns)

In [69]:
# Branches of the FeatureUnion used by every model pipeline. Each branch
# picks its columns with a boolean mask. np.asarray makes the positional
# `[:, mask]` indexing work whether the transformer receives a DataFrame
# or an already-converted ndarray.
transformer_list = [
    # binary: passed through unchanged
    ('binary_variables_processing', preprocessing.FunctionTransformer(
        lambda X: np.asarray(X)[:, binary_data_indices])),

    # numeric: scaled to unit variance, without centring (with_mean=False)
    ('numeric_variables_processing', pipeline.Pipeline(steps=[
        ('selecting', preprocessing.FunctionTransformer(
            lambda X: np.asarray(X)[:, numeric_data_indices])),
        ('scaling', preprocessing.StandardScaler(with_mean=False)),
    ])),

    # categorical: one-hot encoded; categories unseen during fit are ignored
    ('categorical_variables_processing', pipeline.Pipeline(steps=[
        ('selecting', preprocessing.FunctionTransformer(
            lambda X: np.asarray(X)[:, categorical_data_indices])),
        ('hot_encoding', preprocessing.OneHotEncoder(handle_unknown='ignore')),
    ])),
]

In [76]:
# Baseline model: Lasso linear regression, capped at 4000 solver iterations.
regressor = linear_model.Lasso(max_iter=4000)


In [77]:
# Chain the feature preprocessing with the regressor so both are fitted together.
estimator =  pipeline.Pipeline(steps=[
    ('feature_preocessing', pipeline.FeatureUnion(transformer_list =  transformer_list)),
    ('model_fitting', regressor)
])

# Fit on the training split and score on the held-out split.
estimator.fit(train_data, train_labels)
predicted =  estimator.predict(test_data)

print("RMSLE: ", rmsle(test_labels, predicted))
print("MAE: ",  metrics.mean_absolute_error(test_labels, predicted))



RMSLE:  1.2611449216096313
MAE:  106.9133919864821


  after removing the cwd from sys.path.


# 使用超参数搜索

In [84]:
# GridSearchCV is provided by sklearn.model_selection; the legacy
# sklearn.grid_search module is deprecated (removed in scikit-learn 0.20).
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths for the 'model_fitting' pipeline step.
parameters_grid = {'model_fitting__alpha': [0.1, 1, 2, 3, 4, 10, 30]}

# RMSLE is an error, so the scorer is flagged lower-is-better.
rmsle_scorer = metrics.make_scorer(rmsle, greater_is_better=False)
grid_cv = GridSearchCV(estimator, parameters_grid, scoring=rmsle_scorer, cv=4)
grid_cv.fit(train_data, train_labels)

# Score the best configuration on the held-out split.
predicted = grid_cv.best_estimator_.predict(test_data)

print("RMSLE: ", rmsle(test_labels, predicted))
print("Best params: ", grid_cv.best_params_)

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


RMSLE:  1.2282550963939098
Best params:  {'model_fitting__alpha': 4}


  after removing the cwd from sys.path.


# 使用随机森林

In [95]:

from sklearn.ensemble import RandomForestRegressor

# Same preprocessing as before, with a random forest as the final step.
# This rebinds both `regressor` and `estimator`.
regressor = RandomForestRegressor(random_state=0, max_depth=20, n_estimators=300)
estimator =  pipeline.Pipeline(steps=[
    ('feature_preocessing', pipeline.FeatureUnion(transformer_list =  transformer_list)),
    ('model_fitting', regressor)
])

# Fit on the training split and report RMSLE on the held-out split.
estimator.fit(train_data, train_labels)
print('randomforest rmsle:', rmsle(test_labels,estimator.predict(test_data)))





randomforest rmsle: 0.419156345667906



# 使用GBDT

In [86]:
from sklearn.ensemble import GradientBoostingRegressor

# Same preprocessing as before, with gradient-boosted trees as the final step.
# NOTE(review): this rebinds `estimator`, which the final prediction cell
# also uses — whichever model cell ran last decides what gets submitted.
gbr = GradientBoostingRegressor(n_estimators=300, learning_rate=0.9, max_depth=4)
estimator = pipeline.Pipeline(steps=[
    ('feature_preocessing', pipeline.FeatureUnion(transformer_list=transformer_list)),
    ('model_fitting', gbr),
])

# Fit on the training split and report RMSLE on the held-out split, labelled
# with the model actually fitted here.
estimator.fit(train_data, train_labels)
print('gbdt rmsle:', rmsle(test_labels, estimator.predict(test_data)))



randomforest rmsle: 0.963975264957492


  after removing the cwd from sys.path.


# 预测测试数据

In [96]:
# Load the competition test set and keep its 'datetime' column, which is
# used as the identifier column of the submission.
real_test_data = pd.read_csv("../data/test.csv")
real_test_data_ids = real_test_data["datetime"]
real_test_data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


In [97]:
# Derive the 'month' and 'hour' features for the test set, using vectorized
# datetime parsing and the .dt accessor instead of per-row apply calls.
real_test_data.datetime = pd.to_datetime(real_test_data.datetime)
real_test_data['month'] = real_test_data.datetime.dt.month
real_test_data['hour'] = real_test_data.datetime.dt.hour
real_test_data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,month,hour
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,1,0
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,1,1
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,1,2
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,1,3
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,1,4


In [98]:
# Drop the raw timestamp; the training features did not include it.
real_test_data = real_test_data.drop(['datetime'], axis = 1)
# NOTE(review): `estimator` is whichever pipeline was fitted last in the
# session — the cell numbers suggest the random forest; confirm before submitting.
real_test_predictions = estimator.predict(real_test_data)

In [99]:
# Smallest prediction; a negative value here would be an invalid count.
real_test_predictions.min()

1.5122222222222224

In [100]:
# Build the submission frame. Counts cannot be negative, so predictions are
# clipped at zero in one vectorized call.
submission = pd.DataFrame({
    "datetime": real_test_data_ids,
    "count": np.clip(real_test_predictions, 0, None),
})
submission.head()


Unnamed: 0,count,datetime
0,14.673333,2011-01-20 00:00:00
1,5.866667,2011-01-20 01:00:00
2,5.859487,2011-01-20 02:00:00
3,4.13,2011-01-20 03:00:00
4,3.86,2011-01-20 04:00:00


In [101]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv('../result/bike_prediction.csv', index= False)