In [3]:
import numpy as np
import pandas as pd

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings emitted by the libraries used below.
warnings.filterwarnings('ignore')
# Use seaborn's 'darkgrid' style for plots.
sns.set_style('darkgrid')

from scipy import stats

In [4]:
# Read the training data from 'bike_train.csv' (decoded as cp949) into `train`.
# NOTE(review): the displayed output has an 'Unnamed: 0' column, which looks like a
# row index saved into the CSV -- confirm, and consider index_col=0 if so.
train = pd.read_csv('bike_train.csv', encoding = 'cp949')
# Print the (rows, columns) shape, then show the first five rows as the cell output.
print(train.shape)
train.head()

(10886, 12)


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [5]:
# Read the test data from 'bike_test.csv' (decoded as cp949) into `test`.
# NOTE(review): the displayed output has an 'Unnamed: 0' column, which looks like a
# row index saved into the CSV -- confirm, and consider index_col=0 if so.
test = pd.read_csv('bike_test.csv', encoding = 'cp949')
# Print the (rows, columns) shape, then show the first five rows as the cell output.
print(test.shape)
test.head()

(6493, 9)


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


In [8]:
# Convert the 'datetime' column of both frames from strings to pandas datetimes,
# then derive 'year' and 'hour' columns from the parsed values.
# The frames are updated in place, train first and test second.
for frame in (train, test):
    frame['datetime'] = pd.to_datetime(frame['datetime'])
    frame['year'] = frame['datetime'].dt.year
    frame['hour'] = frame['datetime'].dt.hour


In [9]:
# 1. Create the rainy_days indicator
# rainy_days is 1 when the `weather` code is 3 or 4 and 0 otherwise, so that a
# set flag means what the column name says.
# NOTE(review): codes 3 and 4 are presumed to be the rain/snow categories --
# confirm against the dataset description.
RAINY_WEATHER_CODES = [3, 4]

# train
train['rainy_days'] = train['weather'].isin(RAINY_WEATHER_CODES).astype(int)

# test
test['rainy_days'] = test['weather'].isin(RAINY_WEATHER_CODES).astype(int)


In [10]:
# 2. One-hot encode the hour column
# Every hour value becomes an 'h_<hour>' indicator column; the 'h_4' column is
# then dropped from both results.

# train
tr_hour_series = pd.get_dummies(train['hour'], prefix='h').drop(columns='h_4')

# test
te_hour_series = pd.get_dummies(test['hour'], prefix='h').drop(columns='h_4')

In [13]:
# 3. Build workingday * hour interaction features
from sklearn.preprocessing import PolynomialFeatures as poly

# train
# Column 0 is workingday, followed by the hour indicator columns.
hw_tr = pd.concat([train['workingday'], tr_hour_series], axis = 1)

# Fit on a plain float array (no column labels) so the generated feature names are
# the positional 'x0', 'x1', ... that the selection below relies on.
m_poly = poly(degree=2, interaction_only = True)
train_x_poly = m_poly.fit_transform(hw_tr.to_numpy(dtype=float))

# Names of the expanded columns. get_feature_names() was removed in scikit-learn 1.2,
# so prefer get_feature_names_out() and fall back only on releases that lack it.
if hasattr(m_poly, 'get_feature_names_out'):
    names = list(m_poly.get_feature_names_out())
else:
    names = list(m_poly.get_feature_names())

# 'x0' is workingday, so its interaction terms are exactly the names that start
# with 'x0 '. This leaves out the bare 'x0' (workingday) column itself.
col = [name for name in names if name.startswith('x0 ')]

# Reuse hw_tr's index so the result lines up row for row with `train` when the
# frames are concatenated column-wise.
tr_hw_poly = pd.DataFrame(train_x_poly, columns = names, index = hw_tr.index).loc[:, col]

In [16]:
# test
# Column 0 is workingday, followed by the hour indicator columns.
hw_te = pd.concat([test['workingday'], te_hour_series], axis = 1)

# Fit on a plain float array (no column labels) so the generated feature names are
# the positional 'x0', 'x1', ... that the selection below relies on.
m_poly2 = poly(degree=2, interaction_only = True)
test_x_poly = m_poly2.fit_transform(hw_te.to_numpy(dtype=float))

# Names of the expanded columns. get_feature_names() was removed in scikit-learn 1.2,
# so prefer get_feature_names_out() and fall back only on releases that lack it.
if hasattr(m_poly2, 'get_feature_names_out'):
    names = list(m_poly2.get_feature_names_out())
else:
    names = list(m_poly2.get_feature_names())

# 'x0' is workingday, so its interaction terms are exactly the names that start
# with 'x0 '. This leaves out the bare 'x0' (workingday) column itself.
col = [name for name in names if name.startswith('x0 ')]

# Reuse hw_te's index so the result lines up row for row with `test` when the
# frames are concatenated column-wise.
te_hw_poly = pd.DataFrame(test_x_poly, columns = names, index = hw_te.index).loc[:, col]

In [17]:
# 4. One-hot encode the year column
# Each frame gets its own 'year_<value>' indicator columns (train first, then test).
tr_years, te_years = (
    pd.get_dummies(frame['year'], prefix = 'year') for frame in (train, test)
)

In [18]:
# 5. Min-max scale the temp column
from sklearn.preprocessing import MinMaxScaler as minmax

# train: learn the min/max from the training temperatures only
tr_temps = np.array(train['temp']).reshape(-1,1)

mm = minmax()
train['temp'] = mm.fit_transform(tr_temps).ravel()

# test: reuse the scaler fitted on train so both frames share one scale.
# A scaler refitted on test would map the same temperature to a different value
# than it has in train. Test temperatures outside the training range therefore
# land outside [0, 1].
te_temps = np.array(test['temp']).reshape(-1,1)
test['temp'] = mm.transform(te_temps).ravel()

In [22]:
# 6. Assemble the frames that hold only the columns used for modelling
# Column order: base columns, hour indicators, workingday*hour interactions, year_2012.

# train (keeps the 'count' target)
train_use = pd.concat(
    [train[['temp', 'count', 'rainy_days']], tr_hour_series, tr_hw_poly, tr_years['year_2012']],
    axis=1,
)

# test (no target column)
test_use = pd.concat(
    [test[['temp', 'rainy_days']], te_hour_series, te_hw_poly, te_years['year_2012']],
    axis=1,
)

In [23]:
# 7. Baseline model (RandomForestRegressor)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,cross_val_score

# Every column except the target 'count' is used as a feature.
x_col = train_use.columns[train_use.columns != 'count']

# Hold out 30% of the labelled rows. Despite their names, test_x / test_y are this
# hold-out split of train_use, not rows of the separate `test` frame.
train_x, test_x, train_y, test_y = train_test_split(train_use[x_col], train_use['count'], test_size = 0.3, random_state = 1)

# Seed the forest as well as the split, so the fitted model and its score are
# reproducible from run to run.
rf = RandomForestRegressor(random_state = 1)
rf.fit(train_x, train_y)
rf_predict = pd.DataFrame(rf.predict(test_x))

In [24]:
# 8. Evaluate on the hold-out split
from sklearn.metrics import mean_squared_error, mean_squared_log_error

# RMSLE is the square root of the mean squared log error between the hold-out
# targets and the forest's predictions.
msle = mean_squared_log_error(y_true=test_y, y_pred=rf_predict)
RMSLE = np.sqrt(msle)
RMSLE

0.4641467120121117