In [5]:
import numpy as np
import pandas as pd

data_path = './bike-sharing-demand/'

train = pd.read_csv(data_path + 'train.csv') # 훈련 데이터
test = pd.read_csv(data_path + 'test.csv') # 테스트 데이터
submission = pd.read_csv(data_path + 'sampleSubmission.csv') # 제출 샘플 데이터

## 이상치 제거

In [6]:
train = train[train['weather'] != 4] # 폭우, 폭설 데이터는 이상치 데이터 였다.

## 데이터 합치기

In [7]:
all_data = pd.concat([train, test] , ignore_index= True) # 인덱스 무시하고 이어 붙이기
all_data

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3.0,13.0,16.0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8.0,32.0,40.0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5.0,27.0,32.0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3.0,10.0,13.0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
17373,2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
17374,2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
17375,2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,,,
17376,2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,,,


## 파생 피처(변수) 추가

In [9]:
from datetime import datetime

# 날짜 피처 생성

all_data['date'] = all_data['datetime'].apply(lambda x : x.split()[0])

all_data['year'] = all_data['datetime'].apply(lambda x : x.split()[0].split('-')[0])

all_data['month'] = all_data['datetime'].apply(lambda x : x.split()[0].split('-')[1])

all_data['hour'] = all_data['datetime'].apply(lambda x : x.split()[1].split(':')[0])

all_data['minute'] = all_data['datetime'].apply(lambda x : x.split()[1].split(':')[1])

# 요일 피처 생성

all_data['weekday'] = all_data['date'].apply(lambda dateString : datetime.strptime(dateString, '%Y-%m-%d').weekday())

# 훈련 데이터는 매달 1~19일 , 테스트 데이터는 20일~ 말일 ==> 대여 수량을 예측할 때 day 피처는 사용할 필요가 없다.

In [10]:
all_data['datetime'] = pd.to_datetime(all_data['datetime'])

all_data['year'] = all_data['datetime'].dt.year # 연도

all_data['month'] = all_data['datetime'].dt.month # 월

all_data['hour'] = all_data['datetime'].dt.hour # 시간

all_data['weekday'] = all_data['datetime'].dt.weekday # 요일

all_data

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,date,year,month,hour,minute,weekday
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3.0,13.0,16.0,2011-01-01,2011,1,0,00,5
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8.0,32.0,40.0,2011-01-01,2011,1,1,00,5
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5.0,27.0,32.0,2011-01-01,2011,1,2,00,5
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3.0,10.0,13.0,2011-01-01,2011,1,3,00,5
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0.0,1.0,1.0,2011-01-01,2011,1,4,00,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17373,2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,,2012-12-31,2012,12,19,00,0
17374,2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,,2012-12-31,2012,12,20,00,0
17375,2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,,,,2012-12-31,2012,12,21,00,0
17376,2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,,,,2012-12-31,2012,12,22,00,0


## 필요없는 피처 제거

In [11]:
drop_features = ['casual' , 'registered' , 'datetime' , 'date' , 'month' , 'windspeed']

all_data = all_data.drop(drop_features , axis = 1)

all_data

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,count,year,hour,minute,weekday
0,1,0,0,1,9.84,14.395,81,16.0,2011,0,00,5
1,1,0,0,1,9.02,13.635,80,40.0,2011,1,00,5
2,1,0,0,1,9.02,13.635,80,32.0,2011,2,00,5
3,1,0,0,1,9.84,14.395,75,13.0,2011,3,00,5
4,1,0,0,1,9.84,14.395,75,1.0,2011,4,00,5
...,...,...,...,...,...,...,...,...,...,...,...,...
17373,1,0,1,2,10.66,12.880,60,,2012,19,00,0
17374,1,0,1,2,10.66,12.880,60,,2012,20,00,0
17375,1,0,1,1,10.66,12.880,60,,2012,21,00,0
17376,1,0,1,1,10.66,13.635,56,,2012,22,00,0
