# 공유 자전거 수요 예측 (11.24 정호섭)

### 목표
- `전처리 최적화`
- 하나쌤 완벽하게 이해시키기
- 기본 베이스라인 만들기
- 지금까지 전처리한 변수를 통해 비교해보기

In [1]:
# Core data libraries
import numpy as np
import pandas as pd

# Visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Suppress every warning for the rest of the session. Note that this also
# hides pandas deprecation / FutureWarning messages.
warnings.filterwarnings('ignore')
# Use seaborn's 'darkgrid' style for plots.
sns.set_style('darkgrid')

In [2]:
# Load the training data from data/bike_train.csv
train_org = pd.read_csv('data/bike_train.csv')
# Work on a copy; train_org keeps the data exactly as loaded.
train = train_org.copy()
print(train.shape)
train_org.head()

(10886, 12)


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [3]:
# Load the test data from data/bike_test.csv
test_org = pd.read_csv('data/bike_test.csv')
# Work on a copy; test_org keeps the data exactly as loaded.
test = test_org.copy()
print(test.shape)
test_org.head()

(6493, 9)


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


In [4]:
from sklearn.preprocessing import MinMaxScaler

# 전처리 함수
def preprocess(df: pd.DataFrame, is_it_train: bool) -> pd.DataFrame:
    """Apply the shared preprocessing steps to a bike-sharing DataFrame.

    The same steps are applied to train and test data:

    1. Parse ``datetime`` and derive ``year``, ``month``, ``day``, ``hour``
       and ``dayofweek`` columns.
    2. Merge weather category 4 into category 3.
    3. Add ``temp_scaled``, a min-max scaled copy of ``temp``.
    4. Treat a windspeed of 0 as missing and fill it with the mean windspeed
       of the rows sharing the same ``(weather, season)`` pair.
    5. For training data only, add ``log_count = log1p(count)``.

    Args:
        df: Frame with at least the columns ``datetime``, ``weather``,
            ``season``, ``temp`` and ``windspeed`` (plus ``count`` when
            ``is_it_train`` is true). It is modified in place.
        is_it_train: When true, the log-transformed target column
            ``log_count`` is added.

    Returns:
        The same ``df`` object, with the new and updated columns.
    """
    # Date parts
    df['datetime'] = pd.to_datetime(df['datetime'])
    stamp = df['datetime'].dt
    df['year'] = stamp.year
    df['month'] = stamp.month
    df['day'] = stamp.day
    df['hour'] = stamp.hour
    df['dayofweek'] = stamp.dayofweek

    # Merge weather category 4 into 3.
    # Assign the result back instead of calling replace(..., inplace=True) on
    # df['weather']: the chained in-place form operates on a temporary Series
    # and leaves df unchanged when pandas Copy-on-Write is active.
    df['weather'] = df['weather'].replace({4: 3})

    # Min-max scale temp.
    # NOTE(review): the scaler is fitted on whichever frame is passed in, so
    # train and test are each scaled with their own min/max. Consider fitting
    # once on the training data and reusing that scaler for the test data.
    scaler = MinMaxScaler()
    df['temp_scaled'] = scaler.fit_transform(df[['temp']])

    # Windspeed: 0 is treated as a missing reading. The group means are
    # computed after the zeros became NaN, so they exclude the zero readings.
    df['windspeed'] = df['windspeed'].replace({0: np.nan})
    group_mean = df.groupby(['weather', 'season'])['windspeed'].transform('mean')
    df['windspeed'] = df['windspeed'].fillna(group_mean)

    # Training data only: log-transform the target.
    if is_it_train:
        df['log_count'] = np.log1p(df['count'])

    return df

<b>이해를 돕기 위한 예시</b> 

preprocess 함수를 하나의 `틀`이라고 생각하시면 편합니다. train이든 test든 이 `틀`을 거치면 동일한 전처리가 적용된 데이터프레임이 만들어집니다.

In [5]:
# Preprocess the training set with the function defined above.
# For training data pass is_it_train=True so that count is also log-transformed (log_count).
train = preprocess(train, is_it_train = True)
train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,hour,dayofweek,temp_scaled,log_count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,17.191688,3,13,16,2011,1,1,0,5,0.22449,2.833213
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,17.191688,8,32,40,2011,1,1,1,5,0.204082,3.713572
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,17.191688,5,27,32,2011,1,1,2,5,0.204082,3.496508
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,17.191688,3,10,13,2011,1,1,3,5,0.22449,2.639057
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,17.191688,0,1,1,2011,1,1,4,5,0.22449,0.693147


In [6]:
# Preprocess the test set with the same function.
# The test set has no count column, so pass is_it_train=False.
test = preprocess(test, is_it_train=False)
test.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,hour,dayofweek,temp_scaled
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,20,0,3,0.25
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,16.320835,2011,1,20,1,3,0.25
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,16.320835,2011,1,20,2,3,0.25
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,3,0.25
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,4,3,0.25


In [7]:
# 파생변수 및 더미 변수를 생성하는 함수
def encorder(df: pd.DataFrame) -> pd.DataFrame:
    """Add derived 0/1 flag features and hour dummy variables.

    Columns added (all integer 0/1):

    * ``rainyday`` - weather category is 3 (category 4 is accepted as well,
      in case it has not been merged into 3 beforehand).
    * ``ideal``    - ``15 <= temp <= 22`` and ``windspeed < 30``.
    * ``sticky``   - ``temp >= 30`` and ``humidity >= 60``.
    * ``peak``     - on working days: hour 8, 12-13 or 17-18;
      on non-working days: hour 10-19.
    * ``h_<hour>`` - one dummy column per hour value present in ``df``.
    * ``hw_<hour>`` - ``h_<hour>`` multiplied by ``workingday``.

    Args:
        df: Frame with the columns ``weather``, ``temp``, ``windspeed``,
            ``humidity``, ``hour`` and ``workingday``. The four flag columns
            are added to this frame in place.

    Returns:
        A new DataFrame: ``df`` (including the flags) followed by the
        ``h_*`` columns and then the ``hw_*`` columns.
    """
    # Rainy day. Computed directly from the category instead of assigning a
    # whole get_dummies() frame to one column, which fails as soon as the
    # set of weather categories present in df changes.
    df['rainyday'] = df['weather'].isin([3, 4]).astype(int)

    # Good day for riding (Series.between is inclusive on both ends).
    df['ideal'] = (df['temp'].between(15, 22) & (df['windspeed'] < 30)).astype(int)

    # Unpleasant day for riding: hot and humid.
    df['sticky'] = ((df['temp'] >= 30) & (df['humidity'] >= 60)).astype(int)

    # Busy hours.
    hour = df['hour']
    workday_hours = (hour == 8) | hour.between(17, 18) | hour.between(12, 13)
    workday_peak = (df['workingday'] == 1) & workday_hours
    day_off_peak = (df['workingday'] == 0) & hour.between(10, 19)
    df['peak'] = (workday_peak | day_off_peak).astype(int)

    # Hour dummies; dtype=int keeps them 0/1 integers on every pandas version.
    hour_flags = pd.get_dummies(hour, dtype=int)
    dummies = hour_flags.add_prefix('h_')

    # Hour x workingday interactions. The suffix is the hour value itself, not
    # the column position, so hw_<hour> always pairs with h_<hour> even when
    # some hours are missing from df.
    interactions = hour_flags.mul(df['workingday'], axis=0).add_prefix('hw_')

    return pd.concat([df, dummies, interactions], axis=1)

[enumerate에 대한 설명](https://wikidocs.net/16045)

In [8]:
# Apply the encorder function to both datasets
train = encorder(train)
test = encorder(test)

In [9]:
# Write the processed frames to CSV files (the index is not written).
# NOTE(review): the two file names differ in spacing ('train(' vs 'test (') -
# confirm that downstream readers use these exact names.
train.to_csv('data/train(전처리 완료).csv', index = False)
test.to_csv('data/test (전처리 완료).csv', index = False)