In [1]:
import pandas as pd

import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

In [2]:
# Read the train and test tables from their CSV files in one pass.
train, test = (pd.read_csv(csv_path) for csv_path in ("train_new1.csv", "test_new1.csv"))

In [3]:
train.head()

Unnamed: 0,id,date,bus_route_id,in_out,station_code,station_name,latitude,longitude,6~7_ride,7~8_ride,...,dis_seongsan,dis_seoguipo,location_name,bus_route_type,holiday,num_business_2017,num_employee_2017,population_2018,am_ride,am_takeoff
0,0,2019-09-01,4270000,1,344,제주썬호텔,33.4899,126.49373,0.0,1.0,...,37.695909,27.841879,jeju,4,0,4968,25997,43217,16.0,0.0
1,1,2019-09-01,4270000,1,357,한라병원,33.48944,126.48508,1.0,4.0,...,38.448058,27.996179,jeju,4,0,4968,25997,43217,22.0,0.0
2,2,2019-09-01,4270000,1,432,정존마을,33.48181,126.47352,1.0,1.0,...,39.241905,27.496551,jeju,4,0,4718,22028,56223,4.0,0.0
3,3,2019-09-01,4270000,0,1579,제주국제공항(600번),33.50577,126.49252,0.0,17.0,...,38.36934,29.579404,jeju,4,0,1271,7273,15673,79.0,0.0
4,4,2019-09-01,4270000,0,1646,중문관광단지입구,33.25579,126.4126,0.0,0.0,...,45.881475,14.269792,seoguipo,4,0,448,3695,4414,0.0,1.0


In [4]:
train.shape, test.shape

((415423, 35), (228170, 34))

# 1. 날씨 추가하기  
- 지점 및 날짜별 기온, 강수량, 풍속, 습도를 추가함  
  - 지점은 제주도의 네 개 기상대로 제주, 서귀포, 고산, 성산에 위치  
- [날씨 자료 출처](https://data.kma.go.kr/cmmn/main.do)

#### 해당 기간의 날씨 자료를 다운받아 저장해둔 raining1.csv를 활용  

In [5]:
# Load the downloaded weather observations; engine="python" selects pandas'
# Python parsing engine instead of the default C parser.
weather = pd.read_csv("raining1.csv", engine="python")

In [6]:
weather.head()

Unnamed: 0,지점,일시,기온(°C),강수량(mm),풍속(m/s),습도(%)
0,184,2019-09-01 06:00,23.4,,1.2,73
1,184,2019-09-01 07:00,23.5,0.0,0.9,71
2,184,2019-09-01 08:00,24.0,0.0,0.7,69
3,184,2019-09-01 09:00,24.9,0.0,1.3,68
4,184,2019-09-01 10:00,26.2,,1.8,64


In [7]:
weather['지점'].unique()

array([184, 185, 188, 189], dtype=int64)

In [8]:
weather['지점'].value_counts()

189    1087
185    1087
188    1087
184    1087
Name: 지점, dtype: int64

#### 지점을 location_name으로 바꿔주기  
- 184:jeju, 185:gosan, 188:seongsan, 189:seoguipo  

In [9]:
# Turn the integer station codes into strings so they can be swapped for names afterwards.
weather['지점'] = weather['지점'].map(str)
weather.dtypes

지점          object
일시          object
기온(°C)     float64
강수량(mm)    float64
풍속(m/s)    float64
습도(%)        int64
dtype: object

In [10]:
# Station code -> location name; codes that are not listed are kept as they are.
station_names = {'184': 'jeju', '185': 'gosan', '188': 'seongsan', '189': 'seoguipo'}
weather['지점'] = [station_names.get(code, code) for code in weather['지점']]

weather.head()

Unnamed: 0,지점,일시,기온(°C),강수량(mm),풍속(m/s),습도(%)
0,jeju,2019-09-01 06:00,23.4,,1.2,73
1,jeju,2019-09-01 07:00,23.5,0.0,0.9,71
2,jeju,2019-09-01 08:00,24.0,0.0,0.7,69
3,jeju,2019-09-01 09:00,24.9,0.0,1.3,68
4,jeju,2019-09-01 10:00,26.2,,1.8,64


#### weather의 일시를 공백 기준으로 split해서 날짜(일시)와 시각(time)으로 나눠주고, 오전 6시부터 낮 12시 이전(6~11시) 시간대의 데이터만 추출하기

In [11]:
# Split each "YYYY-MM-DD HH:MM" string into its date part and its integer hour.
timestamp_parts = weather['일시'].str.split(" ")
weather['time'] = timestamp_parts.str[1].str.split(":").str[0].astype('int64')
weather['일시'] = timestamp_parts.str[0]
# Keep only the observations taken at hours 6 through 11 (inclusive).
morning_mask = weather['time'].between(6, 11)
weather2 = weather[morning_mask]

In [12]:
weather2.head()

Unnamed: 0,지점,일시,기온(°C),강수량(mm),풍속(m/s),습도(%),time
0,jeju,2019-09-01,23.4,,1.2,73,6
1,jeju,2019-09-01,23.5,0.0,0.9,71,7
2,jeju,2019-09-01,24.0,0.0,0.7,69,8
3,jeju,2019-09-01,24.9,0.0,1.3,68,9
4,jeju,2019-09-01,26.2,,1.8,64,10


#### 지점, 일시별 평균값 구하기

In [13]:
# Average every weather measure per (station, date) over the selected morning hours.
measure_cols = ['기온(°C)', '강수량(mm)', '풍속(m/s)', '습도(%)']
weather3 = (
    weather2
    .groupby(['지점', '일시'])[measure_cols]
    .mean()
    .reset_index()
)
weather3.head()

Unnamed: 0,지점,일시,기온(°C),강수량(mm),풍속(m/s),습도(%)
0,gosan,2019-09-01,23.916667,0.0,1.65,80.166667
1,gosan,2019-09-02,23.7,12.1,5.333333,96.666667
2,gosan,2019-09-03,23.55,1.9,3.233333,100.0
3,gosan,2019-09-04,23.533333,12.84,4.033333,99.666667
4,gosan,2019-09-05,26.033333,0.0,5.45,93.333333


#### NaN 확인

In [14]:
weather3.isnull().sum()

지점           0
일시           0
기온(°C)       0
강수량(mm)    118
풍속(m/s)      2
습도(%)        0
dtype: int64

- 기상청 홈페이지를 찾아보았지만 NaN에 무엇을 넣어주어야 할 지 찾지 못해 직접 할당해주기로 했다

#### 우선 null값이 2개뿐인 풍속의 missing value 채워주기

In [15]:
weather3.loc[weather3['풍속(m/s)'].isnull()]

Unnamed: 0,지점,일시,기온(°C),강수량(mm),풍속(m/s),습도(%)
161,seongsan,2019-09-24,19.9,,,73.0
162,seongsan,2019-09-25,21.4,,,70.833333


- **살펴보니 둘 다 seongsan에서 9월 24, 25일의 데이터이므로 seongsan의 9월 23, 26일 풍속 평균 값을 할당해주기로 한다**

In [16]:
# Rows surrounding the two missing wind-speed days: seongsan, 2019-09-23 .. 2019-09-26.
# Selecting by station and date (instead of the positional slice 160:164) keeps the
# window correct even if the row layout of weather3 changes.
gap_window = (weather3['지점'] == 'seongsan') & weather3['일시'].between('2019-09-23', '2019-09-26')
# drop() returns a new frame, so weather3 keeps its rainfall column. Rainfall is
# removed here presumably so that a later fillna on weather4 only affects wind speed.
weather4 = weather3.loc[gap_window].drop(columns='강수량(mm)')
weather4

Unnamed: 0,지점,일시,기온(°C),풍속(m/s),습도(%)
160,seongsan,2019-09-23,19.833333,2.15,84.166667
161,seongsan,2019-09-24,19.9,,73.0
162,seongsan,2019-09-25,21.4,,70.833333
163,seongsan,2019-09-26,24.266667,4.466667,78.666667


In [17]:
wind_mean = weather4['풍속(m/s)'].mean()
wind_mean

3.3083333333333327

In [18]:
weather4 = weather4.fillna(wind_mean)

In [19]:
weather4.isnull().sum()

지점         0
일시         0
기온(°C)     0
풍속(m/s)    0
습도(%)      0
dtype: int64

In [20]:
# Fill the wind-speed gaps of the full table with the computed mean, then re-count NaNs.
weather3 = weather3.fillna({'풍속(m/s)': wind_mean})
weather3.isnull().sum()

지점           0
일시           0
기온(°C)       0
강수량(mm)    118
풍속(m/s)      0
습도(%)        0
dtype: int64

- 풍속의 null값이 사라진 것을 확인할 수 있다

#### 강수량의 missing value 채워주기  
- **RandomForestClassifier를 이용하자** (강수량 값을 문자열 라벨로 바꿔 분류 문제로 다룬다)  

In [21]:
from sklearn.ensemble import RandomForestClassifier

def predict_rain(data, random_state=42):
    """Impute missing rainfall values with a random-forest classifier.

    Rows whose '강수량(mm)' is present train a classifier that predicts rainfall
    from temperature, wind speed and humidity. Each observed rainfall amount is
    used as its own class label (as a string), so every imputed value is an
    amount that already occurs in the data.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns '기온(°C)', '풍속(m/s)', '습도(%)' and
        '강수량(mm)'. The frame is not modified.
    random_state : int or None, default 42
        Seed passed to RandomForestClassifier so repeated runs produce the same
        imputations. Pass None for an unseeded forest.

    Returns
    -------
    pandas.DataFrame
        The rows with observed rainfall followed by the imputed rows, with a
        fresh 0..n-1 index and '강수량(mm)' as float.

    Raises
    ------
    ValueError
        If rainfall is missing in every row, so there is nothing to train on.
    """
    rain_col = '강수량(mm)'
    # Features used to predict rainfall.
    feature_cols = ['기온(°C)', '풍속(m/s)', '습도(%)']

    is_missing = data[rain_col].isnull()
    # Work on explicit copies so neither the caller's frame nor a hidden slice
    # of it is written to.
    observed = data.loc[~is_missing].copy()
    to_fill = data.loc[is_missing].copy()

    # Nothing to impute: skip the model entirely (predicting on zero rows fails).
    if to_fill.empty:
        observed[rain_col] = observed[rain_col].astype('float')
        return observed.reset_index(drop=True)

    if observed.empty:
        raise ValueError("cannot impute '강수량(mm)': every row is missing a rainfall value")

    rain_model = RandomForestClassifier(random_state=random_state)
    # Labels are the stringified rainfall amounts; they are parsed back to float below.
    rain_model.fit(observed[feature_cols], observed[rain_col].astype('str'))
    to_fill[rain_col] = rain_model.predict(to_fill[feature_cols]).astype('float')

    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
    filled = pd.concat([observed, to_fill])
    filled[rain_col] = filled[rain_col].astype('float')
    return filled.reset_index(drop=True)

In [22]:
weather3 = predict_rain(weather3)
weather3.head()

Unnamed: 0,지점,일시,기온(°C),강수량(mm),풍속(m/s),습도(%)
0,gosan,2019-09-01,23.916667,0.0,1.65,80.166667
1,gosan,2019-09-02,23.7,12.1,5.333333,96.666667
2,gosan,2019-09-03,23.55,1.9,3.233333,100.0
3,gosan,2019-09-04,23.533333,12.84,4.033333,99.666667
4,gosan,2019-09-05,26.033333,0.0,5.45,93.333333


In [23]:
weather3.isnull().sum()

지점         0
일시         0
기온(°C)     0
강수량(mm)    0
풍속(m/s)    0
습도(%)      0
dtype: int64

- missing value가 모두 채워진 것을 확인

In [24]:
# Korean weather column names -> English names (location_name and date are the merge keys).
english_names = {
    '지점': 'location_name',
    '일시': 'date',
    '기온(°C)': 'temperature',
    '강수량(mm)': 'rain',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
}
weather3 = weather3.rename(columns=english_names)
weather3.head()

Unnamed: 0,location_name,date,temperature,rain,windspeed,humidity
0,gosan,2019-09-01,23.916667,0.0,1.65,80.166667
1,gosan,2019-09-02,23.7,12.1,5.333333,96.666667
2,gosan,2019-09-03,23.55,1.9,3.233333,100.0
3,gosan,2019-09-04,23.533333,12.84,4.033333,99.666667
4,gosan,2019-09-05,26.033333,0.0,5.45,93.333333


#### weather3과 train,test merge해주기

In [31]:
# Attach the daily weather of each row's location; the left join keeps every train row.
merge_keys = ['location_name', 'date']
train = train.merge(weather3, how='left', on=merge_keys)
train.shape

(415423, 39)

In [32]:
train.head()

Unnamed: 0,id,date,bus_route_id,in_out,station_code,station_name,latitude,longitude,6~7_ride,7~8_ride,...,holiday,num_business_2017,num_employee_2017,population_2018,am_ride,am_takeoff,temperature,rain,windspeed,humidity
0,0,2019-09-01,4270000,1,344,제주썬호텔,33.4899,126.49373,0.0,1.0,...,0,4968,25997,43217,16.0,0.0,24.65,0.0,1.433333,67.5
1,1,2019-09-01,4270000,1,357,한라병원,33.48944,126.48508,1.0,4.0,...,0,4968,25997,43217,22.0,0.0,24.65,0.0,1.433333,67.5
2,2,2019-09-01,4270000,1,432,정존마을,33.48181,126.47352,1.0,1.0,...,0,4718,22028,56223,4.0,0.0,24.65,0.0,1.433333,67.5
3,3,2019-09-01,4270000,0,1579,제주국제공항(600번),33.50577,126.49252,0.0,17.0,...,0,1271,7273,15673,79.0,0.0,24.65,0.0,1.433333,67.5
4,4,2019-09-01,4270000,0,1646,중문관광단지입구,33.25579,126.4126,0.0,0.0,...,0,448,3695,4414,0.0,1.0,23.783333,0.0,1.333333,86.0


In [33]:
train['rain'].value_counts()

0.000000     217900
0.250000      17659
0.100000      17447
0.133333      13037
2.900000      12395
0.200000      10424
1.250000      10342
3.100000       9373
1.116667       9090
4.740000       8983
6.980000       8975
1.080000       7021
1.750000       7007
1.380000       6846
0.200000       6804
0.040000       6450
0.050000       5096
7.766667       3980
1.725000       3431
2.800000       3348
4.420000       3177
9.816667       2440
1.075000       2437
0.916667       2379
12.100000      2215
0.200000       2093
0.600000       1425
1.900000       1411
0.566667       1393
4.750000       1357
4.600000       1353
3.000000       1301
2.000000       1293
12.840000      1286
5.100000       1200
1.440000       1112
0.300000       1070
0.720000        551
5.350000        322
Name: rain, dtype: int64

In [34]:
# Attach the daily weather of each row's location; the left join keeps every test row.
merge_keys = ['location_name', 'date']
test = test.merge(weather3, how='left', on=merge_keys)
test.shape

(228170, 38)

In [35]:
# Write both weather-enriched tables to CSV without the index column.
for frame, out_path in ((train, "train_new2.csv"), (test, "test_new2.csv")):
    frame.to_csv(out_path, index=False)