In [1]:
import pandas as pd
import numpy as np
import geopy.distance
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
train.shape, test.shape

((415423, 21), (228170, 20))

In [4]:
train.dtypes

id                 int64
date              object
bus_route_id       int64
in_out            object
station_code       int64
station_name      object
latitude         float64
longitude        float64
6~7_ride         float64
7~8_ride         float64
8~9_ride         float64
9~10_ride        float64
10~11_ride       float64
11~12_ride       float64
6~7_takeoff      float64
7~8_takeoff      float64
8~9_takeoff      float64
9~10_takeoff     float64
10~11_takeoff    float64
11~12_takeoff    float64
18~20_ride       float64
dtype: object

In [5]:
test.dtypes

id                 int64
date              object
bus_route_id       int64
in_out            object
station_code       int64
station_name      object
latitude         float64
longitude        float64
6~7_ride         float64
7~8_ride         float64
8~9_ride         float64
9~10_ride        float64
10~11_ride       float64
11~12_ride       float64
6~7_takeoff      float64
7~8_takeoff      float64
8~9_takeoff      float64
9~10_takeoff     float64
10~11_takeoff    float64
11~12_takeoff    float64
dtype: object

# 1. date  
- 날짜  
- object type인 date를 datetime으로 변환한 후  
  - dt.weekday를 이용해 weekday를 추출하여 새로운 컬럼으로 추가  
  - dt.day를 이용해 day를 추출하여 새로운 컬럼으로 추가

In [6]:
train['date'] = pd.to_datetime(train['date'])
train['weekday'] = train['date'].dt.weekday
train['day'] = train['date'].dt.day

test에도 똑같이 적용

In [7]:
test['date'] = pd.to_datetime(train['date'])
test['weekday'] = test['date'].dt.weekday
test['day'] = test['date'].dt.day

확인

In [8]:
train.shape, test.shape

((415423, 23), (228170, 22))

# 2. in_out  
- 시내버스/시외버스 구분  
- value_counts()에서 더 많은 쪽을 0, 적은 쪽을 1로 변경 

In [9]:
train['in_out'].value_counts()

시내    408500
시외      6923
Name: in_out, dtype: int64

In [10]:
train['in_out'] = train['in_out'].map({'시내':0, '시외':1})

In [11]:
test['in_out'] = test['in_out'].map({'시내':0, '시외':1})

In [12]:
train.shape, test.shape

((415423, 23), (228170, 22))

# 3. 승차, 하차 인원   
- 6:00:00-11:59:59까지 승/하차 인원  
- 더욱 강한 상관관계를 위해 시간대별 승하차 인원을 각각 오전시간 승차인원, 하차인원으로 합해준다   

In [13]:
train['a68_ride']=train['6~7_ride']+train['7~8_ride']
train['a810_ride']=train['8~9_ride']+train['9~10_ride']
train['a1012_ride']=train['10~11_ride']+train['11~12_ride']

train['a68_takeoff']=train['6~7_takeoff']+train['7~8_takeoff']
train['a810_takeoff']=train['8~9_takeoff']+train['9~10_takeoff']
train['a1012_takeoff']=train['10~11_takeoff']+train['11~12_takeoff']

In [14]:
test['a68_ride']=test['6~7_ride']+test['7~8_ride']
test['a810_ride']=test['8~9_ride']+test['9~10_ride']
test['a1012_ride']=test['10~11_ride']+test['11~12_ride']

test['a68_takeoff']=test['6~7_takeoff']+test['7~8_takeoff']
test['a810_takeoff']=test['8~9_takeoff']+test['9~10_takeoff']
test['a1012_takeoff']=test['10~11_takeoff']+test['11~12_takeoff']

확인

In [15]:
train.shape, test.shape

((415423, 29), (228170, 28))

# 4. 좌표 데이터 활용하기  
1. 네 군데 기상대와의 거리와 가장 가까운 기상대 이름을 feature로 추가해주기   

  - 제주 측정소의 위.경도 : 33.51411, 126.52969  
  - 고산 측정소의 위.경도 : 33.29382, 126.16283  
  - 성산 측정소의 위.경도 : 33.38677, 126.880  
  - 서귀포 측정소의 위.경도 : 33.24616, 126.5653  
  
2. reverse_geocode를 이용해 주소 구하고 feature로 추가해주기

## 4.1 네 군데 기상대와의 거리와 가장 가까운 기상대 이름을 feature로 추가해주기  
- 해당 버스 정류장으로부터 네 기상대와의 거리는 각각 dis_jeju, dis_gosan, dis_seongsan, dis_seoguipo, dis_seoguipo  
- 해당 버스 정류장으로부터 가장 가까운 기상대 이름은 location_name으로 추가  

In [16]:
jeju = (33.51411, 126.52969)
gosan = (33.29382, 126.16283)
seongsan = (33.38677, 126.880)
seoguipo = (33.24616, 126.5653)

#### 중복 제거한 데이터 만들기  
- 정류장별 데이터를 구하면 되기 때문에 원활한 작업을 위해 중복을 제거  
- station_name, latitude, longitude 기준으로 중복 제거

In [17]:
data2 = train[['station_code','latitude', 'longitude']].drop_duplicates(keep='first')
data2.head()

Unnamed: 0,station_code,latitude,longitude
0,344,33.4899,126.49373
1,357,33.48944,126.48508
2,432,33.48181,126.47352
3,1579,33.50577,126.49252
4,1646,33.25579,126.4126


#### 각 정류장과 제주도에 존재하는 4군데의 기상 측정소와의 거리를 계산하여 t1~t4에 할당

In [18]:
t1 = [geopy.distance.vincenty((i,j), jeju).km for i,j in list(zip(data2['latitude'],data2['longitude']))]
t2 = [geopy.distance.vincenty((i,j), gosan).km for i,j in list(zip(data2['latitude'],data2['longitude']))]
t3 = [geopy.distance.vincenty((i,j), seongsan).km for i,j in list(zip(data2['latitude'],data2['longitude']))]
t4 = [geopy.distance.vincenty((i,j), seoguipo).km for i,j in list(zip(data2['latitude'],data2['longitude']))]

data2['dis_jeju']=t1
data2['dis_gosan']=t2
data2['dis_seongsan']=t3
data2['dis_seoguipo']=t4

#### 각 정류소에서 가장 가까운 곳에 있는 측정소를 location_name으로 추가해주기

In [19]:
total=pd.DataFrame(list(zip( t1,t2,t3,t4)),columns=['dis_jeju','dis_gosan','dis_seongsan','dis_seoguipo'])
total['location_name'] = total.apply(lambda x: x.argmin(), axis=1)
total['location_name'] = total['location_name'].apply(lambda x: x[4:])

total.head()

Unnamed: 0,dis_jeju,dis_gosan,dis_seongsan,dis_seoguipo,location_name
0,4.286659,37.692766,37.695909,27.841879,jeju
1,4.966848,37.008492,38.448058,27.996179,jeju
2,6.330746,35.642126,39.241905,27.496551,jeju
3,3.575272,38.643401,38.36934,29.579404,jeju
4,30.652098,23.648361,45.881475,14.269792,seoguipo


In [20]:
data2 = pd.merge(data2, total, on=['dis_jeju', 'dis_gosan', 'dis_seongsan', 'dis_seoguipo'])
data2.shape

(3563, 8)

#### train에 data2를 merge

In [21]:
train = pd.merge(train, data2, how='left', on=['station_code', 'latitude', 'longitude'])
train.head()

Unnamed: 0,id,date,bus_route_id,in_out,station_code,station_name,latitude,longitude,6~7_ride,7~8_ride,...,a810_ride,a1012_ride,a68_takeoff,a810_takeoff,a1012_takeoff,dis_jeju,dis_gosan,dis_seongsan,dis_seoguipo,location_name
0,0,2019-09-01,4270000,1,344,제주썬호텔,33.4899,126.49373,0.0,1.0,...,7.0,8.0,0.0,0.0,0.0,4.286659,37.692766,37.695909,27.841879,jeju
1,1,2019-09-01,4270000,1,357,한라병원,33.48944,126.48508,1.0,4.0,...,6.0,11.0,0.0,0.0,0.0,4.966848,37.008492,38.448058,27.996179,jeju
2,2,2019-09-01,4270000,1,432,정존마을,33.48181,126.47352,1.0,1.0,...,2.0,0.0,0.0,0.0,0.0,6.330746,35.642126,39.241905,27.496551,jeju
3,3,2019-09-01,4270000,0,1579,제주국제공항(600번),33.50577,126.49252,0.0,17.0,...,32.0,30.0,0.0,0.0,0.0,3.575272,38.643401,38.36934,29.579404,jeju
4,4,2019-09-01,4270000,0,1646,중문관광단지입구,33.25579,126.4126,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,30.652098,23.648361,45.881475,14.269792,seoguipo


In [22]:
train.shape

(415423, 34)

#### test에도 똑같이 적용

In [23]:
data3 = test[['station_code','latitude', 'longitude']].drop_duplicates(keep='first')
data3.head()

Unnamed: 0,station_code,latitude,longitude
0,344,33.4899,126.49373
1,357,33.48944,126.48508
2,432,33.48181,126.47352
3,1579,33.50577,126.49252
4,1636,33.24872,126.41032


In [24]:
t1 = [geopy.distance.vincenty((i,j), jeju).km for i,j in list(zip(data3['latitude'],data3['longitude']))]
t2 = [geopy.distance.vincenty((i,j), gosan).km for i,j in list(zip(data3['latitude'],data3['longitude']))]
t3 = [geopy.distance.vincenty((i,j), seongsan).km for i,j in list(zip(data3['latitude'],data3['longitude']))]
t4 = [geopy.distance.vincenty((i,j), seoguipo).km for i,j in list(zip(data3['latitude'],data3['longitude']))]

data3['dis_jeju']=t1
data3['dis_gosan']=t2
data3['dis_seongsan']=t3
data3['dis_seoguipo']=t4

In [25]:
total=pd.DataFrame(list(zip( t1,t2,t3,t4)),columns=['dis_jeju','dis_gosan','dis_seongsan','dis_seoguipo'])
total['location_name'] = total.apply(lambda x: x.argmin(), axis=1)
total['location_name'] = total['location_name'].apply(lambda x: x[4:])
total.shape

(3505, 5)

In [26]:
data3 = pd.merge(data3, total, how='left', on=['dis_jeju', 'dis_gosan', 'dis_seongsan', 'dis_seoguipo'])
data3.shape

(3507, 8)

In [27]:
# 이유는 모르겠으나 중복이 있어서 제거
data3 = data3[['station_code', 'latitude', 'longitude', 'dis_jeju', 'dis_gosan',
       'dis_seongsan', 'dis_seoguipo', 'location_name']].drop_duplicates(keep='first')
data3.shape

(3505, 8)

In [28]:
test = pd.merge(test, data3, how='left', on=['station_code','latitude','longitude'])
test.shape

(228170, 33)

## 4.2.  reverse_geocode를 이용해 주소 구하고 feature로 추가해주기
- googlemaps의 `reverse_geocode` 를 활용  

In [29]:
import googlemaps

gmaps_key = 'API Key'
gmaps = googlemaps.Client(key=gmaps_key)

In [30]:
df1 = train[["station_code", "latitude", "longitude"]]
df2 = test[["station_code", "latitude", "longitude"]]

df = pd.concat([df1, df2])
df = df[['station_code', 'latitude', 'longitude']].drop_duplicates(keep='first')
df.shape

(3601, 3)

#### 전체 주소가 들은 station_address list 생성

In [31]:
station_address = []
for lat, lng in zip(df['latitude'], df['longitude']):
    address = gmaps.reverse_geocode((lat,lng), language='ko')[0]['formatted_address']
    station_address.append(address)

In [32]:
df['address'] = station_address
df.head()

Unnamed: 0,station_code,latitude,longitude,address
0,344,33.4899,126.49373,대한민국 제주시 연동 제주 썬호텔
1,357,33.48944,126.48508,대한민국 제주특별자치도 제주시 연동 도령로
2,432,33.48181,126.47352,대한민국 제주시 노형동 정존마을
3,1579,33.50577,126.49252,대한민국 제주특별자치도 제주시 용담2동 1984-5
4,1646,33.25579,126.4126,대한민국 서귀포시 색달동 중문관광단지입구


#### test, train에 merge

In [33]:
train = pd.merge(train, df, how='left', on=['station_code', 'latitude', 'longitude'])
train.shape

(415423, 35)

In [34]:
test = pd.merge(test, df, how='left', on=['station_code', 'latitude', 'longitude'])
test.shape

(228170, 34)

# 5. bus_route_id  
- 주어진 bus_route_id는 실제 제주 버스 노선에 존재하진 않았으나 Padding된 숫자인 것 같음  
- 제주도 버스 노선 정보에 대해 검색해본 결과 제주도 버스 노선은 앞자리로 분류 가능하다는 사실 확인  
  - [출처](https://www.wooleelife.com/88)  

$\Rightarrow$ bus_route_id의 앞자리만 따로 추출하여 bus_route_type이라는 feature로 추가  
- 우선 bus_route_id의 data type인 int에서는 indexing이 되지 않으므로 str으로 변경부터 해줌   

In [35]:
train['bus_route_str'] = train['bus_route_id'].apply(str)
train['bus_route_type'] = [train['bus_route_str'].iloc[i][0] for i in range(len(train))]
train.head()

Unnamed: 0,id,date,bus_route_id,in_out,station_code,station_name,latitude,longitude,6~7_ride,7~8_ride,...,a810_takeoff,a1012_takeoff,dis_jeju,dis_gosan,dis_seongsan,dis_seoguipo,location_name,address,bus_route_str,bus_route_type
0,0,2019-09-01,4270000,1,344,제주썬호텔,33.4899,126.49373,0.0,1.0,...,0.0,0.0,4.286659,37.692766,37.695909,27.841879,jeju,대한민국 제주시 연동 제주 썬호텔,4270000,4
1,1,2019-09-01,4270000,1,357,한라병원,33.48944,126.48508,1.0,4.0,...,0.0,0.0,4.966848,37.008492,38.448058,27.996179,jeju,대한민국 제주특별자치도 제주시 연동 도령로,4270000,4
2,2,2019-09-01,4270000,1,432,정존마을,33.48181,126.47352,1.0,1.0,...,0.0,0.0,6.330746,35.642126,39.241905,27.496551,jeju,대한민국 제주시 노형동 정존마을,4270000,4
3,3,2019-09-01,4270000,0,1579,제주국제공항(600번),33.50577,126.49252,0.0,17.0,...,0.0,0.0,3.575272,38.643401,38.36934,29.579404,jeju,대한민국 제주특별자치도 제주시 용담2동 1984-5,4270000,4
4,4,2019-09-01,4270000,0,1646,중문관광단지입구,33.25579,126.4126,0.0,0.0,...,1.0,0.0,30.652098,23.648361,45.881475,14.269792,seoguipo,대한민국 서귀포시 색달동 중문관광단지입구,4270000,4


In [36]:
test['bus_route_str'] = test['bus_route_id'].apply(str)
test['bus_route_type'] = [test['bus_route_str'].iloc[i][0] for i in range(len(test))]
test.head()

Unnamed: 0,id,date,bus_route_id,in_out,station_code,station_name,latitude,longitude,6~7_ride,7~8_ride,...,a810_takeoff,a1012_takeoff,dis_jeju,dis_gosan,dis_seongsan,dis_seoguipo,location_name,address,bus_route_str,bus_route_type
0,415423,2019-09-01,4270000,1,344,제주썬호텔,33.4899,126.49373,4.0,4.0,...,0.0,1.0,4.286659,37.692766,37.695909,27.841879,jeju,대한민국 제주시 연동 제주 썬호텔,4270000,4
1,415424,2019-09-01,4270000,1,357,한라병원,33.48944,126.48508,1.0,6.0,...,0.0,0.0,4.966848,37.008492,38.448058,27.996179,jeju,대한민국 제주특별자치도 제주시 연동 도령로,4270000,4
2,415425,2019-09-01,4270000,1,432,정존마을,33.48181,126.47352,2.0,4.0,...,0.0,0.0,6.330746,35.642126,39.241905,27.496551,jeju,대한민국 제주시 노형동 정존마을,4270000,4
3,415426,2019-09-01,4270000,0,1579,제주국제공항(600번),33.50577,126.49252,1.0,11.0,...,0.0,0.0,3.575272,38.643401,38.36934,29.579404,jeju,대한민국 제주특별자치도 제주시 용담2동 1984-5,4270000,4
4,415427,2019-09-01,4270000,0,1636,롯데호텔,33.24872,126.41032,0.0,0.0,...,1.0,0.0,31.460686,23.59401,46.337724,14.445605,seoguipo,대한민국 제주특별자치도 서귀포시 색달동 2812-4,4270000,4


In [37]:
del train['bus_route_str']
del test['bus_route_str']

In [38]:
train.shape, test.shape

((415423, 36), (228170, 35))

# 6. holiday 추가  
- 9월= 주말+추석(12,13), 10월= 주말+(3,9)  
- 공휴일이면 0, 아니면 1

In [39]:
date_index = pd.date_range(start='20190901', end='20191016')
date_index

DatetimeIndex(['2019-09-01', '2019-09-02', '2019-09-03', '2019-09-04',
               '2019-09-05', '2019-09-06', '2019-09-07', '2019-09-08',
               '2019-09-09', '2019-09-10', '2019-09-11', '2019-09-12',
               '2019-09-13', '2019-09-14', '2019-09-15', '2019-09-16',
               '2019-09-17', '2019-09-18', '2019-09-19', '2019-09-20',
               '2019-09-21', '2019-09-22', '2019-09-23', '2019-09-24',
               '2019-09-25', '2019-09-26', '2019-09-27', '2019-09-28',
               '2019-09-29', '2019-09-30', '2019-10-01', '2019-10-02',
               '2019-10-03', '2019-10-04', '2019-10-05', '2019-10-06',
               '2019-10-07', '2019-10-08', '2019-10-09', '2019-10-10',
               '2019-10-11', '2019-10-12', '2019-10-13', '2019-10-14',
               '2019-10-15', '2019-10-16'],
              dtype='datetime64[ns]', freq='D')

In [40]:
date_df = pd.DataFrame(columns={'date'})
date_df['date'] = date_index
date_df.head()

Unnamed: 0,date
0,2019-09-01
1,2019-09-02
2,2019-09-03
3,2019-09-04
4,2019-09-05


In [41]:
holiday = ['2019-09-01', '2019-09-07', '2019-09-08', '2019-09-12', '2019-09-13', '2019-09-14', '2019-09-15', '2019-09-21', '2019-09-22', '2019-09-28', '2019-09-29',
          '2019-10-03', '2019-10-05', '2019-10-06', '2019-10-09','2019-10-12', '2019-10-13', '2019-10-19', '2019-10-20', '2019-10-26', '2019-10-27']

In [42]:
len(holiday)

21

In [43]:
holiday_df = pd.DataFrame()

In [44]:
holiday_df['date'] = holiday
holiday_df['date'] = pd.to_datetime(holiday_df['date'])
holiday_df.head()

Unnamed: 0,date
0,2019-09-01
1,2019-09-07
2,2019-09-08
3,2019-09-12
4,2019-09-13


In [45]:
zero = len(holiday)
holiday_df['holiday'] = np.zeros(zero)
holiday_df['holiday'] = holiday_df['holiday'].apply(int)
holiday_df.head()

Unnamed: 0,date,holiday
0,2019-09-01,0
1,2019-09-07,0
2,2019-09-08,0
3,2019-09-12,0
4,2019-09-13,0


In [46]:
holiday_df.shape

(21, 2)

In [47]:
holidays_df = pd.merge(date_df, holiday_df, how='left', on=['date'])
holidays_df = holidays_df.fillna(1)
holidays_df['holiday'] = holidays_df['holiday'].apply(int)
holidays_df.head()

Unnamed: 0,date,holiday
0,2019-09-01,0
1,2019-09-02,1
2,2019-09-03,1
3,2019-09-04,1
4,2019-09-05,1


#### train, test에 merge

In [48]:
train = pd.merge(train, holidays_df, on=['date'])
train.head()

Unnamed: 0,id,date,bus_route_id,in_out,station_code,station_name,latitude,longitude,6~7_ride,7~8_ride,...,a810_takeoff,a1012_takeoff,dis_jeju,dis_gosan,dis_seongsan,dis_seoguipo,location_name,address,bus_route_type,holiday
0,0,2019-09-01,4270000,1,344,제주썬호텔,33.4899,126.49373,0.0,1.0,...,0.0,0.0,4.286659,37.692766,37.695909,27.841879,jeju,대한민국 제주시 연동 제주 썬호텔,4,0
1,1,2019-09-01,4270000,1,357,한라병원,33.48944,126.48508,1.0,4.0,...,0.0,0.0,4.966848,37.008492,38.448058,27.996179,jeju,대한민국 제주특별자치도 제주시 연동 도령로,4,0
2,2,2019-09-01,4270000,1,432,정존마을,33.48181,126.47352,1.0,1.0,...,0.0,0.0,6.330746,35.642126,39.241905,27.496551,jeju,대한민국 제주시 노형동 정존마을,4,0
3,3,2019-09-01,4270000,0,1579,제주국제공항(600번),33.50577,126.49252,0.0,17.0,...,0.0,0.0,3.575272,38.643401,38.36934,29.579404,jeju,대한민국 제주특별자치도 제주시 용담2동 1984-5,4,0
4,4,2019-09-01,4270000,0,1646,중문관광단지입구,33.25579,126.4126,0.0,0.0,...,1.0,0.0,30.652098,23.648361,45.881475,14.269792,seoguipo,대한민국 서귀포시 색달동 중문관광단지입구,4,0


In [49]:
test = pd.merge(test, holidays_df, on=['date'])
test.head()

Unnamed: 0,id,date,bus_route_id,in_out,station_code,station_name,latitude,longitude,6~7_ride,7~8_ride,...,a810_takeoff,a1012_takeoff,dis_jeju,dis_gosan,dis_seongsan,dis_seoguipo,location_name,address,bus_route_type,holiday
0,415423,2019-09-01,4270000,1,344,제주썬호텔,33.4899,126.49373,4.0,4.0,...,0.0,1.0,4.286659,37.692766,37.695909,27.841879,jeju,대한민국 제주시 연동 제주 썬호텔,4,0
1,415424,2019-09-01,4270000,1,357,한라병원,33.48944,126.48508,1.0,6.0,...,0.0,0.0,4.966848,37.008492,38.448058,27.996179,jeju,대한민국 제주특별자치도 제주시 연동 도령로,4,0
2,415425,2019-09-01,4270000,1,432,정존마을,33.48181,126.47352,2.0,4.0,...,0.0,0.0,6.330746,35.642126,39.241905,27.496551,jeju,대한민국 제주시 노형동 정존마을,4,0
3,415426,2019-09-01,4270000,0,1579,제주국제공항(600번),33.50577,126.49252,1.0,11.0,...,0.0,0.0,3.575272,38.643401,38.36934,29.579404,jeju,대한민국 제주특별자치도 제주시 용담2동 1984-5,4,0
4,415427,2019-09-01,4270000,0,1636,롯데호텔,33.24872,126.41032,0.0,0.0,...,1.0,0.0,31.460686,23.59401,46.337724,14.445605,seoguipo,대한민국 제주특별자치도 서귀포시 색달동 2812-4,4,0


In [50]:
train.shape, test.shape

((415423, 37), (228170, 36))

In [51]:
train['holiday'].value_counts()

1    297247
0    118176
Name: holiday, dtype: int64

In [52]:
test['holiday'].value_counts()

1    152924
0     75246
Name: holiday, dtype: int64

# train, test 저장

In [53]:
train.to_csv("train_given.csv", index=False)
test.to_csv("test_given.csv", index=False)