# Dacon Jeju Bigdata Competiton
## 1차 회의 - 데이터 확인

In [1]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


In [2]:
bus = pd.read_csv("bus_bts.csv")
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

## 변수 설명
### bus 
user_card_id : 해당 승객의 버스카드ID  
bus_route_id : 노선ID  
vhc_id : 차량ID  
geton_date : 해당 승객이 탑승한 날짜  
geton_time : 해당 승객이 탑승한 시간  
geton_station_code : 승차정류소의 ID  
geton_station_name : 승차정류소의 이름  
getoff_date : 해당 승객이 하차한 날짜 (하차태그 없는 경우, NaN)  
getoff_time : 해당 승객이 하차한 시간 (하차태그 없는 경우, NaN)  
getoff_station_code : 하차정류소의 ID (하차태그 없는 경우, NaN)  
getoff_station_name : 하차정류소의 이름 (하차태그 없는 경우, NaN)  
user_category : 승객 구분 (1:일반, 2:어린이, 4:청소년, 6:경로, 27:장애일반, 28:장애동반, 29:유동일반, 30:유공동반)  
user_count : 해당 버스카드로 계산한 인원수 ( ex- 3은 3명 분의 버스비를 해당 카드 하나로 계산한 것)
### train / test
id : 해당 데이터에서의 고유한 ID(train, test와의 중복은 없음)  
date : 날짜  
bus_route_id : 노선ID  
in_out : 시내버스, 시외버스 구분  
station_code : 해당 승하차 정류소의 ID  
station_name : 해당 승하차 정류소의 이름  
latitude : 해당 버스 정류장의 위도 (같은 정류장 이름이어도 버스의 진행 방향에 따라 다를 수 있음)  
longitude : 해당 버스 정류장의 경도 (같은 정류장 이름이어도 버스의 진행 방향에 따라 다를 수 있음)  
6-7_ride : 6:00:00부터 6:59:59까지 승차한 인원 수  
7-8_ride : 7:00:00부터 7:59:59까지 승차한 인원 수  
8-9_ride : 8:00:00부터 8:59:59까지 승차한 인원 수  
9-10_ride : 9:00:00부터 9:59:59까지 승차한 인원 수  
10-11_ride : 10:00:00부터 10:59:59까지 승차한 인원 수  
11-12_ride : 11:00:00부터 11:59:59까지 승차한 인원 수  
6-7_takeoff : 6:00:00부터 6:59:59까지 하차한 인원 수  
7-8_takeoff : 7:00:00부터 7:59:59까지 하차한 인원 수  
8-9_takeoff : 8:00:00부터 8:59:59까지 하차한 인원 수  
9-10_takeoff : 9:00:00부터 9:59:59까지 하차한 인원 수  
10-11_takeoff : 10:00:00부터 10:59:59까지 하차한 인원 수  
11-12_takeoff : 11:00:00부터 11:59:59까지 하차한 인원 수  
18-20_ride : 18:00:00부터 19:59:59까지 승차한 인원 수(train data에만 존재)

## 데이터 크기

In [3]:
bus.shape

(2409414, 13)

In [4]:
train.shape

(415423, 21)

In [5]:
test.shape

(228170, 20)

## 결측치 확인

In [6]:
bus.isnull().sum()

user_card_id                0
bus_route_id                0
vhc_id                      0
geton_date                  0
geton_time                  0
geton_station_code          0
geton_station_name         49
getoff_date            895736
getoff_time            895736
getoff_station_code    895736
getoff_station_name    895775
user_category               0
user_count                  0
dtype: int64

In [7]:
train.isnull().sum()

id               0
date             0
bus_route_id     0
in_out           0
station_code     0
station_name     0
latitude         0
longitude        0
6~7_ride         0
7~8_ride         0
8~9_ride         0
9~10_ride        0
10~11_ride       0
11~12_ride       0
6~7_takeoff      0
7~8_takeoff      0
8~9_takeoff      0
9~10_takeoff     0
10~11_takeoff    0
11~12_takeoff    0
18~20_ride       0
dtype: int64

In [8]:
test.isnull().sum()

id               0
date             0
bus_route_id     0
in_out           0
station_code     0
station_name     0
latitude         0
longitude        0
6~7_ride         0
7~8_ride         0
8~9_ride         0
9~10_ride        0
10~11_ride       0
11~12_ride       0
6~7_takeoff      0
7~8_takeoff      0
8~9_takeoff      0
9~10_takeoff     0
10~11_takeoff    0
11~12_takeoff    0
dtype: int64

### bus테이블에만 결측치 있음
- 하차 태그를 하지 않아서 생긴 결측치 895736건
    - 예측을 통해 채워넣어야함
- 정류소 이름 결측치 49건

## geton_station_name 결측치 채우기

In [4]:
# 정류소 이름 결측치 채우기 위해 정류소 코드 이용
no_station_name = list(bus[bus['geton_station_name'].isnull()]['geton_station_code'].unique())

In [5]:
# 총 27종류의 정류소 이름이 필요
len(no_station_name)

27

In [6]:
train[train['station_code'].isin(no_station_name)]

Unnamed: 0,id,date,bus_route_id,in_out,station_code,station_name,latitude,longitude,6~7_ride,7~8_ride,...,9~10_ride,10~11_ride,11~12_ride,6~7_takeoff,7~8_takeoff,8~9_takeoff,9~10_takeoff,10~11_takeoff,11~12_takeoff,18~20_ride


In [7]:
test[test['station_code'].isin(no_station_name)]

Unnamed: 0,id,date,bus_route_id,in_out,station_code,station_name,latitude,longitude,6~7_ride,7~8_ride,8~9_ride,9~10_ride,10~11_ride,11~12_ride,6~7_takeoff,7~8_takeoff,8~9_takeoff,9~10_takeoff,10~11_takeoff,11~12_takeoff
6740,422163,2019-10-01,27000000,시내,1341,조천도서관,33.53719,126.6667,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20672,436095,2019-10-02,24370000,시내,3339,디케이서비스,33.46377,126.5208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
34454,449877,2019-10-03,27660000,시내,4131,방지물,33.29693,126.22997,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
36964,452387,2019-10-03,29270000,시내,2677,위미리,33.27573,126.6664,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
55602,471025,2019-10-04,32680000,시내,2961,소길장밭,33.4425,126.38427,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56894,472317,2019-10-05,22470000,시내,1597,고상동,33.50859,126.67321,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61909,477332,2019-10-05,27240000,시내,3381,유건이오름,33.41698,126.84021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
70213,485636,2019-10-06,22470000,시내,1597,고상동,33.50859,126.67321,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74080,489503,2019-10-06,25000000,시내,3170,별내린전망대,33.24723,126.41686,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74645,490068,2019-10-06,27000000,시내,1341,조천도서관,33.53719,126.6667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [8]:
nan_name = test[test['station_code'].isin(no_station_name)][['station_code', 'station_name']]

In [9]:
nan_name = nan_name .drop_duplicates()
len(nan_name)
# 27개의 결측치 정보를 다 가지고 있음

27

In [10]:
# 결측치를 채우기 위해 nan_name테이블 Join
bus = pd.merge(bus, nan_name, left_on = 'geton_station_code',right_on='station_code', how='left')

In [11]:
# geton_station_name의 null값을 station_name으로 대체
bus['geton_station_name'] = np.where(bus['geton_station_name'].isnull(), bus['station_name'], bus['geton_station_name'] )

In [12]:
# Join했던 nan_name테이블의 column 삭제
bus = bus.drop(['station_code','station_name'], axis = 1)

In [13]:
# null값이 있는지 다시 확인
bus.isnull().sum()

user_card_id                0
bus_route_id                0
vhc_id                      0
geton_date                  0
geton_time                  0
geton_station_code          0
geton_station_name          0
getoff_date            895736
getoff_time            895736
getoff_station_code    895736
getoff_station_name    895775
user_category               0
user_count                  0
dtype: int64

## getoff_station_name 결측치 채우기

In [14]:
no_station_name2 = list(bus[bus['getoff_station_name'].isnull()]['getoff_station_code'].unique())

In [15]:
del no_station_name[0]
no_station_name2

[nan,
 4070.0,
 3399.0,
 4106.0,
 4149.0,
 4091.0,
 4073.0,
 1888.0,
 3553.0,
 1428.0,
 4182.0,
 4190.0,
 1597.0,
 3312.0,
 3502.0,
 3339.0,
 3493.0,
 2331.0,
 1341.0,
 3381.0,
 1285.0,
 4121.0,
 2677.0,
 3377.0,
 2041.0]

In [16]:
len(no_station_name2)

25

In [17]:
train[train['station_code'].isin(no_station_name2)]

Unnamed: 0,id,date,bus_route_id,in_out,station_code,station_name,latitude,longitude,6~7_ride,7~8_ride,...,9~10_ride,10~11_ride,11~12_ride,6~7_takeoff,7~8_takeoff,8~9_takeoff,9~10_takeoff,10~11_takeoff,11~12_takeoff,18~20_ride


In [18]:
test[test['station_code'].isin(no_station_name2)]

Unnamed: 0,id,date,bus_route_id,in_out,station_code,station_name,latitude,longitude,6~7_ride,7~8_ride,8~9_ride,9~10_ride,10~11_ride,11~12_ride,6~7_takeoff,7~8_takeoff,8~9_takeoff,9~10_takeoff,10~11_takeoff,11~12_takeoff
6740,422163,2019-10-01,27000000,시내,1341,조천도서관,33.53719,126.6667,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20672,436095,2019-10-02,24370000,시내,3339,디케이서비스,33.46377,126.5208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
34089,449512,2019-10-03,26330000,시내,4190,도순거린내,33.25941,126.47365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
34205,449628,2019-10-03,27180000,시내,4121,행원리오션뷰2차,33.5269,126.78918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
36964,452387,2019-10-03,29270000,시내,2677,위미리,33.27573,126.6664,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
46684,462107,2019-10-04,26050000,시내,1888,학림동,33.29013,126.60706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
47329,462752,2019-10-04,27230000,시내,3377,영주산,33.39828,126.80358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
56894,472317,2019-10-05,22470000,시내,1597,고상동,33.50859,126.67321,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61909,477332,2019-10-05,27240000,시내,3381,유건이오름,33.41698,126.84021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
69697,485120,2019-10-06,21930000,시내,4182,소남동산,33.2944,126.63187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [26]:
nan_name2 = test[test['station_code'].isin(no_station_name2)][['station_code', 'station_name']]

In [27]:
nan_name2 = nan_name2.drop_duplicates()
len(nan_name2)
# 24개의 결측치 중 18개의 정보만 가지고 있음

18

In [28]:
# 결측치를 채우기 위해 nan_name테이블 Join
bus = pd.merge(bus, nan_name2, left_on = 'getoff_station_code',right_on='station_code', how='left')

In [29]:
# geton_station_name의 null값을 station_name으로 대체
bus['getoff_station_name'] = np.where(bus['getoff_station_name'].isnull(), bus['station_name'], bus['getoff_station_name'] )

In [30]:
# Join했던 nan_name테이블의 column 삭제
bus = bus.drop(['station_code','station_name'], axis = 1)

In [31]:
# null값이 있는지 다시 확인
bus.isnull().sum()

user_card_id                0
bus_route_id                0
vhc_id                      0
geton_date                  0
geton_time                  0
geton_station_code          0
geton_station_name          0
getoff_date            895736
getoff_time            895736
getoff_station_code    895736
getoff_station_name    895744
user_category               0
user_count                  0
dtype: int64

In [None]:
# 39개의 결측치(중복제외 24개) / 8개의 결측값(중복제외 6개) 해결 X

In [32]:
import os
import warnings
warnings.filterwarnings('ignore')

In [33]:
train.head()

Unnamed: 0,id,date,bus_route_id,in_out,station_code,station_name,latitude,longitude,6~7_ride,7~8_ride,...,9~10_ride,10~11_ride,11~12_ride,6~7_takeoff,7~8_takeoff,8~9_takeoff,9~10_takeoff,10~11_takeoff,11~12_takeoff,18~20_ride
0,0,2019-09-01,4270000,시외,344,제주썬호텔,33.4899,126.49373,0.0,1.0,...,5.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2019-09-01,4270000,시외,357,한라병원,33.48944,126.48508,1.0,4.0,...,2.0,5.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,2,2019-09-01,4270000,시외,432,정존마을,33.48181,126.47352,1.0,1.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,3,2019-09-01,4270000,시내,1579,제주국제공항(600번),33.50577,126.49252,0.0,17.0,...,26.0,14.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,53.0
4,4,2019-09-01,4270000,시내,1646,중문관광단지입구,33.25579,126.4126,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


# 1. date 변수 변환
- date 컬럼을 판다스의 datetime 변수형으로 변환한다.

In [34]:
train['date'] = pd.to_datetime(train['date'])

In [35]:
train['weekday'] = train['date'].dt.weekday

파이썬 도큐멘테이션에는 weekday를 아래와 같이 설명하고 있다.

Return the day of the week as an integer, where Monday is 0 and Sunday is 6.

weekday 변수 생성 후 pandas의 get_dummies 함수를 활용하여 weekday에 대한 one hot encoding을 실행한다.

In [36]:
train = pd.get_dummies(train,columns=['weekday'])

In [37]:
test['date'] = pd.to_datetime(test['date'])
test['weekday'] = test['date'].dt.weekday
test = pd.get_dummies(test,columns=['weekday'])

In [40]:
# 2. in_out 변수 변환

In [38]:
train['in_out'].value_counts()

시내    408500
시외      6923
Name: in_out, dtype: int64

In [39]:
train['in_out'] = train['in_out'].map({'시내':0,'시외':1})

In [40]:
test['in_out'] = test['in_out'].map({'시내':0,'시외':1})

# 3. 좌표 데이터를 이용한 Feature Engineering
- 제주도의 인구는 서귀포시와 제주시에 몰려 있는 경우가 많다. 그러므로 해당 지역과의 거리를 각각 feature로 추가하겠다.

- dis_jejusi : 버스정류장과 제주시와의 거리

- dis_seoquipo : 버스정류장과 서귀포시와의 거리

- 제주시의 위.경도 : 33.500770, 126.522761

- 서귀포시의 위.경도 : 33.259429, 126.558217

- 2 개의 좌표 간의 거리를 구하는 함수는 파이썬의 geopy 패키지에 존재한다.

In [42]:
import geopy.distance

coords_jejusi = (33.500770, 126.522761) #제주시의 위도 경도
coords_seoquipo = (33.259429, 126.558217) #서귀포시의 위도 경도


train['dis_jejusi'] = [geopy.distance.vincenty((train['latitude'].iloc[i],train['longitude'].iloc[i]), coords_jejusi).km for i in range(len(train))]
train['dis_seoquipo'] = [geopy.distance.vincenty((train['latitude'].iloc[i],train['longitude'].iloc[i]), coords_seoquipo).km for i in range(len(train))]

In [45]:
train['dis_jejusi'].corr(train['18~20_ride'])

-0.12380548460763131

In [44]:
train['dis_seoquipo'].corr(train['18~20_ride'])

0.022087013666359644

0.022087013666357798
제주시와의 거리가 멀면 멀수록 승차인원이 줄어드는 경향을 보이나 서귀포시와의 거리는 모호한 상관성을 띄고 있다.

test 데이터에 대해서도 위와 똑같은 변환을 해준다.

In [47]:
test['dis_jejusi'] = [geopy.distance.vincenty((test['latitude'].iloc[i],test['longitude'].iloc[i]), coords_jejusi).km for i in range(len(test))]
test['dis_seoquipo'] = [geopy.distance.vincenty((test['latitude'].iloc[i],test['longitude'].iloc[i]), coords_seoquipo).km for i in range(len(test))]

  """Entry point for launching an IPython kernel.
  


# 4. randomforest을 활용한 모델링
- sklearn 내부의 randomforest 함수를 이용해 모델링을 진행한다.

- 머신러닝 학습에 활용할 input 변수와 target 변수를 정의한다.

In [62]:
from sklearn.linear_model import LinearRegression
model =  LinearRegression()
# 선미님 조언 : regressor 해보고, 타임시리즈 rms, 페이스북 꺼 써서 해보기 

In [57]:
input_var=['in_out','latitude','longitude','6~7_ride', '7~8_ride', '8~9_ride', '9~10_ride',
       '10~11_ride', '11~12_ride', '6~7_takeoff', '7~8_takeoff', '8~9_takeoff',
       '9~10_takeoff', '10~11_takeoff', '11~12_takeoff','weekday_0', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4',
       'weekday_5', 'weekday_6', 'dis_jejusi', 'dis_seoquipo']
# 알아서 스플릿 쓰고 싶을 때는 input_var에 train [] 달아주기
target=['18~20_ride']

In [64]:
X_train=train[input_var]
y_train=train[target]

X_test=test[input_var]

In [65]:
model.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

- randomforest model 정의

In [None]:
forest = RandomForestClassifier(random_state=42, n_estimators=20)

In [None]:
forest.fit(x_train, y_train)

In [None]:
test['18~20_ride'] = forest.predict(x_test)

In [None]:
test.head()

In [None]:
test[['id','18~20_ride']].to_csv("dacon_base_line.csv",index=False)

In [None]:
# acc 알아보는 과정 

In [None]:
prediction_proba = forest.predict_proba(x_test)[:,1] # [:,1] 

In [None]:
y_test = pd.DataFrame(y_test)
y_test.columns = ['Actual']

In [None]:
y_test['Predict'] = prediction
y_test['Predict_proba'] = prediction_proba

In [None]:
y_test.head()

In [None]:
y_test.groupby(['Actual', 'Predict']).size()

In [None]:
plt.figure(figsize=(15, 8))
sns.distplot(y_test[y_test['Actual']==1]['Predict_proba'], color="skyblue", label="Survived")
sns.distplot(y_test[y_test['Actual']==0]['Predict_proba'], color="Red", label="Dead")

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [None]:
forest.predict_proba(x_test) 

In [None]:
y_proba = forest.predict_proba(x_test)[:,1]

In [None]:
y_test.head()

test['18~20_ride'] = rf.predict(X_test)
test[['id','18~20_ride']].to_csv("dacon_base_line.csv",index=False)

In [None]:
test.head()

In [None]:
test['18~20_ride'] = rf.predict(X_test)
test[['id','18~20_ride']].to_csv("dacon_base_line.csv",index=False)

In [None]:
# 여기까지 일단 해보기...왜 acc가 안 될까? 낼 질문하기

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=1217)

In [None]:
# 모델 학습 
rf.fit(X_train,y_train)

In [None]:
from sklearn.metrics import accuracy_score

test['18~20_ride'] = rf.predict(X_test)
accuracy = accuracy_score(X_test, test['18~20_ride'])

print(f'Out-of-bag score estimate: {rf.oob_score_:.3}')
print(f'Mean accuracy score: {accuracy:.3}')

In [None]:
test.head()

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_vail['Actual'], y_vail['Predict']))

In [None]:

학습한 모델을 이용해 예측을 진행하고 이를 submission.csv라는 파일로 저장한다.
test['18~20_ride'] = rf.predict(X_test)
test[['id','18~20_ride']].to_csv("dacon_base_line.csv",index=False)