In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import featureGenerators as f
import dataHandler as data
import deltaRegressors as reg



In [3]:
# Build the regressor wrapper from the project's deltaRegressors module.
# Later cells use it through `rf.train(...)` and `rf.regressor`.
rf=reg.GetRegressor("RandomForestRegressor")

In [4]:
def read_data(train_path, test_path, routes_path, stops_path):
    """Load the four project CSV files and enrich the training rows with stop data.

    Parameters
    ----------
    train_path, test_path, routes_path, stops_path
        Locations of the CSV files, in any form accepted by ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(train, test, routes, stops)``. ``train`` is the training CSV
        left-joined with the stops table on ``BUSSTOP_ID``; the other three
        frames are returned exactly as read.
    """
    # The files are read lazily in argument order: train, test, routes, stops.
    raw_train, test, routes, stops = (
        pd.read_csv(csv_path)
        for csv_path in (train_path, test_path, routes_path, stops_path)
    )
    # A left join keeps every training row, even when its stop is unknown.
    train = raw_train.merge(stops, how='left', on='BUSSTOP_ID')
    return train, test, routes, stops

In [5]:
def generate_features(df):
    """Derive per-bus, per-day stop-to-stop features.

    Rows are ordered by ``BUS_ID`` and ``TIMESTAMP`` (ascending) and each row
    is compared with the previous row of the same bus on the same calendar
    day. The first row of every (bus, day) group has no predecessor, so its
    derived values are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``RECORD_DATE``, ``BUS_ID``, ``TIMESTAMP``,
        ``BUSSTOP_NMMN``, ``GPS_COORDX`` and ``GPS_COORDY``. The input frame
        is not modified.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``df`` with ``RECORD_DATE`` converted to datetime and
        these columns appended, in order:

        * ``ymd`` - ``RECORD_DATE`` formatted as ``YYYY-MM-DD``.
        * ``time_spent`` - ``TIMESTAMP`` difference to the previous row.
        * ``busstop_from_to`` - ``"<previous stop> >> <current stop>"``.
        * ``distance`` - Euclidean norm of the ``GPS_COORDX``/``GPS_COORDY``
          differences to the previous row, in the units of those columns.
    """
    # Convert RECORD_DATE to datetime. `assign` builds a new frame, so the
    # caller's DataFrame is left untouched.
    record_date = pd.to_datetime(df['RECORD_DATE'])
    df = df.assign(
        RECORD_DATE=record_date,
        ymd=record_date.dt.strftime('%Y-%m-%d'),
    )

    # The diffs below assume chronological order within each bus. `ascending`
    # must hold real booleans: the strings 'False' used previously are truthy
    # on old pandas and rejected outright by versions that validate the
    # argument.
    df = df.sort_values(by=['BUS_ID', 'TIMESTAMP'], ascending=[True, True])

    # Compute everything that depends on the previous row of the same bus/day.
    per_bus_day = df.groupby(['BUS_ID', 'ymd'])
    time_spent = per_bus_day['TIMESTAMP'].diff()
    previous_stop = per_bus_day['BUSSTOP_NMMN'].shift(1)
    delta_x = per_bus_day['GPS_COORDX'].diff()
    delta_y = per_bus_day['GPS_COORDY'].diff()

    # Time spent between consecutive stops.
    df['time_spent'] = time_spent
    df['busstop_from_to'] = previous_stop + " >> " + df['BUSSTOP_NMMN']
    # Vectorised form of the former row-wise apply; the formula is unchanged.
    df['distance'] = np.sqrt(delta_x ** 2 + delta_y ** 2)
    return df

In [6]:
# Input CSV locations, given relative to the notebook's working directory.
train_path = '../data/main/training.csv'
test_path = '../data/main/test.csv'
routes_path = '../data/main/fixed_routes.csv'
stops_path = '../data/main/stops.csv'

In [7]:
# Load the four CSVs. read_data returns the training frame already left-joined
# with the stops table on BUSSTOP_ID; the other frames are returned as read.
tr_data, tst_data, rt_data, stp_data = read_data(train_path, test_path, routes_path, stops_path)

In [8]:
# Add the per-bus, per-day columns (ymd, time_spent, busstop_from_to, distance)
# and preview the result. tr_data is rebound, so re-running this cell feeds the
# already-processed frame back into generate_features.
tr_data = generate_features(tr_data)
tr_data.head()

Unnamed: 0,index,RECORD_DATE,BUS_ID,BUSROUTE_ID,BUSSTOP_ID,BUSSTOP_SEQ,TIMESTAMP,BUSSTOP_NMMN,GPS_COORDX,GPS_COORDY,ymd,time_spent,busstop_from_to,distance
174725,173467,2020-01-06 17:03:34,2053,11100010,380,21,1578330214,Офицеруудын ордон,106.97022,47.91616,2020-01-06,,,
1636680,1624542,2020-01-13 07:28:25,2053,11100010,18,2,1578900505,Монгол хюндэй киа компани,106.8266,47.91059,2020-01-13,,,
1637290,1625151,2020-01-13 07:30:22,2053,11100010,11,3,1578900622,32-р байр,106.83359,47.9111,2020-01-13,117.0,Монгол хюндэй киа компани >> 32-р байр,0.007009
1637940,1625791,2020-01-13 07:32:25,2053,11100010,9,4,1578900745,Хар хорин,106.84054,47.91166,2020-01-13,123.0,32-р байр >> Хар хорин,0.006973
1638576,1626425,2020-01-13 07:34:26,2053,11100010,6,5,1578900866,Цамбагарав,106.84737,47.91216,2020-01-13,121.0,Хар хорин >> Цамбагарав,0.006848


In [9]:
# Load the weather table, parse its 'date' column and derive the 'ymd' string
# (YYYY-MM-DD) that the training frame is later joined on.
weather_data = pd.read_csv('../data/weather.csv')
parsed_dates = pd.to_datetime(weather_data['date'])
weather_data = weather_data.assign(
    date=parsed_dates,
    ymd=parsed_dates.dt.strftime('%Y-%m-%d'),
)
weather_data.head()

Unnamed: 0,date,temp_avg,precipitation,ymd
0,2020-01-05,-14.64,0.0,2020-01-05
1,2020-01-06,-21.86,0.0,2020-01-06
2,2020-01-07,-23.67,0.0,2020-01-07
3,2020-01-08,-23.4,0.0,2020-01-08
4,2020-01-09,-20.24,0.0,2020-01-09


In [10]:
# (month feature currently disabled)
#tr_data["month"] = tr_data['RECORD_DATE'].dt.month
# Day of week as an integer; pandas numbers Monday as 0.
tr_data["weekday"] = tr_data['RECORD_DATE'].dt.dayofweek
# Attach temp_avg by joining on the 'ymd' date string. The left join keeps
# training rows whose date is absent from the weather table (temp_avg is NaN there).
tr_data = tr_data.merge(weather_data[['temp_avg', 'ymd']], on = 'ymd', how = 'left')

In [11]:
# Earlier variant that re-read the raw CSV instead of reusing the feature frame:
#tr_data = pd.read_csv("../data/main/training.csv",index_col=0).sort_values(["BUS_ID","TIMESTAMP"])
# Order rows by bus and time before computing deltas.
# NOTE(review): presumably data.calcTimeDelta diffs consecutive rows and so
# depends on this ordering - confirm in dataHandler.
tr_data = tr_data.sort_values(["BUS_ID", "TIMESTAMP"])
tr_data.head()

Unnamed: 0,index,RECORD_DATE,BUS_ID,BUSROUTE_ID,BUSSTOP_ID,BUSSTOP_SEQ,TIMESTAMP,BUSSTOP_NMMN,GPS_COORDX,GPS_COORDY,ymd,time_spent,busstop_from_to,distance,weekday,temp_avg
0,173467,2020-01-06 17:03:34,2053,11100010,380,21,1578330214,Офицеруудын ордон,106.97022,47.91616,2020-01-06,,,,0,-21.86
1,1624542,2020-01-13 07:28:25,2053,11100010,18,2,1578900505,Монгол хюндэй киа компани,106.8266,47.91059,2020-01-13,,,,0,-26.81
2,1625151,2020-01-13 07:30:22,2053,11100010,11,3,1578900622,32-р байр,106.83359,47.9111,2020-01-13,117.0,Монгол хюндэй киа компани >> 32-р байр,0.007009,0,-26.81
3,1625791,2020-01-13 07:32:25,2053,11100010,9,4,1578900745,Хар хорин,106.84054,47.91166,2020-01-13,123.0,32-р байр >> Хар хорин,0.006973,0,-26.81
4,1626425,2020-01-13 07:34:26,2053,11100010,6,5,1578900866,Цамбагарав,106.84737,47.91216,2020-01-13,121.0,Хар хорин >> Цамбагарав,0.006848,0,-26.81


In [12]:
# TODO: make calcTimeDelta check that consecutive rows fall on the same day.
# NOTE(review): the first value previewed below, 570291.0, equals the TIMESTAMP
# gap between the 2020-01-06 row and the first 2020-01-13 row of bus 2053
# (1578900505 - 1578330214), so the deltas currently span day boundaries.
td=data.calcTimeDelta(tr_data)
td.head()

0    570291.0
1       117.0
2       123.0
3       121.0
4       136.0
Name: TIMESTAMP, dtype: float64

In [13]:
# Feature selection used for the test run.
def calc(X):
    """Return ``X`` without the identifier, timestamp and derived helper columns.

    Every column whose label is not in the exclusion list is kept, in its
    original order. Excluded labels that are absent from ``X`` are ignored.
    """
    excluded = [
        'TIMESTAMP',
        'RECORD_DATE',
        'index',
        'ymd',
        'time_spent',
        'busstop_from_to',
        'BUSSTOP_NMMN',
        'distance',
    ]
    is_excluded = X.columns.isin(excluded)
    return X.loc[:, ~is_excluded]

In [None]:
# Keep only the columns calc() lets through and fit the regressor on them.
# NOTE(review): td is passed as the second argument, presumably the target
# vector - confirm against deltaRegressors' train() signature.
features = calc(tr_data)
rf.train(features, td)



In [None]:
# Show the importances exposed by the wrapped estimator.
# NOTE(review): assumes rf.regressor is the fitted scikit-learn model and that
# values follow the column order of `features` - confirm in deltaRegressors.
rf.regressor.feature_importances_

In [None]:
# Preview the test set exactly as read_data loaded it. None of the cells above
# add the training features (stops join, generate_features, weekday, weather) to it.
tst_data.head()