# Забродина Дарья
# BASELINE 2


In [38]:
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMClassifier
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.metrics import roc_auc_score
import numpy as np
import datetime
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold

Считываем данные:

In [39]:
# Load the training features; 'due' is parsed into datetimes on read.
data_train = pd.read_csv('train_data.csv', parse_dates=['due'])
# The labels are stored in a separate file.
target_train = pd.read_csv('train_target.csv')
# NOTE(review): target_train is a DataFrame, not a Series - this assignment
# presumably relies on the file holding exactly one column; confirm.
data_train['target'] = target_train
# Load the test features the same way (no labels available for them).
data_test = pd.read_csv('test_data.csv', parse_dates=['due'])

Сортируем данные по времени:

In [40]:
# Order the training rows chronologically by 'due'; the later 70/30
# validation split takes rows by position and therefore depends on this order.
data_train = data_train.sort_values(by='due')

In [41]:
# Preview the first rows after sorting by 'due'.
data_train.head()

Unnamed: 0,dist,due,f_class,lat,lon,s_class,t_class,target
823150,17990.125431,2014-01-01 00:09:32,econom,55.75013,37.823242,,,1
685195,2322.180999,2014-01-01 00:10:00,econom,55.711488,37.884183,,,0
996389,14608.577392,2014-01-01 00:10:00,econom,55.633404,37.797595,,,0
497811,20055.569096,2014-01-01 00:10:00,econom,55.78038,37.64524,,,0
812346,20585.444093,2014-01-01 00:10:00,econom,55.675676,37.500387,,,1


In [42]:
# Keep references to the raw 'due' columns.
# NOTE(review): df_train / df_test are not used anywhere in the visible part
# of the notebook - confirm whether they are still needed.
df_train = data_train['due']
df_test = data_test['due']

In [43]:
# Number of rows and columns in the test frame.
data_test.shape

(510937, 7)

## Добавление базовых признаков на основе ноутбука с бейзлайном 1

Функция по добавлению признаков:
    
* делим время на составные части
* обрабатываем пропущенные значения

In [44]:
def add_features(data_train, data_test):
    """Build the baseline feature set for the train and test frames.

    For each frame the following columns are added:

    * ``is_dist`` - 1.0 where ``dist`` equals the sentinel value -1, else 0.0;
    * ``date`` - ``due`` truncated to midnight;
    * ``week_day`` - day of week (Monday is 0), ``weekend`` - 1 for days 5 and 6;
    * ``time_as_datetime`` - ``due`` parsed as a datetime;
    * ``time_in_seconds`` / ``time_in_minutes`` / ``time_in_hours`` - time of
      day in whole seconds / minutes / hours (stored as floats);
    * ``total_minutes`` - whole minutes elapsed since 2014-01-01 00:00.

    Missing ``f_class`` / ``s_class`` / ``t_class`` values are replaced by
    ``unknown_f`` / ``unknown_s`` / ``unknown_t``. Those three columns and
    ``week_day`` are then one-hot encoded; the test categories are forced to
    the train categories so both frames get the same dummy columns.

    The date and the time of day are taken from the ``.dt`` accessor instead
    of being sliced out of ``str(due)``: the previous ``[:-4]`` slice was only
    correct when pandas printed a 4-character fractional suffix (``.000``) and
    truncated the time for timestamps printed without it.

    The input frames are copied, so the caller's objects are left untouched.

    Parameters
    ----------
    data_train, data_test : pandas.DataFrame
        Frames with at least the ``dist``, ``due``, ``f_class``, ``s_class``
        and ``t_class`` columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(data_train, data_test)`` with the new features and dummy columns.
    """
    # Reference point for the absolute time feature.
    start_time = datetime.datetime(2014, 1, 1, 0, 0)
    class_fill = {
        'f_class': 'unknown_f',
        's_class': 'unknown_s',
        't_class': 'unknown_t',
    }

    prepared = []
    for data in (data_train, data_test):
        data = data.copy()
        due = pd.to_datetime(data['due'])

        data['is_dist'] = (data['dist'] == -1).astype(float)

        # add date features
        data['date'] = due.dt.normalize()
        data['week_day'] = due.dt.dayofweek
        data['weekend'] = data['week_day'].isin([5, 6]).astype(int)

        # add time features (whole seconds since midnight, sub-second part dropped)
        data['time_as_datetime'] = due
        data['time_in_seconds'] = (
            due.dt.hour * 3600 + due.dt.minute * 60 + due.dt.second
        ).astype(float)
        data['time_in_minutes'] = data['time_in_seconds'] // 60
        data['time_in_hours'] = data['time_in_minutes'] // 60
        data['total_minutes'] = (
            (due - start_time).dt.total_seconds() // 60
        ).astype('float')

        # fillna for cat features
        data = data.fillna(class_fill)
        prepared.append(data)

    data_train, data_test = prepared

    for column in ['f_class', 's_class', 't_class', 'week_day']:
        data_train[column] = data_train[column].astype('category')
        data_test[column] = data_test[column].astype('category')
        # Test values unseen in train become NaN, i.e. all-zero dummy rows.
        data_test[column] = (
            data_test[column]
            .cat
            .set_categories(data_train[column].cat.categories)
        )

    return pd.get_dummies(data_train), pd.get_dummies(data_test)

In [45]:
# Replace both frames with their feature-enriched, one-hot encoded versions.
data_train, data_test = add_features(data_train, data_test)

In [46]:
# Preview the training frame with the new feature and dummy columns.
data_train.head()

Unnamed: 0,dist,due,lat,lon,target,is_dist,date,weekend,time_as_datetime,time_in_seconds,...,t_class_econom,t_class_unknown_t,t_class_vip,week_day_0,week_day_1,week_day_2,week_day_3,week_day_4,week_day_5,week_day_6
823150,17990.125431,2014-01-01 00:09:32,55.75013,37.823242,1,0.0,2014-01-01,0,2014-01-01 00:09:32,572.0,...,0,1,0,0,0,1,0,0,0,0
685195,2322.180999,2014-01-01 00:10:00,55.711488,37.884183,0,0.0,2014-01-01,0,2014-01-01 00:10:00,600.0,...,0,1,0,0,0,1,0,0,0,0
996389,14608.577392,2014-01-01 00:10:00,55.633404,37.797595,0,0.0,2014-01-01,0,2014-01-01 00:10:00,600.0,...,0,1,0,0,0,1,0,0,0,0
497811,20055.569096,2014-01-01 00:10:00,55.78038,37.64524,0,0.0,2014-01-01,0,2014-01-01 00:10:00,600.0,...,0,1,0,0,0,1,0,0,0,0
812346,20585.444093,2014-01-01 00:10:00,55.675676,37.500387,1,0.0,2014-01-01,0,2014-01-01 00:10:00,600.0,...,0,1,0,0,0,1,0,0,0,0


## City

Все точки можно разбить на кластеры, соответствующие пяти городам: Москве, Казани, Нижнему Новгороду, Санкт-Петербургу и Воронежу. Будем рассматривать центры этих городов как центры кластеров. Затем для каждой точки с помощью метода ближайшего соседа найдем, к какому кластеру она относится, а также расстояние до центра этого кластера (евклидово расстояние в координатах «широта — долгота»).

In [47]:
# Reference coordinates (lat, lon) used as the cluster centre of each city.
# The row position in this frame is the integer city id produced by the
# nearest-neighbour lookup below.
cities = pd.DataFrame([['moscow', 55.755814, 37.617635],
                       ['kazan',  55.796289, 49.108795],
                       ['nnovgorod',  56.326797, 44.006516],
                       ['spb',  59.939095, 30.315868],
                       ['voronezh',  51.660781, 39.200269]], columns = ['city', 'lat', 'lon'] ) 

# 1-nearest-neighbour index over the five city centres.
# NOTE(review): the metric is plain Euclidean distance on raw lat/lon values,
# so the resulting distances are in degrees, not metres - confirm this is intended.
NN = NearestNeighbors(n_neighbors=1, metric='euclidean')
NN.fit(cities[['lat', 'lon']])

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=1, n_neighbors=1, p=2, radius=1.0)

In [48]:
def get_city_features(X):
    """Attach nearest-city features to ``X`` in place.

    Looks up, for every row's (``lat``, ``lon``), the closest entry of the
    module-level ``NN`` index and writes three columns into ``X``:

    * ``centr_dist`` - distance to that nearest city centre as reported by ``NN``;
    * ``city`` - integer position of the nearest city in ``cities``;
    * ``city_name`` - the matching ``cities.city`` label.

    Nothing is returned; the frame passed in is modified.
    """
    centre_distances, centre_indices = NN.kneighbors(X[['lat', 'lon']])
    # kneighbors returns (n_rows, n_neighbors) arrays; n_neighbors is 1 here.
    X['centr_dist'] = centre_distances[:, 0]
    X['city'] = centre_indices[:, 0]
    city_lookup = cities.city
    X['city_name'] = X['city'].map(city_lookup)

In [49]:
# get_city_features mutates its argument and returns nothing, so the calls
# are made for their side effect on each frame.
get_city_features(data_train)
get_city_features(data_test)

In [50]:
# Preview the frame with the centr_dist / city / city_name columns added.
data_train.head()

Unnamed: 0,dist,due,lat,lon,target,is_dist,date,weekend,time_as_datetime,time_in_seconds,...,week_day_0,week_day_1,week_day_2,week_day_3,week_day_4,week_day_5,week_day_6,centr_dist,city,city_name
823150,17990.125431,2014-01-01 00:09:32,55.75013,37.823242,1,0.0,2014-01-01,0,2014-01-01 00:09:32,572.0,...,0,0,1,0,0,0,0,0.205686,0,moscow
685195,2322.180999,2014-01-01 00:10:00,55.711488,37.884183,0,0.0,2014-01-01,0,2014-01-01 00:10:00,600.0,...,0,0,1,0,0,0,0,0.270208,0,moscow
996389,14608.577392,2014-01-01 00:10:00,55.633404,37.797595,0,0.0,2014-01-01,0,2014-01-01 00:10:00,600.0,...,0,0,1,0,0,0,0,0.217646,0,moscow
497811,20055.569096,2014-01-01 00:10:00,55.78038,37.64524,0,0.0,2014-01-01,0,2014-01-01 00:10:00,600.0,...,0,0,1,0,0,0,0,0.036953,0,moscow
812346,20585.444093,2014-01-01 00:10:00,55.675676,37.500387,1,0.0,2014-01-01,0,2014-01-01 00:10:00,600.0,...,0,0,1,0,0,0,0,0.142018,0,moscow


## Atypical hour

Смотрим, как много вызовов такси было совершено в данный день и час в данном городе по отношению к среднему количеству вызовов в этот час по всем дням в этом городе.

In [51]:
# Truncate each order time to the start of its hour; get_atipic_hour groups
# on this 'time' column.
data_train['time'] = data_train.due.dt.floor("H")
data_test['time'] = data_test.due.dt.floor("H")

In [52]:
def get_atipic_hour(X):
    """Return ``X`` with an ``atipic_hour`` column merged in.

    Orders are counted per (``city``, ``time``) bucket. Each count is divided
    by the mean count of the buckets of the same city that share the same hour
    of day, so a value above 1 means a busier-than-usual hour. Buckets with no
    rows do not take part in the mean.

    The result is a new frame produced by a left merge on ``city`` and
    ``time``; ``X`` itself is not modified.
    """
    bucket_keys = ['city', 'time']
    hourly = (
        X.groupby(bucket_keys)['due']
        .count()
        .rename('counts_per_hour')
        .reset_index()
    )
    hourly['hour'] = hourly['time'].dt.hour
    # Typical load for this city at this hour of day, broadcast back per bucket.
    hourly['mean_count'] = (
        hourly.groupby(['city', 'hour'])['counts_per_hour'].transform('mean')
    )
    hourly['atipic_hour'] = hourly['counts_per_hour'] / hourly['mean_count']
    ratio_table = hourly[bucket_keys + ['atipic_hour']]
    return X.merge(ratio_table, on=bucket_keys, how='left')
    

In [53]:
# Add 'atipic_hour' to each frame. The hourly statistics are computed
# separately inside each frame, since each call only sees its own rows.
data_train = get_atipic_hour(data_train)
data_test = get_atipic_hour(data_test)

## KNN with lat and lon

Воспользуемся методом ближайших соседей, чтобы для каждой точки из тестовой выборки по 100 ближайшим соседям из тренировочной выборки оценить вероятность принадлежности к классу. В качестве признака `knn_prediction` берется вероятность класса 0.

In [54]:
# k-NN classifier over coordinates: 100 neighbours, Euclidean distance.
KNC = KNeighborsClassifier(metric='euclidean', n_neighbors=100)

In [55]:
# Fit on all training coordinates and score the test points.
KNC.fit(data_train[['lat', 'lon']], data_train.target)
test_prediction = KNC.predict_proba(data_test[['lat', 'lon']])
# Column 0 of predict_proba is stored as the feature.
# NOTE(review): with 0/1 targets this is presumably P(target == 0), not
# P(target == 1) - confirm that is the intended direction.
data_test['knn_prediction'] = test_prediction[:,0]

Для того, чтобы посмотреть результат на валидационной выборке, для тренировочных данных тоже нужно найти предсказания, что делается с помощью разбиения на фолды.

In [56]:
# Out-of-fold k-NN probabilities for the training rows (10 folds), so that a
# row's own label is not used for its own feature value.
prediction  = cross_val_predict(KNC, data_train[['lat', 'lon']], data_train.target, cv=10, method='predict_proba')
# Same column as for the test frame (column 0 of predict_proba).
data_train['knn_prediction'] = prediction[:,0]

## KNN with time

Предыдущий KNN дал небольшой прирост результата. Возникла идея использовать при подсчете расстояния еще и время, предварительно его нормировав. Это соотносится со следующим бытовым рассуждением: часто после отмены такси человек снова делает заказ через пару минут и в относительно близкой точке. Расстояние до ближайшего соседа в таких координатах оказалось признаком, который дал большой прирост.

In [57]:
# Inspect the largest 'total_minutes' value in each frame; the rescaling of
# the time axis in the next cell is based on this range.
print(data_test['total_minutes'].max())
print(data_train['total_minutes'].max())

129595.0
92640.0


In [58]:
# Rescale the absolute time into a 'z' coordinate so it can be used next to
# lat/lon in a nearest-neighbour search. The largest 'total_minutes' across
# both frames maps to TIME_AXIS_SPAN. The maximum is computed from the data
# instead of being copied by hand from printed output (it was hard-coded as
# 129595), so the scaling stays correct if the input files change.
TIME_AXIS_SPAN = 60
max_total_minutes = max(
    data_train['total_minutes'].max(),
    data_test['total_minutes'].max(),
)
data_train['z'] = data_train['total_minutes'] * TIME_AXIS_SPAN / max_total_minutes
data_test['z'] = data_test['total_minutes'] * TIME_AXIS_SPAN / max_total_minutes

In [59]:
# Number of training rows; used to split the stacked train+test frame back apart.
train_shape = data_train.shape[0]

In [60]:
# Stack the train and test points in (lat, lon, z) space, train rows first,
# so that neighbours can be searched across both sets at once.
knn_space_columns = ['lat', 'lon', 'z']
samples_train = data_train.loc[:, knn_space_columns]
samples_test = data_test.loc[:, knn_space_columns]
X = pd.concat([samples_train, samples_test], axis=0)

In [61]:
# Nearest neighbour of every stacked point among all the other points.
neigh = NearestNeighbors(n_neighbors=1)
neigh.fit(X)
# Calling kneighbors() without arguments queries the fitted points themselves.
# NOTE(review): per scikit-learn's docs a point is then not counted as its own
# neighbour - confirm for the installed version.
dist, neighbor = neigh.kneighbors() 
# Row position of the nearest neighbour, and the distance to it.
X['neighbor'] = neighbor
X['dist_t'] = dist

In [62]:
# Split the stacked distances back by position: the first train_shape rows
# belong to the training frame, the rest to the test frame. The assignment
# itself still aligns on index labels.
stacked_dist_t = X['dist_t']
data_train['dist_t'] = stacked_dist_t.iloc[:train_shape]
data_test['dist_t'] = stacked_dist_t.iloc[train_shape:]

## Features

Тут создаем список признаков, которые и будут использованы в модели.

In [63]:
# Columns fed to the model: raw distance and coordinates, time-of-day and
# weekend flags, the hourly load ratio, first/second class dummies and the
# two nearest-neighbour features.
feature_columns = ['dist', 'centr_dist', 'lat', 'lon',  'time_in_hours', 'weekend',  'atipic_hour', 'time_in_seconds', 'f_class_business', 'f_class_econom',
       'f_class_vip', 's_class_business', 's_class_econom',
      's_class_vip',  'dist_t', 'knn_prediction']
# Kept as a one-element list; consumers flatten it with .values.ravel().
target_column = ['target']

## Validation

In [67]:
# Positional 70/30 hold-out split: the first 70% of rows train the model, the
# remaining 30% validate it. The frame was sorted by 'due' earlier, so the
# validation part holds the later orders.
train_size = int(0.7 * data_train.shape[0])
train = data_train.iloc[:train_size]
valid = data_train.iloc[train_size:]

In [68]:
# Fit on the training part and monitor AUC on the hold-out part; training
# stops once the validation score has not improved for 50 rounds.
# NOTE(review): the verbose / early_stopping_rounds arguments of fit() belong
# to the older LightGBM API this notebook was run with - newer releases are
# understood to expect callbacks instead; confirm before upgrading.
clf = LGBMClassifier(n_estimators=1600, learning_rate=0.04, num_leaves=63, max_depth = 20)
clf.fit(
    train[feature_columns], train[target_column].values.ravel(),
    eval_set=[(valid[feature_columns], valid[target_column].values.ravel())],
    eval_metric='auc',
    verbose=True,
    early_stopping_rounds=50,
)

[1]	valid_0's auc: 0.716668	valid_0's binary_logloss: 0.420286
Training until validation scores don't improve for 50 rounds
[2]	valid_0's auc: 0.717293	valid_0's binary_logloss: 0.416537
[3]	valid_0's auc: 0.718213	valid_0's binary_logloss: 0.413229
[4]	valid_0's auc: 0.718695	valid_0's binary_logloss: 0.410259
[5]	valid_0's auc: 0.719979	valid_0's binary_logloss: 0.407625
[6]	valid_0's auc: 0.720962	valid_0's binary_logloss: 0.40524
[7]	valid_0's auc: 0.721393	valid_0's binary_logloss: 0.403105
[8]	valid_0's auc: 0.72234	valid_0's binary_logloss: 0.401108
[9]	valid_0's auc: 0.722562	valid_0's binary_logloss: 0.399314
[10]	valid_0's auc: 0.722808	valid_0's binary_logloss: 0.397674
[11]	valid_0's auc: 0.723362	valid_0's binary_logloss: 0.396092
[12]	valid_0's auc: 0.723814	valid_0's binary_logloss: 0.39467
[13]	valid_0's auc: 0.72464	valid_0's binary_logloss: 0.393361
[14]	valid_0's auc: 0.725453	valid_0's binary_logloss: 0.392115
[15]	valid_0's auc: 0.725949	valid_0's binary_logloss: 0

[130]	valid_0's auc: 0.73867	valid_0's binary_logloss: 0.37052
[131]	valid_0's auc: 0.7387	valid_0's binary_logloss: 0.370503
[132]	valid_0's auc: 0.738741	valid_0's binary_logloss: 0.370486
[133]	valid_0's auc: 0.738773	valid_0's binary_logloss: 0.370465
[134]	valid_0's auc: 0.738789	valid_0's binary_logloss: 0.370452
[135]	valid_0's auc: 0.738847	valid_0's binary_logloss: 0.370436
[136]	valid_0's auc: 0.738856	valid_0's binary_logloss: 0.370421
[137]	valid_0's auc: 0.738874	valid_0's binary_logloss: 0.37041
[138]	valid_0's auc: 0.738893	valid_0's binary_logloss: 0.370402
[139]	valid_0's auc: 0.73894	valid_0's binary_logloss: 0.370374
[140]	valid_0's auc: 0.738976	valid_0's binary_logloss: 0.370362
[141]	valid_0's auc: 0.739004	valid_0's binary_logloss: 0.370341
[142]	valid_0's auc: 0.739021	valid_0's binary_logloss: 0.370329
[143]	valid_0's auc: 0.73905	valid_0's binary_logloss: 0.370303
[144]	valid_0's auc: 0.739069	valid_0's binary_logloss: 0.370289
[145]	valid_0's auc: 0.739109	va

[258]	valid_0's auc: 0.740323	valid_0's binary_logloss: 0.369623
[259]	valid_0's auc: 0.74033	valid_0's binary_logloss: 0.369619
[260]	valid_0's auc: 0.74034	valid_0's binary_logloss: 0.369616
[261]	valid_0's auc: 0.740343	valid_0's binary_logloss: 0.369613
[262]	valid_0's auc: 0.740338	valid_0's binary_logloss: 0.369613
[263]	valid_0's auc: 0.740335	valid_0's binary_logloss: 0.369613
[264]	valid_0's auc: 0.740341	valid_0's binary_logloss: 0.369609
[265]	valid_0's auc: 0.740339	valid_0's binary_logloss: 0.369609
[266]	valid_0's auc: 0.740343	valid_0's binary_logloss: 0.369606
[267]	valid_0's auc: 0.740338	valid_0's binary_logloss: 0.369608
[268]	valid_0's auc: 0.740341	valid_0's binary_logloss: 0.369607
[269]	valid_0's auc: 0.740345	valid_0's binary_logloss: 0.369604
[270]	valid_0's auc: 0.740351	valid_0's binary_logloss: 0.3696
[271]	valid_0's auc: 0.740361	valid_0's binary_logloss: 0.369594
[272]	valid_0's auc: 0.740362	valid_0's binary_logloss: 0.369593
[273]	valid_0's auc: 0.740361

[386]	valid_0's auc: 0.740573	valid_0's binary_logloss: 0.369442
[387]	valid_0's auc: 0.740572	valid_0's binary_logloss: 0.369439
[388]	valid_0's auc: 0.740574	valid_0's binary_logloss: 0.36944
[389]	valid_0's auc: 0.740566	valid_0's binary_logloss: 0.369441
[390]	valid_0's auc: 0.74057	valid_0's binary_logloss: 0.369439
[391]	valid_0's auc: 0.74057	valid_0's binary_logloss: 0.369438
[392]	valid_0's auc: 0.74057	valid_0's binary_logloss: 0.369438
[393]	valid_0's auc: 0.740566	valid_0's binary_logloss: 0.369437
[394]	valid_0's auc: 0.740562	valid_0's binary_logloss: 0.369439
[395]	valid_0's auc: 0.740569	valid_0's binary_logloss: 0.369435
[396]	valid_0's auc: 0.740577	valid_0's binary_logloss: 0.36943
[397]	valid_0's auc: 0.740579	valid_0's binary_logloss: 0.369429
[398]	valid_0's auc: 0.740581	valid_0's binary_logloss: 0.369428
[399]	valid_0's auc: 0.740584	valid_0's binary_logloss: 0.369426
[400]	valid_0's auc: 0.740591	valid_0's binary_logloss: 0.369424
[401]	valid_0's auc: 0.740582	

[513]	valid_0's auc: 0.740732	valid_0's binary_logloss: 0.369306
[514]	valid_0's auc: 0.740733	valid_0's binary_logloss: 0.369306
[515]	valid_0's auc: 0.740735	valid_0's binary_logloss: 0.369304
[516]	valid_0's auc: 0.74074	valid_0's binary_logloss: 0.369303
[517]	valid_0's auc: 0.740736	valid_0's binary_logloss: 0.369305
[518]	valid_0's auc: 0.740735	valid_0's binary_logloss: 0.369305
[519]	valid_0's auc: 0.740732	valid_0's binary_logloss: 0.369306
[520]	valid_0's auc: 0.740728	valid_0's binary_logloss: 0.369307
[521]	valid_0's auc: 0.740721	valid_0's binary_logloss: 0.369311
[522]	valid_0's auc: 0.74072	valid_0's binary_logloss: 0.36931
[523]	valid_0's auc: 0.740719	valid_0's binary_logloss: 0.369311
[524]	valid_0's auc: 0.740724	valid_0's binary_logloss: 0.369309
[525]	valid_0's auc: 0.740736	valid_0's binary_logloss: 0.3693
[526]	valid_0's auc: 0.740726	valid_0's binary_logloss: 0.369303
[527]	valid_0's auc: 0.740724	valid_0's binary_logloss: 0.369304
[528]	valid_0's auc: 0.740724	

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.04, max_depth=20,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=1600, n_jobs=-1, num_leaves=63, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

## Results

In [64]:
# Refit on the full training frame with a fixed number of trees and score the test set.
# NOTE(review): n_estimators=492 is hard-coded and presumably comes from an
# earlier validation run; the validation log shown above is still running at
# round 528 - confirm the value.
# NOTE(review): no eval_set is passed here, so eval_metric / verbose
# presumably have no effect in this call.
clf = LGBMClassifier(n_estimators=492, learning_rate=0.04, num_leaves=63, max_depth = 20)
clf.fit(
    data_train[feature_columns], data_train[target_column].values.ravel(),
    eval_metric='auc',
    verbose=True,
)

# One row per test sample, one column of probabilities per class.
predictions = clf.predict_proba(data_test[feature_columns])

In [65]:
# Second predict_proba column - the values that go into the submission.
predictions[:, 1]

array([0.04484322, 0.07797082, 0.10175929, ..., 0.20971501, 0.32055829,
       0.05838681])

In [66]:
# Submission table: the positional row number becomes an explicit 'index'
# column next to the predicted 'target' probability.
prediction_df = (
    pd.DataFrame({'target': predictions[:, 1]})
    .reset_index()
)

In [32]:
# Write the submission file; the frame's own index is omitted because the
# row number is already stored in the 'index' column.
prediction_df.to_csv('second_baseline.csv', index=False)