In [27]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

import chart_studio.plotly as ply
from plotly.offline import iplot

import cufflinks
cufflinks.go_offline()

import plotly.express as px

import datetime

from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split

from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors

In [28]:
def count_weekdays(start_date, n):
    """Count how many times each weekday occurs in a run of ``n`` consecutive days.

    The run starts on ``start_date`` (inclusive).

    Parameters
    ----------
    start_date : object exposing ``weekday()`` (e.g. ``datetime.date``)
        First day of the run.
    n : int
        Number of days in the run.

    Returns
    -------
    numpy.ndarray
        Seven counts, indexed by the value returned by ``weekday()``.
    """
    first_weekday = start_date.weekday()
    full_weeks, leftover_days = divmod(n, 7)
    # Every weekday occurs once per full week; the first ``leftover_days``
    # weekdays counted from the start occur one extra time.
    return np.array([
        full_weeks + 1 if (weekday - first_weekday) % 7 < leftover_days else full_weeks
        for weekday in range(7)
    ])

def add_features(train_data, test_data):
    """Derive time features from ``due`` and one-hot encode the class columns.

    Both input frames are modified in place: feature columns are added, ``due``
    is dropped and the class columns become categoricals. The returned frames
    are new objects produced by ``pd.get_dummies``.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Must contain a string column ``due`` and the columns ``f_class``,
        ``s_class`` and ``t_class``.
        NOTE(review): the slicing below assumes ``due`` looks like
        ``'<date> <HH:MM:SS><4 trailing characters>'`` - confirm against the data.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Encoded train and test frames. Test categories are aligned to the
        categories seen in train, so both frames get the same dummy columns.
    """
    # Instant from which ``total_minutes`` is measured.
    null_time = datetime.datetime(2014, 1, 1, 0, 0)
    # Placeholder label for a missing value in each class column.
    missing_labels = {
        'f_class': 'unknown_f',
        's_class': 'unknown_s',
        't_class': 'unknown_t'
    }

    for data in [train_data, test_data]:
        # Time-of-day text: everything after the first space, minus the last 4 characters.
        data.loc[:, 'time_as_str'] = data['due'].apply(lambda x: x[x.find(' ') + 1: -4])

        data_d_s = pd.to_datetime(data['due'])
        data['time_as_dt'] = data_d_s
        data.loc[:, 'day_of_week'] = data_d_s.dt.dayofweek
        data.loc[:, 'day'] = data_d_s.dt.day
        data.loc[:, 'month'] = data_d_s.dt.month
        data.loc[:, 'year'] = data_d_s.dt.year
        # The hour comes from the already parsed timestamp; parsing the
        # time-only strings a second time is unnecessary.
        data.loc[:, 'hour'] = data_d_s.dt.hour
        data.loc[:, 'is_night'] = (data['hour'] < 6).astype('int')
        data.loc[:, 'is_weekend'] = (data['day_of_week'] > 4).astype('int')
        # Whole minutes elapsed since ``null_time``.
        data.loc[:, 'total_minutes'] = ((data_d_s - null_time).dt.total_seconds() // 60).astype('float')
        # Scaling constant kept as-is; its meaning is not documented - TODO confirm.
        data['total_minutes'] = data['total_minutes'] / (89*24)
        data.loc[:, 'time_in_sec'] = pd.to_timedelta(data['time_as_str']).dt.total_seconds()

        # ``fillna`` returns a new object, so the result has to be assigned back;
        # otherwise missing classes silently stay NaN and get no dummy column.
        for column, label in missing_labels.items():
            data[column] = data[column].fillna(label)

        data.drop(['time_as_str', 'due'], axis=1, inplace=True)

    for column in missing_labels:
        train_data[column] = train_data[column].astype('category')
        test_data[column] = test_data[column].astype('category')
        # Use the train categories for test so both frames encode to the same columns.
        test_data[column] = (
            test_data[column]
            .cat
            .set_categories(train_data[column].cat.categories)
        )

    train_data = pd.get_dummies(train_data)
    test_data = pd.get_dummies(test_data)

    return train_data, test_data
    

In [29]:
def num_orders_to_average_by_weekday(data):
    """Add ``feat_1``: orders on a row's date relative to the usual load for that weekday.

    For every calendar date, ``feat_1`` is
    (number of rows on that date) / (average number of rows per day for the
    date's weekday), where the average is taken over all days between the first
    and the last date present in ``data``, both inclusive.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain integer columns ``year``, ``month``, ``day`` and
        ``day_of_week``.

    Returns
    -------
    pandas.DataFrame
        A new frame equal to ``data`` with the extra column ``feat_1``.

    Notes
    -----
    Rows are counted with ``size()``, so a row counts even when some of its
    other columns are missing.
    """
    date_keys = ['year', 'month', 'day']
    if data.empty:
        return data.assign(feat_1=np.nan)

    # First and last date actually present. Taking the min/max of year, month
    # and day independently can yield a date that is not in the data, or an
    # invalid one such as February 31st.
    dates = pd.to_datetime(data[date_keys])
    start, end = dates.min(), dates.max()

    # How many times each weekday occurs between start and end, both inclusive.
    covered_weekdays = np.asarray(pd.date_range(start, end, freq='D').dayofweek)
    num_days = np.bincount(covered_weekdays, minlength=7)

    # Total number of rows per weekday; weekdays without rows count as 0.
    num_orders = (
        data.groupby('day_of_week').size()
        .reindex(range(7), fill_value=0)
        .to_numpy()
    )

    # Average number of rows per day for each weekday (NaN for weekdays that
    # do not occur in the covered range).
    average_num_orders = np.divide(
        num_orders, num_days,
        out=np.full(7, np.nan), where=num_days > 0,
    )

    # Number of rows on every date present in the sample.
    orders_in_day = data.groupby(date_keys).size().astype(float)
    weekday_of_date = (
        pd.to_datetime(orders_in_day.index.to_frame(index=False))
        .dt.dayofweek
        .to_numpy()
    )

    # Target ratio: rows on a given date / average rows for that weekday.
    feat = (orders_in_day / average_num_orders[weekday_of_date]).rename('feat_1')

    # Attach the per-date value to every row of that date.
    return data.join(feat, on=date_keys)

In [30]:
def num_orders_to_average_by_hour(data):
    """Add ``feat_2``: orders in a row's (date, hour) slot relative to the usual load for that hour.

    For every (year, month, day, hour) slot, ``feat_2`` is
    (number of rows in that slot) / (average number of rows per day at that
    hour), where the average is taken over all days between the first and the
    last date present in ``data``, both inclusive.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain integer columns ``year``, ``month``, ``day`` and ``hour``
        (``hour`` in 0..23).

    Returns
    -------
    pandas.DataFrame
        A new frame equal to ``data`` with the extra column ``feat_2``.

    Notes
    -----
    Rows are counted with ``size()``, so a row counts even when some of its
    other columns are missing.
    """
    date_keys = ['year', 'month', 'day']
    slot_keys = date_keys + ['hour']
    if data.empty:
        return data.assign(feat_2=np.nan)

    # Number of calendar days covered, first and last date both included.
    # The dates are assembled row by row because the min/max of year, month and
    # day taken independently need not form a date that exists in the data.
    dates = pd.to_datetime(data[date_keys])
    n = (dates.max() - dates.min()).days + 1

    # Total number of rows per hour of day. Reindexing to 0..23 keeps the array
    # addressable by hour value even when some hour has no rows.
    num_orders = (
        data.groupby('hour').size()
        .reindex(range(24), fill_value=0)
        .to_numpy()
    )

    # Average number of rows per day for each hour.
    average_num_orders = num_orders / n

    # Number of rows in every (date, hour) slot present in the sample.
    orders_in_hour = data.groupby(slot_keys).size().astype(float)
    hour_of_slot = orders_in_hour.index.get_level_values('hour').to_numpy().astype(int)

    # Target ratio: rows in a slot / average rows for that hour over all days.
    feat = (orders_in_hour / average_num_orders[hour_of_slot]).rename('feat_2')

    # Attach the per-slot value to every row of that slot.
    return data.join(feat, on=slot_keys)

In [31]:
def dist_to_clusters(data, n_clusters=5, random_state=1):
    """Cluster rows by coordinates and add the L1 distance to the assigned centroid.

    ``data`` is modified in place and also returned. Columns added:
    ``cluster`` (label of the assigned cluster), ``centroid_1`` / ``centroid_2``
    (first / second coordinate of that cluster's centre, i.e. its ``lon`` /
    ``lat``) and ``dist_to_centroid``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``lon`` and ``lat``.
    n_clusters : int, default 5
        Number of KMeans clusters.
    random_state : int, default 1
        Seed passed to KMeans.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the new columns.
    """
    # Fit once. The labels of the fitted model are exactly what a second
    # ``fit_predict`` with the same seed would recompute.
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=random_state).fit(data[['lon', 'lat']])
    clusters = kmeans_model.labels_
    centers = kmeans_model.cluster_centers_

    data['cluster'] = clusters
    # Look up each row's centre with array indexing instead of a row-wise apply.
    data['centroid_1'] = centers[clusters, 0]
    data['centroid_2'] = centers[clusters, 1]

    # L_1 distance
    data['dist_to_centroid'] = (
        (data['lon'] - data['centroid_1']).abs()
        + (data['lat'] - data['centroid_2']).abs()
    )
    return data

In [32]:
def kNN_features(data, train_target):
    """Add nearest-training-neighbour features computed in (lon, lat, total_minutes) space.

    The first ``len(train_target)`` rows of ``data`` are treated as the training
    rows; the neighbour search is fitted on those rows only. ``data`` is
    modified in place and also returned. Columns added:
    ``kNN_feat`` (target of the neighbour), ``neighbor`` (position of the
    neighbour among the training rows) and ``dist_to_nn``.

    NOTE(review): a later cell defines ``kNN_features(data)`` with a single
    argument; when the cells are run in order that definition replaces this one.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``lon``, ``lat`` and ``total_minutes``; training rows first.
    train_target : pandas.Series or single-column pandas.DataFrame
        Targets of the training rows, in the same order.

    Raises
    ------
    ValueError
        If ``train_target`` has more than one column.
    """
    h = train_target.shape[0]
    coords = data[['lon', 'lat', 'total_minutes']]

    # Keyword arguments: recent scikit-learn releases do not accept positional
    # constructor arguments.
    neigh = NearestNeighbors(n_neighbors=2, radius=0.01)
    neigh.fit(coords[:h])
    distances, indices = neigh.kneighbors(coords, n_neighbors=2, return_distance=True)

    # Training rows take the second hit (the first hit is expected to be the
    # row itself); the remaining rows take the first hit.
    neighbor = np.concatenate([indices[:h, 1], indices[h:, 0]])
    dist_to_nn = np.concatenate([distances[:h, 1], distances[h:, 0]])

    targets = np.asarray(train_target)
    if targets.ndim > 1:
        if targets.shape[1] != 1:
            raise ValueError('train_target must be a Series or a single-column DataFrame')
        targets = targets[:, 0]

    # Assign plain arrays so the values are placed by position and no column
    # is written onto a slice of another frame.
    data['kNN_feat'] = targets[neighbor]
    data['neighbor'] = neighbor
    data['dist_to_nn'] = dist_to_nn
    return data

In [33]:
def kNN_features(data):
    """Add the nearest other row in (lon, lat, total_minutes) space as features.

    The neighbour search is fitted on all rows of ``data``. ``data`` is modified
    in place and also returned. Columns added: ``neighbor`` (row position of the
    second-closest hit, the closest one being expected to be the row itself) and
    ``dist_to_nn`` (distance to that hit).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``lon``, ``lat`` and ``total_minutes``.
    """
    coords = data[['lon', 'lat', 'total_minutes']]

    # Keyword arguments: recent scikit-learn releases do not accept positional
    # constructor arguments.
    neigh = NearestNeighbors(n_neighbors=2, radius=0.01)
    neigh.fit(coords)
    distances, indices = neigh.kneighbors(coords, n_neighbors=2, return_distance=True)

    # Write the result arrays straight into ``data``; going through a
    # column-subset frame first adds nothing and can trigger
    # SettingWithCopyWarning.
    data['neighbor'] = indices[:, 1]
    data['dist_to_nn'] = distances[:, 1]
    return data

In [34]:
def split_by_time(data, target, p = 0.7):
    """Split rows chronologically: the earliest share ``p`` is train, the rest is test.

    ``data`` is sorted in place by the timestamp parsed from its ``due`` column;
    the helper columns used for sorting are removed again before returning.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``due`` column parseable by ``pd.to_datetime``.
    target : values assignable to a column of ``data``
        Targets for the rows of ``data``.
    p : float, default 0.7
        Share of rows placed in the train part.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, cut)`` where ``cut`` is the number
        of train rows, ``int(p * len(data))``.
    """
    cut = int(p * data.shape[0])

    # Temporary columns: the sort key, and the target so it is reordered with the rows.
    data.loc[:, 'total_time'] = pd.to_datetime(data['due'])
    data.loc[:, 'target'] = target
    data.sort_values(by = 'total_time', inplace=True)

    # Take the reordered target out of the frame, then discard the sort key.
    sorted_target = data.pop('target')
    del data['total_time']

    return data[:cut], data[cut:], sorted_target[:cut], sorted_target[cut:], cut

In [35]:
def category_encode(data, categories = ['day_of_week', 'cluster']):
    """One-hot encode the given columns (and any other object/category columns).

    The listed columns are cast to ``category`` on a copy, so the caller's frame
    is left untouched, and ``pd.get_dummies`` is applied once to the whole frame.
    Dummy columns follow the non-encoded columns, in the column order of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
    categories : list of str
        Columns to treat as categorical. The default list is never modified.

    Returns
    -------
    pandas.DataFrame
        The encoded frame; ``data`` itself when ``categories`` is empty.
    """
    if not categories:
        return data
    # ``astype`` returns a new frame, so the input keeps its original dtypes.
    as_categorical = data.astype({name: 'category' for name in categories})
    # A single call encodes every categorical column; repeating it per column is redundant.
    return pd.get_dummies(as_categorical)

In [36]:
# Load the train features, the test features and the train targets from CSV files
# (paths are relative to the notebook's working directory).
train_data = pd.read_csv('ozonmasters-ml2-2020-c1/1_data/train_data.csv')
test_data = pd.read_csv('ozonmasters-ml2-2020-c1/1_data/test_data.csv')
train_target = pd.read_csv('ozonmasters-ml2-2020-c1/1_data/train_target.csv')

In [37]:
# Working names for the loaded frames. These are aliases, not copies, so anything
# that modifies X_train / X_test in place also changes train_data / test_data.
X_train, X_test, y_train = train_data, test_data, train_target

In [38]:
# Derive the time features and one-hot encode the class columns.
# add_features edits its arguments in place (it drops 'due'), so this cell
# cannot be re-run without reloading the data first.
X_train, X_test = add_features(X_train, X_test)

In [39]:
# Stack train rows on top of test rows so the aggregate features are computed over both.
# The row labels of the two frames are kept as they are (no ignore_index).
X = pd.concat([X_train, X_test])
# Adds 'feat_1'.
X = num_orders_to_average_by_weekday(X)

In [40]:
# Adds 'feat_2'.
X = num_orders_to_average_by_hour(X)

In [41]:
# Adds 'cluster', 'centroid_1', 'centroid_2' and 'dist_to_centroid'.
X = dist_to_clusters(X)

In [42]:
# Adds 'neighbor' and 'dist_to_nn'. This calls the one-argument kNN_features;
# the two-argument definition in the earlier cell is overridden by it.
X = kNN_features(X)

In [43]:
# One-hot encode 'day_of_week' and 'cluster' (the default categories).
X = category_encode(X)

In [44]:
# Inspect the full feature set after all feature-engineering steps.
X.columns

Index(['dist', 'lat', 'lon', 'time_as_dt', 'day', 'month', 'year', 'hour',
       'is_night', 'is_weekend', 'total_minutes', 'time_in_sec',
       'f_class_business', 'f_class_econom', 'f_class_vip', 's_class_business',
       's_class_econom', 's_class_vip', 't_class_business', 't_class_econom',
       't_class_vip', 'feat_1', 'feat_2', 'centroid_1', 'centroid_2',
       'dist_to_centroid', 'neighbor', 'dist_to_nn', 'day_of_week_0',
       'day_of_week_1', 'day_of_week_2', 'day_of_week_3', 'day_of_week_4',
       'day_of_week_5', 'day_of_week_6', 'cluster_0', 'cluster_1', 'cluster_2',
       'cluster_3', 'cluster_4'],
      dtype='object')

In [45]:
# Number of training rows; used below as the positional boundary between the
# train part and the test part of the stacked frame.
p = X_train.shape[0]
p

1187461

In [46]:
from lightgbm import LGBMClassifier

In [47]:
# Model input: every engineered column except the helper and raw calendar columns listed here.
data = X.drop(
    columns=[
        'centroid_1', 'centroid_2', 'neighbor', 'hour', 'feat_1',
        'time_as_dt', 'day', 'month', 'year', 'total_minutes',
    ]
)

In [48]:
# The first p rows are the training rows, the remaining rows are the test rows.
train_data, test_data = data[:p], data[p:]
train_target = y_train

In [49]:
# Gradient-boosting classifier with fixed hyper-parameters.
# NOTE(review): no random_state is passed - confirm whether runs need to be reproducible.
# NOTE(review): max_depth=5 allows at most 2**5 = 32 leaves per tree, so num_leaves=63
# is presumably never reached - confirm the intended setting.
clf = LGBMClassifier(n_estimators=1577, learning_rate=0.05, num_leaves=63, max_depth=5)

In [50]:
# Fit on the training rows. The `verbose` argument is no longer passed:
# LightGBM 4 removed it from fit(), and without an eval_set there is nothing to log.
# NOTE(review): eval_metric presumably has no effect either without an eval_set -
# confirm, or pass a validation set.
clf.fit(
    train_data, train_target.values.ravel(),
    eval_metric='auc',
)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.05, max_depth=5,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=1577, n_jobs=-1, num_leaves=63, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [51]:
# Class probabilities for the test rows; the second column is used below as the score.
predictions = clf.predict_proba(test_data)

In [26]:
# NOTE(review): y_test is not assigned in any visible cell (split_by_time, which
# returns one, is never called); the saved output of this cell is a NameError.
test_target = y_test

NameError: name 'y_test' is not defined

In [None]:
# ROC AUC of the second predict_proba column against test_target.
# NOTE(review): needs test_target, whose defining cell fails as saved.
roc_auc_score(test_target, predictions[:,1])

In [196]:
# Same evaluation in a single cell (saved with execution count 196).
# NOTE(review): y_test is not defined in any visible cell - presumably left over
# from an earlier kernel state; confirm where it came from.
test_target = y_test
roc_auc_score(test_target, predictions[:,1])

0.7367386372628475

In [118]:
# Inspect the columns fed to the model.
# NOTE(review): the saved output (execution count 118) lists 'hour' and has no
# 'feat_2', 'dist_to_nn' or 'cluster_*' columns, so it does not match the
# column-selection cell above - presumably it comes from an earlier run.
train_data.columns

Index(['dist', 'lat', 'lon', 'hour', 'is_night', 'is_weekend', 'time_in_sec',
       'f_class_business', 'f_class_econom', 'f_class_vip', 's_class_business',
       's_class_econom', 's_class_vip', 't_class_business', 't_class_econom',
       't_class_vip', 'dist_to_centroid', 'day_of_week_0', 'day_of_week_1',
       'day_of_week_2', 'day_of_week_3', 'day_of_week_4', 'day_of_week_5',
       'day_of_week_6'],
      dtype='object')

In [52]:
# Submission file: the row position as 'index' plus the second-class probability as 'target'.
prediction_df = pd.DataFrame({'target': predictions[:, 1]}).reset_index()
prediction_df.to_csv('baseline_9.csv', index=False)

In [123]:
# Re-join the train and test parts (features and labels) for the random split below.
# NOTE(review): y_test is not defined in any visible cell; the rows of data2 and y
# line up only if y_test matches test_data row for row - confirm.
data2 = pd.concat([train_data, test_data])
y = pd.concat([y_train, y_test])

In [125]:
# Random 70/30 split. The seed is fixed so that re-running the notebook gives the
# same partition (1 matches the seed used for KMeans in dist_to_clusters).
X_tr, X_te, y_tr, y_te = train_test_split(data2, y, test_size=0.3, shuffle=True, random_state=1)

In [126]:
# Refit the same classifier on the random train split. The `verbose` argument is
# no longer passed: LightGBM 4 removed it from fit(), and without an eval_set
# there is nothing to log.
clf.fit(
    X_tr, y_tr.values.ravel(),
    eval_metric='auc',
)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.05, max_depth=5,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=1577, n_jobs=-1, num_leaves=63, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [127]:
# Score the random hold-out part with ROC AUC on the second predict_proba column.
# NOTE(review): the saved output is about 0.50, i.e. chance level - worth checking
# that features and labels are row-aligned in the re-joined data.
predictions = clf.predict_proba(X_te)
roc_auc_score(y_te, predictions[:,1])

0.5020841129263518

In [180]:
# Which months occur among the test rows (the rows after the first p).
X.iloc[p:]['month'].unique()

array([2, 3])