In [4]:
import numpy as np
import pandas as pd
from datetime import datetime
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.core.metrics import make_scorer
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Seed NumPy's global random generator with a fixed value so NumPy-based draws repeat across runs.
np.random.seed(seed=10)

In [6]:
# Load the train / validation / test splits from CSV, in that order.
train_data, val_data, test_data = [
    TabularDataset(csv_path)
    for csv_path in (
        "./GS-homework/dataset/train.csv",
        "./GS-homework/dataset/validation.csv",
        "./GS-homework/dataset/test.csv",
    )
]
train_data.shape, val_data.shape, test_data.shape

((296727, 7), (5000, 7), (5000, 6))

In [7]:
# Derive a day-resolution `date` column from the raw `pickup_date` strings and then
# drop the raw column. The same two steps are applied to every split, so they live in
# one loop instead of three copy-pasted statement pairs.
for frame in (train_data, val_data, test_data):
    # Only the first 10 characters ("YYYY-MM-DD") are parsed; anything after them is ignored.
    # pd.to_datetime parses the whole column at once instead of calling strptime per row.
    frame['date'] = pd.to_datetime(frame['pickup_date'].str[:10], format="%Y-%m-%d")
    frame.drop(columns=['pickup_date'], inplace=True)

In [8]:
# Keep only training rows dated 2022-01-01 or later.
# Alternative (disabled): restrict to 2022-06-01 <= date < 2022-08-15.
cutoff = datetime.strptime('2022-01-01', "%Y-%m-%d")
is_recent = train_data['date'] >= cutoff
train_data = train_data[is_recent]

train_data.shape

(79046, 7)

In [9]:
# Name of the target column: passed to TabularPredictor as `label` and split off from the features before scoring.
label = "rate"

In [10]:
def loss(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE) of `y_pred` vs `y_true`, in percent.

    The error is computed element by element in positional order as
    ``mean(|1 - y_pred / y_true|) * 100``.

    Both inputs are converted to NumPy arrays first. Dividing two pandas Series
    directly would align them on their index labels, so two Series whose indexes
    differ would produce NaNs that the mean then silently skips.

    Parameters
    ----------
    y_true : array-like of numbers
        Ground-truth values. A zero entry makes the corresponding ratio infinite.
    y_pred : array-like of numbers
        Predicted values, in the same order and of the same length as `y_true`.

    Returns
    -------
    numpy.float64
        MAPE in percent (0 means a perfect fit).
    """
    true_values = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(1 - predicted / true_values)) * 100

# Wrap the MAPE function as an AutoGluon scorer: the optimum is 0 and lower values are better.
# NOTE(review): this scorer is not passed to any predictor in the cells visible here
# (they are built with eval_metric='r2') - confirm whether that is intended.
custom_loss = make_scorer(
    name='mape',
    score_func=loss,
    optimum=0,
    greater_is_better=False,
)

In [11]:
# Configure the first predictor (target column = `label`, model selection by R^2) ...
predictor = TabularPredictor(label=label, eval_metric='r2')
# ... then train it on the filtered training split. `fit` returns the predictor,
# so re-binding the name keeps the cell free of a trailing displayed value.
predictor = predictor.fit(
    train_data,
    presets='best_quality',
    num_bag_folds=5,
    num_gpus=1,
)

No path specified. Models will be saved in: "AutogluonModels/ag-20231115_090843/"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=5, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231115_090843/"
AutoGluon Version:  0.8.2
Python Version:     3.10.8
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Sun Oct 22 17:17:17 UTC 2023
Disk Space Avail:   9223372004.00 GB / 9223372036.85 GB (100.0%)
Train Data Rows:    79046
Train Data Columns: 6
Label Column: rate
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (50.1753, 1.3101, 5.02853, 2.72448)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one o

In [12]:
# Separate features and target for scoring on the training and validation splits.
X_train, y_train = train_data.drop(columns=[label]), train_data[label]
X_val, y_val = val_data.drop(columns=[label]), val_data[label]

In [13]:
# Show which models were trained and how each one scores on the validation split.
predictor.leaderboard(data=val_data)

                     model  score_test  score_val  pred_time_test  pred_time_val     fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L3    0.931418   0.943824       32.102076     163.377951  1905.581378                 0.006753                0.001671           1.377511            3       True         22
1     ExtraTreesMSE_BAG_L2    0.930660   0.943427       29.335546     157.718831  1430.445319                 2.031018                3.324810          26.332818            2       True         17
2   NeuralNetFastAI_BAG_L2    0.930143   0.940194       27.835766     155.753033  1662.059415                 0.531237                1.359011         257.946914            2       True         18
3      WeightedEnsemble_L2    0.929836   0.940811       20.378273     132.846777   552.558825                 0.008303                0.001565           1.542549            2       True         12
4   RandomFores

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,0.931418,0.943824,32.102076,163.377951,1905.581378,0.006753,0.001671,1.377511,3,True,22
1,ExtraTreesMSE_BAG_L2,0.93066,0.943427,29.335546,157.718831,1430.445319,2.031018,3.32481,26.332818,2,True,17
2,NeuralNetFastAI_BAG_L2,0.930143,0.940194,27.835766,155.753033,1662.059415,0.531237,1.359011,257.946914,2,True,18
3,WeightedEnsemble_L2,0.929836,0.940811,20.378273,132.846777,552.558825,0.008303,0.001565,1.542549,2,True,12
4,RandomForestMSE_BAG_L2,0.9279,0.94222,29.258909,158.34519,1607.746285,1.95438,3.951168,203.633784,2,True,15
5,LightGBMLarge_BAG_L1,0.927132,0.931701,3.270555,18.433616,38.271194,3.270555,18.433616,38.271194,1,True,11
6,LightGBM_BAG_L1,0.926689,0.935995,2.754091,23.333495,49.694715,2.754091,23.333495,49.694715,1,True,4
7,XGBoost_BAG_L2,0.926117,0.940213,27.578687,154.741291,1416.290351,0.274158,0.347269,12.17785,2,True,19
8,LightGBMXT_BAG_L1,0.924781,0.933085,7.765994,90.051283,125.878833,7.765994,90.051283,125.878833,1,True,3
9,LightGBMLarge_BAG_L2,0.924349,0.936799,27.637659,155.565322,1417.319451,0.333131,1.1713,13.20695,2,True,21


In [14]:
# Predict with the first predictor on the training features, then on the validation features.
pred_train, pred_val = [predictor.predict(features) for features in (X_train, X_val)]

In [15]:
# MAPE (in percent, rounded to 3 decimals) of the first predictor on each split.
mape_train = round(loss(y_train, pred_train), 3)
mape_val = round(loss(y_val, pred_val), 3)
print(f"Качество модели на train: {mape_train}, на validation: {mape_val}")

Качество модели на train: 3.931, на validation: 8.491


Между двумя датасетами видно проседание по метрике MAPE: 3,93% на train против 8,49% на validation, то есть примерно на 4,6 п.п. Можно предположить, что чем дальше будут отстоять друг от друга train и test датасеты, тем выше будет становиться ошибка. Поэтому обучим второй predictor, для которого тренировочными данными будут объединённые train+validation датасеты.

In [16]:
# Stack the training and validation rows into one training set for the second predictor.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it, concat keeps the
# row labels of both frames as they are, so labels present in both would be duplicated.
# NOTE(review): AutoGluon is believed to reject training frames with duplicated index
# labels - confirm against the installed version.
train_data2 = pd.concat([train_data, val_data], ignore_index=True)

train_data2.shape

(84046, 7)

In [17]:
# Second predictor: same configuration as the first one, trained on train + validation rows.
predictor2 = TabularPredictor(label=label, eval_metric='r2')
# `fit` returns the predictor, so re-binding the name keeps the cell free of a displayed value.
predictor2 = predictor2.fit(
    train_data2,
    presets='best_quality',
    num_bag_folds=5,
    num_gpus=1,
)

No path specified. Models will be saved in: "AutogluonModels/ag-20231115_100031/"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=5, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231115_100031/"
AutoGluon Version:  0.8.2
Python Version:     3.10.8
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Sun Oct 22 17:17:17 UTC 2023
Disk Space Avail:   9223372001.25 GB / 9223372036.85 GB (100.0%)
Train Data Rows:    84046
Train Data Columns: 6
Label Column: rate
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (50.1753, 1.3101, 5.07693, 2.77141)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one o

In [18]:
# Score the second predictor on the combined train + validation rows, i.e. the rows it was fitted on.
X_train2, y_train2 = train_data2.drop(columns=[label]), train_data2[label]
pred_train2 = predictor2.predict(X_train2)

mape_train2 = round(loss(y_train2, pred_train2), 3)
print(f"Качество модели на объединенном train+val датасете: {mape_train2}")

Качество модели на объединенном train+val датасете: 3.886


In [None]:
# Predict on the test split with the predictor that was trained on train + validation.
ag_train_val_predict = predictor2.predict(data=test_data)

In [None]:
# Build the frame to hand in for checking: the test CSV as read from disk plus a prediction column.
test_initial = pd.read_csv("./GS-homework/dataset/test.csv").assign(
    ag_train_val_predict=ag_train_val_predict
)

test_initial

In [None]:
# Write the test rows with their predictions to CSV, without the index column.
test_initial.to_csv(path_or_buf='./autogluon_test_predict_2.csv', index=False)