LightAutoML (LAMA) — фреймворк для автоматического машинного обучения от Сбера

Основная ссылка: https://developers.sber.ru/portal/products/lightautoml

Github: https://github.com/sb-ai-lab/LightAutoML

LAMA поможет анализировать данные Python-разработчикам, проверит данные витрин и облегчит работу Data-инженерам, ускорит проверку гипотез для исследователей и повысит качество готовых решений Data Scientist.


In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Standard python libraries
import os
import requests

# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.report.report_deco import ReportDeco, ReportDecoUtilized
from lightautoml.addons.tabular_interpretation import SSWARM

'nlp' extra dependecy package 'gensim' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'nltk' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'transformers' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'gensim' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'nltk' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'transformers' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Path of the CSV file with the training data; it is passed to pd.read_csv below,
# so a bare file name is resolved relative to the current working directory.
DATASET_NAME = 'jobs_train.csv'

In [4]:
# Experiment configuration: every value a reader may want to tune lives in this cell.
N_THREADS = 4            # CPU budget: passed to the AutoML preset as cpu_limit and to the reader as n_jobs
N_FOLDS = 5              # number of cross-validation folds (reader's 'cv' parameter)
RANDOM_STATE = 42        # one seed shared by the train/test split and the AutoML reader
TEST_SIZE = 0.2          # fraction of rows held out by train_test_split
TIMEOUT = 300            # time budget for AutoML training, in seconds
TARGET_NAME = 'target'   # name of the label column in the dataset

# Reproducibility: seed NumPy's global RNG, which is otherwise left unseeded and
# may be used by library code that is not given an explicit random_state.
np.random.seed(RANDOM_STATE)
# Torch's thread pool is not governed by cpu_limit, so cap it explicitly
# to keep the whole run within the N_THREADS budget.
torch.set_num_threads(N_THREADS)

In [5]:
# Read the whole dataset into a DataFrame, then preview its first five rows.
data = pd.read_csv(filepath_or_buffer=DATASET_NAME)
data.head(5)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,21.0,,,1.0,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15.0,99.0,Pvt Ltd,5.0,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5.0,,,0.0,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,0.0,,Pvt Ltd,0.0,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,21.0,99.0,Funded Startup,4.0,8,0.0


In [6]:
# Hold out TEST_SIZE of the rows for the final check. Stratifying on the target column
# keeps the class proportions equal in both parts; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(
    data, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=data[TARGET_NAME]
)
print(f'Data is splitted. Parts sizes: train_data = {train_data.shape}, test_data = {test_data.shape}')

# Preview the training part (the original row index is preserved by the split).
train_data.head()

Data is splitted. Parts sizes: train_data = (15326, 14), test_data = (3832, 14)


Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
17855,6135,city_21,0.624,,Has relevent experience,no_enrollment,Graduate,STEM,10.0,99.0,Pvt Ltd,1.0,90,0.0
17664,6455,city_103,0.92,Male,No relevent experience,no_enrollment,Primary School,,5.0,,,0.0,15,0.0
13404,5856,city_50,0.896,Male,Has relevent experience,no_enrollment,Graduate,STEM,12.0,4999.0,NGO,5.0,36,0.0
13366,27191,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,5.0,500.0,Funded Startup,1.0,53,0.0
15670,16751,city_67,0.855,Female,Has relevent experience,Full time course,Graduate,STEM,5.0,,,1.0,158,1.0


In [7]:
# Declare the problem type for LightAutoML: binary classification.
task = Task(name='binary')

In [8]:
# Column roles handed to the AutoML reader:
#   target - the column to predict;
#   drop   - columns excluded from modelling (enrollee_id is presumably a row identifier).
roles = dict(target=TARGET_NAME, drop=['enrollee_id'])

In [9]:
# Tabular AutoML preset limited to TIMEOUT seconds and N_THREADS CPU cores.
# The reader gets the same parallelism, the fold count and the shared seed.
automl = TabularAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    reader_params=dict(n_jobs=N_THREADS, cv=N_FOLDS, random_state=RANDOM_STATE),
)

In [10]:
# Train the AutoML pipeline on the training part and keep the predictions it returns
# for those same rows; verbose=1 prints the INFO-level training log.
out_of_fold_predictions = automl.fit_predict(
    train_data=train_data,
    roles=roles,
    verbose=1,
)

[16:34:21] Stdout logging level is INFO.
[16:34:21] Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
[16:34:21] Task: binary

[16:34:21] Start automl preset with listed constraints:
[16:34:21] - time: 300.00 seconds
[16:34:21] - CPU: 4 cores
[16:34:21] - memory: 16 GB

[16:34:21] [1mTrain data shape: (15326, 14)[0m

[16:34:29] Layer [1m1[0m train process start. Time left 291.84 secs
[16:34:31] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[16:34:37] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.7917995237840509[0m
[16:34:37] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[16:34:37] Time left 284.41 secs

[16:34:38] [1mSelector_LightGBM[0m fitting and predicting completed
[16:34:39] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[16:34:50] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m0.8034272792245175[0m
[16:34:50] [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m

In [11]:
# Display the predictions returned by fit_predict for the training rows.
out_of_fold_predictions


array([[0.5914293 ],
       [0.02775727],
       [0.10596498],
       ...,
       [0.05211184],
       [0.53407186],
       [0.49258888]], dtype=float32)

In [12]:
# Check which container class fit_predict returned (a LightAutoML dataset, not a bare array).
type(out_of_fold_predictions)

lightautoml.dataset.np_pd_dataset.NumpyDataset

In [13]:
# Convert the training-row predictions to their pandas representation via to_pandas().
forecast = out_of_fold_predictions.to_pandas()

In [14]:
# Score the held-out part with the fitted pipeline, then print the predictions and their shape.
test_predictions = automl.predict(data=test_data)
print(f'Prediction for test_data:\n{test_predictions}\nShape = {test_predictions.shape}')

Prediction for test_data:
array([[0.06230244],
       [0.6367193 ],
       [0.05620067],
       ...,
       [0.15038466],
       [0.1004193 ],
       [0.06352545]], dtype=float32)
Shape = (3832, 1)


In [15]:
# Convert the test-set predictions to their pandas representation via to_pandas().
forecast2 = test_predictions.to_pandas()

In [16]:
# Show the .data attribute of the converted test predictions (rendered below as a table).
forecast2.data

Unnamed: 0,WeightedBlend_0
0,0.062302
1,0.636719
2,0.056201
3,0.118127
4,0.547522
...,...
3827,0.603223
3828,0.156477
3829,0.150385
3830,0.100419
