## Загрузим нужные библиотеки

In [1]:
# Standard python libraries
import os
import time

# DS libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import numpy as np

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.report.report_deco import ReportDeco

# ML libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score

%matplotlib inline

In [9]:
# Load the training table from the working directory.
df = pd.read_csv("train.csv")

# Remember which columns were parsed as `object` dtype; these names are
# used further down to drop the raw non-numeric columns.
column_dtypes = df.dtypes
mass_object = column_dtypes[column_dtypes == "object"].index.values

## Трансформируем данные

In [10]:
# Replace every missing value with 0 (this applies to all columns,
# including the non-numeric ones).
df = df.fillna(value=0)

In [11]:
# Feature engineering on the training frame. The raw object-typed columns
# are dropped at the end, so only their numeric encodings remain.

# `DataFrame.astype` returns a new frame instead of modifying `df` in place,
# so the result has to be assigned back -- otherwise the cast is silently
# discarded. The cast matters because the preceding `fillna(0)` turns a
# missing `month_id` into the integer 0, which has no `.split` method.
df = df.astype({'month_id': str})
# Keep the part of `month_id` before the first '/' as an integer.
df['month'] = df['month_id'].apply(lambda x: int(x.split('/')[0]))

# -1 when `promo` is the literal '-', 1 for any other value.
df['promo_bin'] = df['promo'].apply(lambda x: -1 if str(x)=='-' else 1)

# Ordinal code of the `ABC` letter: 'A' -> 0, 'B' -> 1, ...
df['ABC_num'] = df['ABC'].apply(lambda x: ord(x)-ord('A'))

# Drop every column that was object-typed when the training CSV was loaded.
df = df.drop(mass_object, axis = 1)

In [12]:
# Print the column list with non-null counts and dtypes of the transformed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 52 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   id                         200000 non-null  int64  
 1   age_indicator              200000 non-null  float64
 2   student_id                 200000 non-null  int64  
 3   program_id                 200000 non-null  int64  
 4   spent_time_total           200000 non-null  float64
 5   spent_time_to_complete_hw  200000 non-null  float64
 6   completed_hw               200000 non-null  float64
 7   failed_hw                  200000 non-null  float64
 8   reworked_hw                200000 non-null  float64
 9   interacted_hw              200000 non-null  float64
 10  avg_hw_mark                200000 non-null  float64
 11  test_with_good_mark        200000 non-null  float64
 12  test_with_great_mark       200000 non-null  float64
 13  webinars                   20

## Обучим модель

In [13]:
# --- Reproducibility -------------------------------------------------------
# Seed shared by NumPy, the train/hold-out split and the AutoML reader.
RANDOM_STATE = 42

# --- Compute budget --------------------------------------------------------
# Thread count for PyTorch, the AutoML CPU limit and the reader's n_jobs.
N_THREADS = 4
# Time budget handed to the AutoML preset (reported in seconds by its log).
TIMEOUT = 500

# --- Validation scheme -----------------------------------------------------
# Number of cross-validation folds passed to the AutoML reader.
N_FOLDS = 5
# Fraction of rows held out by train_test_split.
TEST_SIZE = 0.2

# --- Data schema -----------------------------------------------------------
# Name of the label column.
TARGET_NAME = 'target'

In [14]:
# Cap the number of CPU threads PyTorch may use.
torch.set_num_threads(N_THREADS)
# Seed NumPy's global random generator so repeated runs draw the same numbers.
np.random.seed(RANDOM_STATE)

In [15]:
# Split the prepared frame into a training part and a hold-out part,
# stratified by the target column.
split_options = dict(
    test_size=TEST_SIZE,
    stratify=df[TARGET_NAME],
    random_state=RANDOM_STATE,
)
tr_data, te_data = train_test_split(df, **split_options)

print(f'Data splitted. Parts sizes: tr_data = {tr_data.shape}, te_data = {te_data.shape}')

# Show the first rows of the training part as the cell output.
tr_data.head()

Data splitted. Parts sizes: tr_data = (160000, 52), te_data = (40000, 52)


Unnamed: 0,id,age_indicator,student_id,program_id,spent_time_total,spent_time_to_complete_hw,completed_hw,failed_hw,reworked_hw,interacted_hw,...,support_feedback_avg,feedback_avg_d1,feedback_avg_d2,feedback_avg_d3,feedback_avg_d4,feedback_avg_d5,target,month,promo_bin,ABC_num
185667,142763,3.0,6933216,549,40.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,5.0,0.0,0,9,-1,3
143045,168772,14.0,6770745,643,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,10,-1,3
85400,231564,26.0,1993664,1330,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.2,0.0,0.0,0.0,0.0,0,6,1,3
147465,201565,25.0,7351541,1374,67.0,0.0,3.0,0.0,0.0,5.0,...,0.0,0.0,4.715,0.0,0.0,0.0,0,11,-1,1
93950,140238,13.0,6336587,229,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5,12,-1,0


In [16]:
# Multiclass classification; the only role given explicitly is the target.
task = Task('multiclass')
roles = dict(target=TARGET_NAME)

# Preset limited by the time budget and CPU count from the config cell; the
# reader gets the same thread count, the fold count and the shared seed.
automl = TabularAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    reader_params=dict(n_jobs=N_THREADS, cv=N_FOLDS, random_state=RANDOM_STATE),
)

In [17]:
%%time

# Fit the preset on the training part with the declared roles; the returned
# predictions are kept as `oof_pred` (out-of-fold, going by the name -- TODO confirm).
oof_pred = automl.fit_predict(tr_data, roles = roles, verbose = 1)

[15:14:32] Stdout logging level is INFO.
[15:14:32] Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
[15:14:32] Task: multiclass

[15:14:32] Start automl preset with listed constraints:
[15:14:32] - time: 500.00 seconds
[15:14:32] - CPU: 4 cores
[15:14:32] - memory: 16 GB

[15:14:32] [1mTrain data shape: (160000, 52)[0m

[15:14:45] Layer [1m1[0m train process start. Time left 486.48 secs
[15:14:51] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[15:17:25] Time limit exceeded after calculating fold 0

[15:17:25] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m-0.26530964959055797[0m
[15:17:25] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[15:17:25] Time left 326.79 secs

[15:20:56] [1mSelector_LightGBM[0m fitting and predicting completed
[15:21:03] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[15:24:40] Time limit exceeded after calculating fold 0

[15:24:40] Fitting [1mLv

In [18]:
%%time

# Score the hold-out part with the fitted model and report the result.
te_pred = automl.predict(te_data)
holdout_report = f'Prediction for te_data:\n{te_pred}\nShape = {te_pred.shape}'
print(holdout_report)

Prediction for te_data:
array([[9.90185022e-01, 4.11859294e-03, 3.52807867e-04, 3.10509786e-05,
        5.13206189e-03, 1.80394331e-04],
       [7.52765462e-02, 9.18743968e-01, 1.15737098e-03, 3.09123634e-03,
        7.41992029e-04, 9.88815096e-04],
       [9.88224268e-01, 1.14649311e-02, 7.87004174e-05, 7.35549111e-05,
        3.48902176e-05, 1.23602164e-04],
       ...,
       [9.99307215e-01, 5.16731292e-04, 4.75311572e-05, 2.12256036e-05,
        5.73034704e-05, 5.00082206e-05],
       [9.99278188e-01, 1.03522143e-04, 2.84976704e-06, 1.33024341e-05,
        6.14673918e-05, 5.40601846e-04],
       [9.78815913e-01, 1.92782674e-02, 1.08199274e-04, 4.52221793e-05,
        1.67225045e-03, 8.00943235e-05]], dtype=float32)
Shape = (40000, 6)
Wall time: 41.5 s


## Оценим точность

In [19]:
# True labels of the hold-out part.
y_test = te_data[TARGET_NAME].values
# Predicted class = column with the highest score in each row.
# NOTE(review): assumes the column index equals the class label -- TODO confirm.
pred = te_pred.data.argmax(axis=1)

# Blend of macro-averaged recall (weight 0.2) and precision (weight 0.8).
macro_recall = recall_score(y_test, pred, average='macro')
macro_precision = precision_score(y_test, pred, average='macro')
0.2 * macro_recall + 0.8 * macro_precision

0.8448196283849545

In [20]:
# Macro-averaged recall on the hold-out part.
recall_score(y_true=y_test, y_pred=pred, average='macro')

0.6371593363051463

In [21]:
# Macro-averaged precision on the hold-out part.
precision_score(y_true=y_test, y_pred=pred, average='macro')

0.8967347014049064

## Сделаем прогноз на тестовых данных

In [22]:
# Load the table to be scored for the submission from the working directory.
df_test = pd.read_csv("case_files.csv")

In [23]:
# Apply the same preparation to the scoring frame as was applied to the
# training frame: fill gaps, derive numeric features, drop raw object columns.

# Replace every missing value with 0.
df_test = df_test.fillna(0)

# `DataFrame.astype` returns a new frame instead of modifying `df_test` in
# place, so the result has to be assigned back -- otherwise the cast is
# silently discarded. The cast matters because `fillna(0)` turns a missing
# `month_id` into the integer 0, which has no `.split` method.
df_test = df_test.astype({'month_id': str})
# Keep the part of `month_id` before the first '/' as an integer.
df_test['month'] = df_test['month_id'].apply(lambda x: int(x.split('/')[0]))

# -1 when `promo` is the literal '-', 1 for any other value.
df_test['promo_bin'] = df_test['promo'].apply(lambda x: -1 if str(x)=='-' else 1)

# Ordinal code of the `ABC` letter: 'A' -> 0, 'B' -> 1, ...
df_test['ABC_num'] = df_test['ABC'].apply(lambda x: ord(x)-ord('A'))

# Drop the columns that were object-typed in the training CSV.
df_test = df_test.drop(mass_object, axis = 1)

In [24]:
%%time

# Per-class scores for the scoring frame, as a plain array.
test_proba = automl.predict(df_test).data
# Predicted class = column with the highest score in each row.
test_pred = test_proba.argmax(axis=1)
test_pred

Wall time: 1min 28s


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [25]:
# Store the predicted class next to the features.
df_test['target'] = test_pred

# Show how many rows fall into each predicted class.
predicted_class_counts = df_test[['target']].value_counts()
predicted_class_counts

target
0         78070
1          3693
4          1003
5           935
3           897
2           399
dtype: int64

In [26]:
# Write the id / predicted-class pairs to the submission file, with a header
# row and without the index column.
submission = df_test[['id', 'target']]
submission.to_csv(r'out2.csv', index=False, header=True)