In [5]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# project files stored on Drive can be accessed from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
%cd drive/MyDrive/Courses/GBC_MLOps/project_example/notebooks/

/content/drive/MyDrive/Courses/GBC_MLOps/project_example/notebooks


### 1. Импорт необходимых библиотек

In [1]:
!pip install tqdm
!pip install fasttext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[K     |████████████████████████████████| 68 kB 3.2 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.10.0-py3-none-any.whl (213 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3165900 sha256=248255fd139d2028c623af7a9f38fd6162af195183493e6426d80447eb1c26dc
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a6597a29c8f4f19e38f9c02a345bab9b
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.10.0


In [2]:
import fasttext
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import classification_report, f1_score

### 2. Дополнительные необходимые функции

In [3]:
def classification_rep_excel(valid_file, model, class_rep_excel_file_path,
                             label_prefix="__label__"):
    """
    Build a per-class classification report for a fastText model and save it to Excel.

    Every non-blank line of ``valid_file`` is split at its first space: the
    first token is treated as the true label and the remainder of the line is
    passed to ``model.predict``. The first predicted label is compared with the
    true one. ``label_prefix`` is removed from both labels before the report
    is computed with ``sklearn.metrics.classification_report``.

    :param valid_file: path to the UTF-8 text file with the evaluation dataset,
        one ``<label> <text>`` sample per line
    :param model: trained fastText model; ``model.predict(text)`` must return a
        pair whose first element is a sequence of predicted labels
    :param class_rep_excel_file_path: path of the Excel file the report is written to
    :param label_prefix: prefix removed from the start of true and predicted
        labels when present; the default ``"__label__"`` is nine characters
        long, matching the slice that used to be hard-coded here
    :return: tuple ``(report, report_df)`` where ``report`` is the dict produced
        by ``classification_report(..., output_dict=True)`` and ``report_df`` is
        the same report as a transposed ``pandas.DataFrame``
    """

    def _strip_prefix(label):
        # Only cut the prefix when it is really there, so a label that lacks
        # it is kept intact instead of losing its first characters.
        if label.startswith(label_prefix):
            return label[len(label_prefix):]
        return label

    # The context manager guarantees the file is closed even if reading fails.
    with open(valid_file, "r", encoding='utf-8') as txt_file:
        lines = txt_file.readlines()

    labels_true = []
    labels_pred = []

    # Iterate over the in-memory list so tqdm knows the total and shows progress.
    for raw_line in tqdm(lines):
        sample = raw_line.strip()
        if not sample:
            # Blank lines carry neither a label nor a text; skip them rather
            # than scoring an empty sample against an empty true label.
            continue

        # Split once at the first space: label on the left, text on the right.
        true_label, _, text = sample.partition(' ')
        predicted_labels, _ = model.predict(text)

        labels_true.append(_strip_prefix(true_label))
        labels_pred.append(_strip_prefix(predicted_labels[0]))

    class_report = classification_report(labels_true, labels_pred, output_dict=True)
    # Transpose so that each class is a row and each metric is a column.
    class_rep_df = pd.DataFrame(class_report).transpose()
    class_rep_df.to_excel(class_rep_excel_file_path)

    return class_report, class_rep_df

### 3. Обучаем модель fasttext на подготовленном датасете

In [7]:
# Hyperparameters of the supervised fastText run, gathered in one place;
# they are forwarded unchanged to fasttext.train_supervised below.
train_params = dict(
    epoch=5,
    lr=2.78897,
    dim=71,
    minCount=1,
    wordNgrams=4,
    minn=4,
    maxn=5,
    bucket=982998
)

# Train the classifier on the prepared training dataset.
model = fasttext.train_supervised(
    input='../data/processed/train_dataset_prep.txt',
    **train_params
)

### 4. Сохранение модели

In [8]:
# Persist the trained model to the project's models directory (path is relative
# to the notebooks directory that was made current earlier in the notebook).
model.save_model("../models/model_fasttext_prep.bin")

### 5. Смотрим основные метрики на тестовом датасете

In [9]:
# Evaluate the model on the prepared test dataset; the result is unpacked into
# the three values printed below (sample count, precision, recall).
result = model.test('../data/processed/test_dataset_prep.txt')
n_samples, precision_at_1, recall_at_1 = result

print("Test sample size: {}".format(n_samples))
print("Precision at 1: {:.3f}".format(precision_at_1))
print("Recall on a test set: {:.3f}".format(recall_at_1))

Test sample size: 3925
Precision at 1: 0.866
Recall on a test set: 0.866


### 6. Генерируем отчет по обучению модели

In [11]:
# Build the per-class report on the test dataset and write it to the reports directory.
class_rep, class_rep_df = classification_rep_excel(
    valid_file='../data/processed/test_dataset_prep.txt',
    model=model,
    class_rep_excel_file_path='../reports/class_rep_model.xlsx'
)

100%|██████████| 3925/3925 [00:00<00:00, 26512.75it/s]


In [12]:
# Display the per-class report (precision, recall, f1-score, support) as a table.
class_rep_df

Unnamed: 0,precision,recall,f1-score,support
алкоголь,0.921569,0.912621,0.917073,103.0
бакалея,0.909091,0.888889,0.898876,315.0
гастрономия,0.850746,0.86692,0.858757,263.0
дети,0.816176,0.735099,0.773519,151.0
для_дома,0.834254,0.792651,0.812921,381.0
животные,0.944444,0.894737,0.918919,57.0
здоровье,0.836538,0.769912,0.801843,113.0
кафе,0.684211,0.614173,0.647303,127.0
компьютер,0.333333,0.266667,0.296296,15.0
косметика,0.757143,0.768116,0.76259,69.0
