In [2]:
import json
import os

import fasttext as ft
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, classification_report
from tqdm import tqdm

from ascii_preprocessing import (
    prepare_fasttext_input,
    preprocess_set_to_ascii
)

In [3]:
# Raw enum member list, one member per line. Each line's position in this
# string is used as that member's numeric id below.
ENUM_DATA = '''  TGLANG_LANGUAGE_OTHER,
  TGLANG_LANGUAGE_1S_ENTERPRISE,
  TGLANG_LANGUAGE_ABAP,
  TGLANG_LANGUAGE_ACTIONSCRIPT,
  TGLANG_LANGUAGE_ADA,
  TGLANG_LANGUAGE_APACHE_GROOVY,
  TGLANG_LANGUAGE_APEX,
  TGLANG_LANGUAGE_APPLESCRIPT,
  TGLANG_LANGUAGE_ASP,
  TGLANG_LANGUAGE_ASSEMBLY,
  TGLANG_LANGUAGE_AUTOHOTKEY,
  TGLANG_LANGUAGE_AWK,
  TGLANG_LANGUAGE_BASIC,
  TGLANG_LANGUAGE_BATCH,
  TGLANG_LANGUAGE_BISON,
  TGLANG_LANGUAGE_C,
  TGLANG_LANGUAGE_CLOJURE,
  TGLANG_LANGUAGE_CMAKE,
  TGLANG_LANGUAGE_COBOL,
  TGLANG_LANGUAGE_COFFESCRIPT,
  TGLANG_LANGUAGE_COMMON_LISP,
  TGLANG_LANGUAGE_CPLUSPLUS,
  TGLANG_LANGUAGE_CRYSTAL,
  TGLANG_LANGUAGE_CSHARP,
  TGLANG_LANGUAGE_CSS,
  TGLANG_LANGUAGE_CSV,
  TGLANG_LANGUAGE_D,
  TGLANG_LANGUAGE_DART,
  TGLANG_LANGUAGE_DELPHI,
  TGLANG_LANGUAGE_DOCKER,
  TGLANG_LANGUAGE_ELIXIR,
  TGLANG_LANGUAGE_ELM,
  TGLANG_LANGUAGE_ERLANG,
  TGLANG_LANGUAGE_FIFT,
  TGLANG_LANGUAGE_FORTH,
  TGLANG_LANGUAGE_FORTRAN,
  TGLANG_LANGUAGE_FSHARP,
  TGLANG_LANGUAGE_FUNC,
  TGLANG_LANGUAGE_GAMS,
  TGLANG_LANGUAGE_GO,
  TGLANG_LANGUAGE_GRADLE,
  TGLANG_LANGUAGE_GRAPHQL,
  TGLANG_LANGUAGE_HACK,
  TGLANG_LANGUAGE_HASKELL,
  TGLANG_LANGUAGE_HTML,
  TGLANG_LANGUAGE_ICON,
  TGLANG_LANGUAGE_IDL,
  TGLANG_LANGUAGE_INI,
  TGLANG_LANGUAGE_JAVA,
  TGLANG_LANGUAGE_JAVASCRIPT,
  TGLANG_LANGUAGE_JSON,
  TGLANG_LANGUAGE_JULIA,
  TGLANG_LANGUAGE_KEYMAN,
  TGLANG_LANGUAGE_KOTLIN,
  TGLANG_LANGUAGE_LATEX,
  TGLANG_LANGUAGE_LISP,
  TGLANG_LANGUAGE_LOGO,
  TGLANG_LANGUAGE_LUA,
  TGLANG_LANGUAGE_MAKEFILE,
  TGLANG_LANGUAGE_MARKDOWN,
  TGLANG_LANGUAGE_MATLAB,
  TGLANG_LANGUAGE_NGINX,
  TGLANG_LANGUAGE_NIM,
  TGLANG_LANGUAGE_OBJECTIVE_C,
  TGLANG_LANGUAGE_OCAML,
  TGLANG_LANGUAGE_OPENEDGE_ABL,
  TGLANG_LANGUAGE_PASCAL,
  TGLANG_LANGUAGE_PERL,
  TGLANG_LANGUAGE_PHP,
  TGLANG_LANGUAGE_PL_SQL,
  TGLANG_LANGUAGE_POWERSHELL,
  TGLANG_LANGUAGE_PROLOG,
  TGLANG_LANGUAGE_PROTOBUF,
  TGLANG_LANGUAGE_PYTHON,
  TGLANG_LANGUAGE_QML,
  TGLANG_LANGUAGE_R,
  TGLANG_LANGUAGE_RAKU,
  TGLANG_LANGUAGE_REGEX,
  TGLANG_LANGUAGE_RUBY,
  TGLANG_LANGUAGE_RUST,
  TGLANG_LANGUAGE_SAS,
  TGLANG_LANGUAGE_SCALA,
  TGLANG_LANGUAGE_SCHEME,
  TGLANG_LANGUAGE_SHELL,
  TGLANG_LANGUAGE_SMALLTALK,
  TGLANG_LANGUAGE_SOLIDITY,
  TGLANG_LANGUAGE_SQL,
  TGLANG_LANGUAGE_SWIFT,
  TGLANG_LANGUAGE_TCL,
  TGLANG_LANGUAGE_TEXTILE,
  TGLANG_LANGUAGE_TL,
  TGLANG_LANGUAGE_TYPESCRIPT,
  TGLANG_LANGUAGE_UNREALSCRIPT,
  TGLANG_LANGUAGE_VALA,
  TGLANG_LANGUAGE_VBSCRIPT,
  TGLANG_LANGUAGE_VERILOG,
  TGLANG_LANGUAGE_VISUAL_BASIC,
  TGLANG_LANGUAGE_WOLFRAM,
  TGLANG_LANGUAGE_XML,
  TGLANG_LANGUAGE_YAML'''

# id -> category name: drop the trailing comma and surrounding whitespace
# from every line; the zero-based line index becomes the id.
id_to_cat = {
    index: raw_line.replace(',', '').strip()
    for index, raw_line in enumerate(ENUM_DATA.split("\n"))
}

# Reverse lookup: category name -> id.
cat_to_id = {name: index for index, name in id_to_cat.items()}

In [4]:
# Load the train and validation splits from JSON files in the working directory.
train_data, val_data = (
    pd.read_json(json_path)
    for json_path in ("train_set_all_clean.json", "val_set_all_clean.json")
)

In [5]:
# Apply the project's ASCII preprocessing to each split, train first, then validation.
# Later cells read `text_cleaned_ascii` and `class` from the results.
train_data_cleaned, val_data_cleaned = map(
    preprocess_set_to_ascii, (train_data, val_data)
)


100%|██████████| 6994/6994 [00:06<00:00, 1098.77it/s]
100%|██████████| 6994/6994 [00:04<00:00, 1526.90it/s]
100%|██████████| 2333/2333 [00:02<00:00, 1061.13it/s]
100%|██████████| 2333/2333 [00:01<00:00, 1411.87it/s]


In [6]:
# Text files that fastText reads for training and for autotune validation.
train_dataset_ft_path = "train_data_ft.txt"
val_dataset_ft_path = "val_data_ft.txt"

# Export each cleaned split to its fastText input file (train first, then validation).
for cleaned_split, ft_path in (
    (train_data_cleaned, train_dataset_ft_path),
    (val_data_cleaned, val_dataset_ft_path),
):
    prepare_fasttext_input(cleaned_split, ft_path)

In [7]:
### training

# Hyperparameter autotuning: fastText searches for the best settings against
# the validation file for up to 600 seconds, under a 5M model-size constraint.
autotune_settings = {
    "autotuneValidationFile": val_dataset_ft_path,
    "autotuneDuration": 600,
    "autotuneModelSize": '5M',
}

model = ft.train_supervised(
    input=train_dataset_ft_path,
    thread=4,
    **autotune_settings,
)

Progress: 100.0% Trials:    7 Best score:  0.786970 ETA:   0h 0m 0s
Training again with best arguments
Read 20M words
Number of words:  324464
Number of labels: 92
Progress: 100.0% words/sec/thread:  427329 lr:  0.000000 avg.loss:  3.398440 ETA:   0h 0m 0s
Progress: 100.0% words/sec/thread: 1310890 lr:  0.000000 avg.loss:  0.980873 ETA:   0h 0m 0s 23.7% words/sec/thread: 1269555 lr:  0.332100 avg.loss:  1.907902 ETA:   0h 0m33s 62.0% words/sec/thread: 1292511 lr:  0.165571 avg.loss:  1.257525 ETA:   0h 0m16s


In [8]:
### evaluation
# `f1_score` and `tqdm` are imported in the notebook's top import cell so that
# every dependency is visible in one place.

# Top-1 prediction per validation text: model.predict returns
# (labels, probabilities), so [0][0] selects the highest-scoring label.
predicted = [
    model.predict(sentence)[0][0]
    for sentence in tqdm(val_data_cleaned.text_cleaned_ascii)
]

# NOTE(review): this is the same validation split that autotune optimised
# against during training, so the score below is likely optimistic; confirm
# on a held-out test set.
# NOTE(review): presumably the `class` column already holds labels in the
# same string form fastText emits; verify against prepare_fasttext_input.
# Kept as the cell's last expression so the macro-averaged F1 is displayed.
f1_score(val_data_cleaned['class'], predicted, average='macro')

100%|██████████| 2333/2333 [00:11<00:00, 208.89it/s]


0.7849885875181206