In [None]:
# Install the `ninja` package into the notebook runtime.
!pip install ninja

In [None]:
!pip install -U fastai

In [None]:
# Install `langdetect`, used further down to filter descriptions by language.
!pip install langdetect

In [None]:
# Clone the MultiFiT repository into ./multifit; the next cells install from this checkout.
! git clone https://github.com/n-waves/multifit.git

In [None]:
# Install the MultiFiT package from the local checkout cloned above.
! pip install multifit/

In [None]:
# Install the dependencies listed in the checkout's requirements file.
! pip install -r multifit/requirements.txt

In [None]:
import pathlib

import pandas as pd
from fastai.text import *
from google.cloud import bigquery
from google.colab import auth
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException
import multifit
from multifit.datasets import ULMFiTDataset, Dataset

In [None]:
# Authenticate the Colab user; this runs before the BigQuery client is created below.
auth.authenticate_user()

In [None]:
# Load every row of `book_backend.train_categories` from BigQuery into a DataFrame.
project_id = 'tlac-vision'
categories_sql = '''
  SELECT
    *
  FROM
    `tlac-vision.book_backend.train_categories`
'''

client = bigquery.Client(project=project_id)
query_job = client.query(categories_sql)
df = query_job.to_dataframe()

In [None]:
# Order the rows by book category (A to Z), keep a single row per title,
# and renumber the index from 0.
df = (
    df
    .sort_values(by='category')
    .drop_duplicates(subset='title')
    .reset_index(drop=True)
)
df

Unnamed: 0,category,title,description
0,anthropology,El otro es mi espejo,"""Después de 60 años como misionero en el Perú ..."
1,anthropology,Diccionario de antropología,A lo largo de los últimos cien años la antropo...
2,anthropology,La cultura como praxis,"En este libro, uno de los principales teoricos..."
3,anthropology,"El fuego. Mitos, ritos y realidades","El fuego, presente ya en los inicios del proce..."
4,anthropology,Tierra encantada,Este ambicioso tratado sobre el fenómeno relig...
...,...,...,...
2753,sociology,Las tres culturas,An lisis de la relaci n llena de tensiones ent...
2754,sociology,Antología de Orlando Fals Borda,Presenta una selección de escritos del sociólo...
2755,sociology,Filosofía y sociología en Jesús Ibáñez,“Pocas veces un intelectual ha metabolizado co...
2756,sociology,Después del divorcio,El presente trabajo es un estudio de los efect...


In [None]:
# Keep only the rows whose description is detected as Spanish.

# langdetect is randomised by default; fixing the seed makes the filter
# give the same result on every run.
DetectorFactory.seed = 0


def _detect_language(text):
    """Return langdetect's language code for `text`, or None if it cannot be detected."""
    # Missing or blank descriptions carry no signal; treat them as "unknown".
    if not isinstance(text, str) or not text.strip():
        return None
    try:
        return detect(text)
    except LangDetectException:
        # langdetect raises this when it finds no usable features in the text.
        return None


# Assign the whole column at once: writing to the `row` yielded by
# DataFrame.iterrows() is not guaranteed to update the frame, which would
# leave `language` empty and make the filter below drop every row.
df['language'] = df['description'].map(_detect_language)
df = df[df['language'] == 'es']

# Number of remaining titles per category (one row per category, column `count`).
cat_count = df.groupby('category')['title'].count().to_frame('count')

In [None]:
# Per-category split: the first 75% of each category's rows form the training set.
# This relies on `df` being sorted by category (done above), so each category
# occupies one contiguous run of positional rows, in the same order as
# `cat_count`'s index.
cat_count['training'] = round(0.75 * cat_count['count'], 0)

# Positional boundaries of each category inside `df`:
#   init_idx  -> first row of the category
#   train_idx -> first row after the category's training portion
#   acum      -> first row after the category (start of the next one)
cat_count['acum'] = cat_count['count'].cumsum()
cat_count['init_idx'] = cat_count['acum'] - cat_count['count']
cat_count['train_idx'] = cat_count['init_idx'] + cat_count['training']

# Collect one slice per category and concatenate once. Growing a frame with
# DataFrame.append inside a loop is quadratic, and append was removed in
# pandas 2.0. Columns are looked up by name rather than by position.
train_slices = [
    df.iloc[int(start):int(stop), :]
    for start, stop in zip(cat_count['init_idx'], cat_count['train_idx'])
]
# pd.concat rejects an empty list, so fall back to an empty frame with df's columns.
train_set = pd.concat(train_slices) if train_slices else df.iloc[0:0, :]

train_set_f = train_set.loc[:, ['category', 'description']]

In [None]:
# Validation set: the remaining rows of each category, i.e. the positional
# range from `train_idx` (end of the training portion) up to `acum` (end of
# the category) in the category-sorted `df`.
# One slice per category is collected and concatenated once. DataFrame.append
# in a loop is quadratic and was removed in pandas 2.0. Columns are looked up
# by name rather than by position.
val_slices = [
    df.iloc[int(start):int(stop), :]
    for start, stop in zip(cat_count['train_idx'], cat_count['acum'])
]
# pd.concat rejects an empty list, so fall back to an empty frame with df's columns.
val_set = pd.concat(val_slices) if val_slices else df.iloc[0:0, :]

val_set_f = val_set.loc[:, ['category', 'description']]

In [None]:
# Absolute path of the current working directory; passed as `path` to the DataBunch below.
path = pathlib.Path.cwd()
path

PosixPath('/content')

In [None]:
# Load the pretrained MultiFiT experiment named "es_multifit_paper_version".
exp = multifit.from_pretrained("es_multifit_paper_version")

# Build a language-model DataBunch from the train/validation frames, using
# the `description` column as text and the experiment's fine-tuning batch size.
# NOTE(review): the last cell takes its tokenizer config from
# `exp.pretrain_lm.tokenizer`, whereas this cell builds a separate spaCy
# tokenizer. Confirm the resulting vocabulary matches the pretrained model.
tok = Tokenizer(tok_func=SpacyTokenizer, lang='es')
data = TextLMDataBunch.from_df(
    path=path,
    train_df=train_set_f,
    valid_df=val_set_f,
    tokenizer=tok,
    text_cols='description',
    label_cols='category',
    bs=exp.finetune_lm.bs,
)

In [None]:
# Create the fine-tuning learner from the experiment for the DataBunch built above.
learn = exp.finetune_lm.get_learner(data)

In [None]:
# Fine-tune the pretrained language model on a folder-based corpus and save its encoder.
# NOTE(review): `imdb_path` and `bs` are not defined in any visible cell, so on a
# fresh kernel this cell is expected to fail with NameError. It appears to expect
# a dataset folder with train/test/unsup sub-folders; confirm the intended data
# source (the DataFrame-based `data` built earlier, or a folder on disk).
# NOTE(review): `exp` and `learn` are re-created here, replacing the objects
# built in the previous cells.
exp = multifit.from_pretrained("es_multifit_paper_version")
# Tokenizer/processor settings taken from the pretrained experiment's tokenizer.
fa_config =  exp.pretrain_lm.tokenizer.get_fastai_config(add_open_file_processor=True)
# Keep the train/test/unsup folders, hold out a random 10% for validation,
# and label the items for language modelling.
data_lm = (TextList.from_folder(imdb_path, **fa_config)
            .filter_by_folder(include=['train', 'test', 'unsup']) 
            .split_by_rand_pct(0.1)
            .label_for_lm()           
            .databunch(bs=bs))
learn = exp.finetune_lm.get_learner(data_lm)
# learn is a preconfigured fastai learner with a pretrained model loaded
# Train for 10 epochs with the one-cycle policy, then save the encoder as "enc".
learn.fit_one_cycle(10)
learn.save_encoder("enc")