In [None]:
%%capture
# Install the third-party packages this notebook relies on.
# %%capture (which must stay on the first line of the cell) hides pip's output.
! pip install flair
!pip install GPUtil
!pip install spacy-langdetect
# !python -m spacy download en_core_web_trf
# `re` is needed by clean_text(), defined further down.
import re

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
def clean_text(text):
    """Normalise raw text before it is used for classification.

    The input is lower-cased, every @mention and http(s) URL is replaced by
    the placeholder ``HTTPURL``, the remaining punctuation is removed and runs
    of whitespace are collapsed into single spaces.

    Args:
        text: Text to clean. Non-string values are converted with ``str``.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    # Coerce first so that .lower() cannot fail on non-string input.
    text = str(text).lower().strip()
    # Mentions/URLs must be replaced *before* punctuation is stripped: the
    # pattern keys on '@' and '://', which the punctuation filter removes.
    text = re.sub(r"(?:\@|https?\://)\S+", "HTTPURL", text)
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

## Training

In [1]:
%%capture
# Install flair for the Training section; %%capture hides pip's output.
# NOTE(review): flair is already installed by the notebook's first cell —
# presumably repeated so this section can run from a fresh runtime; confirm.
! pip install flair
# !pip install GPUtil
# !pip install spacy-langdetect
# !python -m spacy download en_core_web_trf


In [None]:
# Mount Google Drive at /content/drive. The data files and model folders used
# by the following cells live under /content/drive/MyDrive/final_models.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Print the GPU attached to this runtime.
# restart runtime until you get a P100
!nvidia-smi

In [None]:
# clean GPU Memory
# import torch
# from GPUtil import showUtilization as gpu_usage
# from numba import cuda


# def free_gpu_cache():
#     print("Initial GPU Usage")
#     gpu_usage()                             

#     torch.cuda.empty_cache()

#     cuda.select_device(0)
#     cuda.close()
#     cuda.select_device(0)

#     print("GPU Usage after emptying the cache")
#     gpu_usage()

# free_gpu_cache()     

In [None]:
import os
import re

import conllu
# NOTE(review): exposes TokenList at package level, presumably because the
# installed flair release looks it up as conllu.TokenList — confirm. Kept
# ahead of the flair imports in case they depend on it.
conllu.TokenList = conllu.models.TokenList
import numpy as np
import pandas as pd
import torch
from flair.data import Corpus
# from flair.datasets import TREC_6
from flair.datasets import ClassificationCorpus
from flair.embeddings import TransformerDocumentEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

In [None]:
# --- Configuration -----------------------------------------------------------
path = '/content/drive/MyDrive/final_models'
# Names the run folder under `path`. Candidates noted by the author:
# ernie, roberta-large, t5, bert, xlnet
model_name = 'roberta-large'
# Fraction of rows sampled (and shuffled) from each CSV; 1 keeps every row.
frac = 1
# Fixed seed so the shuffle, and therefore the train/dev split, is repeatable.
SEED = 42
# train_and_dev.csv holds the 90% of the data left after the 10% test hold-out
# (see the provenance note below), so 8/9 of it is 80% of the full data set
# and the remaining 1/9 is a 10% dev set.
TRAIN_SHARE = 8 / 9

# Provenance — final_test.csv and train_and_dev.csv were produced once with:
# dataset = pd.read_csv('/content/drive/MyDrive/final_models/final_data.csv').sample(frac=1)
# dataset = dataset[['sdg', 'text']].rename(columns={'sdg': 'label'}).sample(frac=1)
# test = dataset.iloc[0:int(len(dataset)*0.1)]
# test.to_csv('/content/drive/MyDrive/final_models/final_test.csv')
# train_and_dev = dataset.iloc[int(len(dataset)*0.1):int(len(dataset))]
# train_and_dev.to_csv('/content/drive/MyDrive/final_models/train_and_dev.csv')


def _with_fasttext_labels(frame):
    """Return a new frame whose `label` column carries the `__label__` prefix."""
    # .assign builds a new frame, so no slice of another frame is mutated
    # (the in-place column assignment used before raised SettingWithCopyWarning).
    return frame.assign(label='__label__' + frame['label'].astype(str))


def _write_fasttext(frame, file_path):
    """Write `frame` to `file_path` as one `<label> <text>` document per line."""
    with open(file_path, 'w', encoding='utf-8') as handle:
        for label, text in zip(frame['label'], frame['text']):
            # Collapse tabs/newlines so a document can never spill onto a
            # second line, and write the text verbatim (no CSV quoting, which
            # would wrap the text in quotes and double any embedded quotes).
            flat_text = '' if pd.isna(text) else ' '.join(str(text).split())
            # Rows without any text carry nothing to classify; skip them.
            if flat_text:
                handle.write(f'{label} {flat_text}\n')


# --- Load and split ----------------------------------------------------------
train_and_dev = pd.read_csv(f'{path}/train_and_dev.csv').sample(frac=frac, random_state=SEED)
train_and_dev = train_and_dev[['label', 'text']]
# train_and_dev['text'] = train_and_dev['text'].apply(lambda x: x.lower())

n_train = int(len(train_and_dev) * TRAIN_SHARE)
train = _with_fasttext_labels(train_and_dev.iloc[:n_train])
dev = _with_fasttext_labels(train_and_dev.iloc[n_train:])

test = pd.read_csv(f'{path}/final_test.csv').sample(frac=frac, random_state=SEED)
test = _with_fasttext_labels(test[['label', 'text']])
# test['text'] = test['text'].apply(lambda x: x.lower())

# --- Write the corpus files read by the training cell -------------------------
output_dir = f'{path}/{model_name}'
# The run folder does not exist the first time a new model_name is used.
os.makedirs(output_dir, exist_ok=True)

_write_fasttext(train, f'{output_dir}/train.txt')
_write_fasttext(test, f'{output_dir}/test.txt')
_write_fasttext(dev, f'{output_dir}/dev.txt')

# Optional clean-up once the files are written:
# del train_and_dev
# del train
# del test
# del dev


In [None]:
# Name under which flair stores and predicts the document labels.
label_type = 'SDG'

# Read the train/dev/test files from the run folder; each line is expected to
# look like `__label__<label> <text>`.
corpus: Corpus = ClassificationCorpus(
    f'{path}/{model_name}/',
    test_file='test.txt',
    dev_file='dev.txt',
    train_file='train.txt',
    label_type=label_type,
)

# Build the embeddings from `model_name` rather than a second hard-coded
# literal, so the trained model always matches the run folder it is saved in.
# `model_name` therefore has to be a valid Hugging Face model identifier.
document_embeddings = TransformerDocumentEmbeddings(model_name, fine_tune=True)
# document_embeddings = TransformerDocumentEmbeddings(model_name, fine_tune=True, dropout=0.3)

# Single-label classifier over the labels found in the corpus.
label_dictionary = corpus.make_label_dictionary(label_type=label_type)
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=label_dictionary,
    label_type=label_type,
    multi_label=False,
)

trainer = ModelTrainer(classifier, corpus)

# Fine-tune the transformer; training artefacts go to the run folder.
trainer.fine_tune(
    f'{path}/{model_name}/',
    learning_rate=1e-5,
    mini_batch_size=4,
    max_epochs=10,
    embeddings_storage_mode='none',
    write_weights=True,
)

# Alternative settings noted by the author:
#   learning_rate = 1e-6, 3e-6, 2e-5, 3e-5, 4e-5, 5e-5
#   mini_batch_size = 8
#   max_epochs = 7


In [None]:
# Plot the training curves for visual inspection.
# Reads loss.tsv and weights.txt from the run folder ({path}/{model_name}),
# the same folder passed to trainer.fine_tune in the training cell.
from flair.visual.training_curves import Plotter

plotter = Plotter()
plotter.plot_training_curves(f'{path}/{model_name}/loss.tsv')
plotter.plot_weights(f'{path}/{model_name}/weights.txt')

# Section 5 - Hyperparameter Tuning
Hyperparameters were defined and tested automatically in order to find the best combination for our project.

In [None]:
from hyperopt import hp
from flair.hyperparameter.param_selection import SearchSpace, Parameter

# Search space listing the hyperparameter values that may be tried.
search_space = SearchSpace()

# define training hyperparameters
# hp.choice selects one of the listed options; hp.uniform draws a value
# between `low` and `high`.
search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[1e-5, 3e-5, 1e-6, 3e-6 ])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[4, 8])
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.7)

# define transformer embedding hyperparameters
# Only one model is listed, so the transformer itself is not varied.
search_space.add(Parameter.TRANSFORMER_MODEL, hp.choice, options=['roberta-large'])

In [None]:
from flair.hyperparameter.param_selection import TextClassifierParamSelector, OptimizationValue

# what label do we want to predict?
label_type = 'SDG'

# create the parameter selector
# Reuses the `corpus` object built in the training cell.
# NOTE(review): the third positional argument (False) is presumably flair's
# multi_label flag, matching multi_label=False on the classifier — confirm.
# NOTE(review): 'resources/results' is a relative path, unlike the Drive paths
# used for the training output — confirm that this location is intended.
param_selector = TextClassifierParamSelector(
    corpus,
    label_type,
    False,
    'resources/results',
    max_epochs=2,
    fine_tune=True,
    training_runs=3,
    optimization_value=OptimizationValue.DEV_SCORE
)
# Alternative training_runs values noted by the author: 1, 2