In [None]:
from google.colab import auth

# Sign the current Colab user in to Google.
# NOTE(review): presumably needed so the `gsutil cp` calls in a later cell can
# read the gs://cil_2023 bucket — confirm the bucket is not public.
auth.authenticate_user()

In [None]:
%%bash
pip3 install flair

In [None]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import metrics
from flair.data import Corpus
from flair.datasets import CSVClassificationCorpus
from flair.embeddings import WordEmbeddings, FlairEmbeddings,\
  DocumentLSTMEmbeddings, DocumentPoolEmbeddings, TransformerDocumentEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
import tensorflow as tf

In [None]:
!gsutil cp "gs://cil_2023/train_pos_preprocessed.txt" .
!gsutil cp "gs://cil_2023/train_neg_preprocessed.txt" .

filename_train_pos = "train_pos_preprocessed.txt"
filename_train_neg = "train_neg_preprocessed.txt"

In [None]:
label_type = 'sentiment'

# Knobs a reader may want to tune.
SEED = 0            # random_state used for the shuffle
MAX_SAMPLES = 1000  # rows kept after shuffling; set to None to use everything


def load_labelled_lines(path, label):
    """Read one text per line from ``path`` and tag every row with ``label``.

    The file is read as plain text rather than with ``pd.read_fwf``: a
    fixed-width parser infers column boundaries from the leading rows and
    converts NA-like tokens, which can alter or drop free-form text lines.

    Args:
        path: Path of a UTF-8 text file holding one sample per line.
        label: Value stored in the ``label_type`` column for every row.

    Returns:
        DataFrame with a ``'text'`` column and a ``label_type`` column.
        Surrounding whitespace is stripped and empty lines are skipped.
    """
    with open(path, encoding='utf-8') as handle:
        stripped = (line.strip() for line in handle)
        texts = [text for text in stripped if text]
    frame = pd.DataFrame({'text': texts})
    frame[label_type] = label
    return frame


# read dataset
dataset_pos_pd = load_labelled_lines(filename_train_pos, "POSITIVE")
dataset_neg_pd = load_labelled_lines(filename_train_neg, "NEGATIVE")
dataset_pd = pd.concat([dataset_pos_pd, dataset_neg_pd])

# shuffle (seeded, so the split below is reproducible), then optionally subsample
dataset_pd = dataset_pd.sample(frac=1, random_state=SEED).reset_index(drop=True)
if MAX_SAMPLES is not None:
    dataset_pd = dataset_pd.iloc[:MAX_SAMPLES]

# train-test-val split: 80% / 10% / remainder
N = len(dataset_pd)
train_size = int(0.8 * N)
val_size = int(0.1 * N)
test_size = N - train_size - val_size
dataset_train = dataset_pd.iloc[:train_size]
dataset_val = dataset_pd.iloc[train_size:train_size + val_size]
dataset_test = dataset_pd.iloc[train_size + val_size:]
assert len(dataset_train) + len(dataset_val) + len(dataset_test) == N
assert len(dataset_test) == test_size

# Write tab-separated files for the corpus loader. The DataFrame index is
# written as column 0, so the text ends up in column 1 and the label in
# column 2 — the column map passed to CSVClassificationCorpus relies on this.
dataset_train.to_csv('train.csv', sep='\t')
dataset_val.to_csv('dev.csv', sep='\t')
dataset_test.to_csv('test.csv', sep='\t')

In [None]:
# Column 0 of the TSV files is the DataFrame index written by `to_csv`;
# the text sits in column 1 and the label in column 2.
column_name_map = {1: 'text', 2: 'label'}

# Load the three splits from the working directory, skipping the header row.
corpus: Corpus = CSVClassificationCorpus(
    "./",
    column_name_map,
    train_file='train.csv',
    dev_file='dev.csv',
    test_file='test.csv',
    delimiter='\t',
    skip_header=True,
    label_type=label_type,
)

# Label dictionary built from the corpus for the configured label type.
label_dict = corpus.make_label_dictionary(label_type=label_type)

In [None]:
# Load the 'en-sentiment' classifier by name and wrap it, together with the
# corpus, in a trainer.
classifier = TextClassifier.load('en-sentiment')
trainer = ModelTrainer(classifier, corpus)

# Fine-tune for a single epoch, using macro-averaged F1 as the main evaluation
# metric. './' is passed as the trainer's base path.
evaluation_metric = ("macro avg", "f1-score")
trainer.fine_tune('./', max_epochs=1, main_evaluation_metric=evaluation_metric)

In [None]:
# Disabled experiment, kept for reference only: the whole cell is one string
# literal, so none of the code inside it runs. It would build a TextClassifier
# from forward/backward 'news-*-fast' FlairEmbeddings fed through
# DocumentLSTMEmbeddings and train it for 5 epochs via `trainer.train`,
# instead of fine-tuning a loaded 'en-sentiment' model.
"""
word_embeddings = [FlairEmbeddings('news-forward-fast'), FlairEmbeddings('news-backward-fast')]

document_embeddings = DocumentLSTMEmbeddings(word_embeddings, hidden_size=512, reproject_words=True, reproject_words_dimension=256)

classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, label_type=label_type)

trainer = ModelTrainer(classifier, corpus)

trainer.train('./', max_epochs=5)
"""


In [None]:
# Disabled sanity check, kept for reference only: the whole cell is one string
# literal, so none of the code inside it runs. It would predict labels for the
# first 1000 training texts with `classifier` and compare them against the
# 'sentiment' column using accuracy_score.
# NOTE(review): this measures accuracy on training rows, not on held-out data.
"""
from flair.data import Sentence
from sklearn.metrics import accuracy_score

sentences = [Sentence(t) for t in dataset_train['text'][:1000]]
classifier.predict(sentences)
predictions = [s.get_label().value for s in sentences]
accuracy_score(predictions, dataset_train['sentiment'][:1000])
"""
