<a href="https://colab.research.google.com/github/bogus1aw/text-classification-benchmark/blob/main/M_FastTextTests_PolEmo2_0_lemmas_with_pretrained_word_emmbedings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

fastText benchmark on the PolEmo 2.0 dataset (lemmatized texts, classifier initialised with pretrained word embeddings). Dataset: https://clarin-pl.eu/dspace/handle/11321/710


In [None]:
 # Mount Google Drive at /content/drive; the dataset, pretrained vectors and
 # metrics paths used later in this notebook all live under that mount point.
 from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install fastText from source: clone the upstream repository, change into the
# checkout and pip-install it from there.
# NOTE(review): %cd changes the kernel's working directory for every later cell
# (it stays /content/fastText), and the clone is not pinned to a tag or commit.
!git clone https://github.com/facebookresearch/fastText.git
%cd fastText
!sudo pip install .

Cloning into 'fastText'...
remote: Enumerating objects: 3854, done.[K
remote: Total 3854 (delta 0), reused 0 (delta 0), pack-reused 3854[K
Receiving objects: 100% (3854/3854), 8.23 MiB | 28.75 MiB/s, done.
Resolving deltas: 100% (2416/2416), done.
/content/fastText
Processing /content/fastText
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp36-cp36m-linux_x86_64.whl size=3082241 sha256=4b988c499cd63a9ea5acd1ea22826c8816e720ff1ea707bb88c46f54266bc65d
  Stored in directory: /tmp/pip-ephem-wheel-cache-qayttdvz/wheels/a1/9f/52/696ce6c5c46325e840c76614ee5051458c0df10306987e7443
Successfully built fasttext
Installing collected packages: fasttext
Successfully installed fasttext-0.9.2


In [None]:
import fasttext as ft
import pandas as pd
import datetime
from sklearn import model_selection
import csv
# Disable pandas' truncation so displayed DataFrames show every column and row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Start time of this run, second precision, with ':' replaced by '-' so it can
# be embedded in a file name (it is appended to the metrics file path below).
timestamp = datetime.datetime.now().replace(microsecond=0).isoformat().replace(':', '-')


In [None]:
# Print the current working directory (the install cell's %cd left it at the fastText checkout).
!pwd

/content/fastText


In [None]:
# Create the local scratch directory that holds the intermediate train/test
# files written for fastText.
WORK_PATH = '/content/data/'
# NOTE(review): the literal below duplicates WORK_PATH; keep the two in sync.
%mkdir -p '/content/data/'

In [None]:
# Paths of the intermediate corpus files handed to fastText.
# Despite the .csv extension they are written tab-separated, without header or
# index (see the to_csv calls in process_benchmark).
TEST_FASTTEXT_FILE_PATH = WORK_PATH + 'TEST_FASTEXT_WORK.csv'
TRAIN_FASTTEXT_FILE_PATH = WORK_PATH + 'TRAIN_FASTEXT_WORK.csv'

In [None]:
def load_corpora_to_dataframe(corpora):
  """Load a whitespace-tokenised corpus file into a DataFrame.

  Every non-blank line is split on whitespace; the last token becomes the
  label and the remaining tokens, re-joined with single spaces, become the
  text. Blank lines are skipped. A line holding a single token yields an
  empty text with that token as its label.

  Args:
    corpora: path of the corpus file; it is read as UTF-8.

  Returns:
    pandas.DataFrame with the columns 'text' and 'label', one row per
    non-blank line, in file order.
  """
  labels, texts = [], []
  # Explicit encoding keeps non-ASCII text independent of the machine's locale;
  # the context manager closes the handle even if reading fails.
  with open(corpora, encoding='utf-8') as corpus_file:
    for line in corpus_file:
      content = line.split()
      if content:
        labels.append(content[-1])
        texts.append(" ".join(content[:-1]))

  # create a dataframe using texts and labels
  trainDF = pd.DataFrame()
  trainDF['text'] = texts
  trainDF['label'] = labels
  return trainDF

In [None]:
def process_benchmark(domain, train, test, wordNgrams):
  """Train and evaluate fastText for every per-class sample size, repeatedly.

  For each of `repetitions` runs and each size in `no_samples_per_class`
  (both module-level settings), `n` rows per label are drawn from `train`
  with replacement, shuffled, written to TRAIN_FASTTEXT_FILE_PATH and used to
  train a supervised model initialised from the pretrained Polish vectors.
  The model is then scored on the test file.

  Args:
    domain: name used only in the progress output.
    train: DataFrame with a 'label' column to sample the training data from.
    test: DataFrame written once to TEST_FASTTEXT_FILE_PATH and used for
      every evaluation.
    wordNgrams: forwarded to ft.train_supervised.

  Returns:
    A list with one entry per repetition; each entry is a list holding the
    second element of model.test()'s result for every sample size, in the
    order of `no_samples_per_class`.
  """
  # The test split never changes, so it is written a single time up front.
  test.to_csv(TEST_FASTTEXT_FILE_PATH, sep='\t', header=False, index=False,
              encoding='utf-8', quoting=csv.QUOTE_NONE)

  all_runs = []
  for repeat in range(repetitions):
    run_scores = []
    for n_samples in no_samples_per_class:
      # Draw n rows per label (with replacement), then shuffle the row order.
      per_class = train.groupby(['label']).sample(n=n_samples, replace=True)
      shuffled = per_class.sample(frac=1)
      shuffled.to_csv(TRAIN_FASTTEXT_FILE_PATH, sep='\t', header=False, index=False,
                      encoding='utf-8', quoting=csv.QUOTE_NONE)

      # Variant without pretrained vectors, kept for reference:
      # model = ft.train_supervised(input=TRAIN_FASTTEXT_FILE_PATH, lr=1.0, epoch=25, wordNgrams=wordNgrams)
      model = ft.train_supervised(
          input=TRAIN_FASTTEXT_FILE_PATH,
          dim=300,
          lr=1.0,
          epoch=25,
          pretrainedVectors='/content/drive/MyDrive/models/cc.pl.300.vec',
          wordNgrams=wordNgrams,
      )
      result = model.test(TEST_FASTTEXT_FILE_PATH)
      print('domain: ', domain, " repeat: ", repeat," sample: ", n_samples,' wordNgrams: ', wordNgrams, ' results: ', result)
      run_scores.append(result[1])

    print(run_scores)
    all_runs.append(run_scores)

  return all_runs


In [None]:
# Training-set sizes, as rows drawn per label; process_benchmark trains and
# evaluates one model for each of them.
no_samples_per_class = [1, 3, 5, 8, 10, 20, 30, 60, 100, 200]  # final list of sample sizes

# Number of times the whole sweep over the sample sizes is repeated.
repetitions = 2
# Word n-gram settings to benchmark (passed to fastText as wordNgrams).
# grams = [1, 2]
grams = [1]

# File that write_to_logs appends to; the run timestamp makes it unique per run.
# NOTE(review): the name says "raw" while dataset_path points at a lemmas
# dataset - confirm which one is intended.
metrice_path = '/content/drive/MyDrive/metrics/FastText_2.0_PolEmo2_raw' + timestamp + '.txt'
# NOTE(review): fig_path is not referenced by any code visible in this notebook.
fig_path = '/content/drive/MyDrive/figures/'
# Directory holding the '<domain>.text.train.txt' / '<domain>.text.test.txt' corpora.
dataset_path = '/content/drive/MyDrive/master_datasets/dataset_col_lemmas_final/'

# (corpus file prefix, short name used to label the result rows) pairs.
domains = [
           ('all', 'MDT-A'),
           ('hotels', 'SDT-H'),
           ('medicine', 'SDT-M'),
           ('products', 'SDT-P'),
           ('reviews', 'SDT-R')
          ]


In [None]:
def write_to_logs(values):
  """Append `values` to the metrics log file at `metrice_path`.

  The file is opened in append mode, so it is created on first use and
  earlier entries are kept. No newline is added; callers must include one
  in `values` if they want one entry per line.

  Args:
    values: the string to append.
  """
  # Explicit UTF-8 keeps the log independent of the machine's default locale.
  with open(metrice_path, 'a', encoding='utf-8') as f:
    f.write(values)

In [None]:
# Run the benchmark for every (domain, n-gram) pair and build a summary table
# with one row per pair and one column per per-class sample size. Each cell is
# the score averaged over all repetitions.
summary_rows = []

for domian, ix_name in domains:
  for ngram in grams :
    print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
    print('%%%%%%%%  ', domian,  ' ', ngram)
    print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
    
    CORPORA_TRAIN = dataset_path + domian + '.text.train.txt'
    CORPORA_TEST = dataset_path + domian + '.text.test.txt'
    train = load_corpora_to_dataframe(CORPORA_TRAIN)
    test = load_corpora_to_dataframe(CORPORA_TEST)

    results = process_benchmark(ix_name, train, test, ngram)

    # `results` has one row per repetition; average them column-wise and turn
    # the resulting Series into a single labelled row.
    mean_scores = pd.DataFrame(results, columns=no_samples_per_class).mean()
    summary_rows.append(mean_scores.to_frame(name=ix_name + '_R_' + str(ngram)).T)

# DataFrame.append was removed in pandas 2.0 (and copied the frame on every
# iteration), so the rows are collected first and concatenated once.
df = pd.concat(summary_rows) if summary_rows else pd.DataFrame()

df
### LEMMAS

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%   all   1
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
domain:  MDT-A  repeat:  0  sample:  1  wordNgrams:  1  results:  (820, 0.4097560975609756, 0.4097560975609756)
domain:  MDT-A  repeat:  0  sample:  3  wordNgrams:  1  results:  (820, 0.4219512195121951, 0.4219512195121951)
domain:  MDT-A  repeat:  0  sample:  5  wordNgrams:  1  results:  (820, 0.5024390243902439, 0.5024390243902439)
domain:  MDT-A  repeat:  0  sample:  8  wordNgrams:  1  results:  (820, 0.5609756097560976, 0.5609756097560976)
domain:  MDT-A  repeat:  0  sample:  10  wordNgrams:  1  results:  (820, 0.4926829268292683, 0.4926829268292683)
domain:  MDT-A  repeat:  0  sample:  20  wordNgrams:  1  results:  (820, 0.5609756097560976, 0.5609756097560976)
domain:  MDT-A  repeat:  0  sample:  30  wordNgrams:  1  results:  (820, 0.6268292682926829, 0.6268292682926829)
domain:  MDT-A  repeat:  0  sample:  60  wordNgrams:  1  results:  (820, 0.6439024390243903, 0.6439024390243903)
domain:  

Unnamed: 0,1,3,5,8,10,20,30,60,100,200
MDT-A_R_1,0.330488,0.412195,0.493293,0.538415,0.513415,0.54939,0.621341,0.62561,0.660366,0.70061
SDT-H_R_1,0.394937,0.491139,0.473418,0.486076,0.578481,0.598734,0.637975,0.675949,0.726582,0.73038
SDT-M_R_1,0.330275,0.449541,0.567278,0.584098,0.501529,0.579511,0.616208,0.685015,0.675841,0.7263
SDT-P_R_1,0.329787,0.542553,0.319149,0.510638,0.521277,0.510638,0.553191,0.56383,0.62766,0.617021
SDT-R_R_1,0.31,0.38,0.34,0.54,0.43,0.53,0.54,0.59,0.64,0.64
