<a href="https://colab.research.google.com/github/bogus1aw/text-classification-benchmark/blob/main/M_FastTextTests_wiki_pretrained.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

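fastText text-classification benchmark on the wiki dataset from CLARIN-PL (https://clarin-pl.eu/dspace/handle/11321/738). The notebook trains supervised fastText models initialised with pretrained `cc.pl.300` vectors on an increasing number of training samples per class, for both the raw and the lemmatized corpus, and reports the test score for each setting.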

In [None]:
# install fastText
!git clone https://github.com/facebookresearch/fastText.git
%cd fastText
!sudo pip install .

Cloning into 'fastText'...
remote: Enumerating objects: 3854, done.[K
remote: Total 3854 (delta 0), reused 0 (delta 0), pack-reused 3854[K
Receiving objects: 100% (3854/3854), 8.23 MiB | 30.85 MiB/s, done.
Resolving deltas: 100% (2416/2416), done.
/content/fastText
Processing /content/fastText
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp36-cp36m-linux_x86_64.whl size=3082295 sha256=ef5643036c07b73765bdb201ae40f7d0c806d42072a275d92168f521eb7fd336
  Stored in directory: /tmp/pip-ephem-wheel-cache-e7lzcd_w/wheels/a1/9f/52/696ce6c5c46325e840c76614ee5051458c0df10306987e7443
Successfully built fasttext
Installing collected packages: fasttext
Successfully installed fasttext-0.9.2


In [None]:
import csv
import os

import fasttext as ft
import pandas as pd
from sklearn import model_selection
# Lift pandas' display limits so result tables are rendered in full
# (None disables truncation of columns and rows).
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
# Mount Google Drive at /content/drive (only available inside a Colab runtime).
# The corpora and pretrained vectors used below are read from paths under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Create the working directory that holds the intermediate fastText
# train/test files.
WORK_PATH = '/content/data/wiki_experiments/'
# Create the directory from WORK_PATH itself so the constant and the directory
# on disk cannot drift apart; exist_ok=True keeps the cell re-runnable and
# missing parent directories are created as well (same effect as `mkdir -p`).
os.makedirs(WORK_PATH, exist_ok=True)

In [None]:
# Input corpora on Google Drive: the raw text and its lemmatized counterpart.
CORPORA_RAW = '/content/drive/MyDrive/master_datasets/wiki_preprocessed/wikiInOneFileDataset.txt'
CORPORA_LEMMATIZED = '/content/drive/MyDrive/master_datasets/wiki_preprocessed/wikiInOneFileDataset_lemmas.txt'

# Intermediate files inside WORK_PATH that process_benchmark writes for fastText.
TRAIN_FASTTEXT_FILE_PATH = f'{WORK_PATH}TRAIN_FASTEXT_WORK.csv'
TEST_FASTTEXT_FILE_PATH = f'{WORK_PATH}TEST_FASTEXT_WORK.csv'

# Benchmark grid: training samples drawn per class, and how many times the
# whole grid is repeated.
no_samples_per_class = [1, 3, 5, 8, 10, 20, 30, 60, 100, 200]  # final samples list
repetitions = 2


In [None]:
def load_corpora_to_dataframe(corpora):
  """Load a one-document-per-line corpus file into a DataFrame.

  Each non-blank line is split on whitespace: the first token becomes the
  label and the remaining tokens, re-joined with single spaces, become the
  text. Blank lines are skipped.

  Args:
    corpora: path to the corpus file (read as UTF-8).

  Returns:
    pandas.DataFrame with columns 'label' and 'text', one row per non-blank
    line, in file order.
  """
  labels, texts = [], []
  # The context manager closes the file handle; the explicit encoding keeps
  # the result independent of the platform's default locale.
  with open(corpora, encoding='utf-8') as corpus_file:
    for line in corpus_file:
      content = line.split()
      if not content:
        # A blank line (e.g. the one left by a trailing newline) has no label;
        # indexing content[0] on it would raise IndexError.
        continue
      labels.append(content[0])
      texts.append(" ".join(content[1:]))

  # create a dataframe using texts and labels
  return pd.DataFrame({'label': labels, 'text': texts}, columns=['label', 'text'])

In [None]:
def process_benchmark(train, test, wordNgrams,
                      pretrained_vectors='/content/drive/MyDrive/models/cc.pl.300.vec',
                      random_state=None):
  """Train and evaluate fastText for every samples-per-class setting.

  The test split is written once to TEST_FASTTEXT_FILE_PATH. Then, for each of
  `repetitions` passes and each value in `no_samples_per_class`, a training
  subset is drawn per label (with replacement, so a label with fewer rows than
  requested does not raise), shuffled, written to TRAIN_FASTTEXT_FILE_PATH and
  used to train a supervised fastText model that is scored on the test file.

  Args:
    train: DataFrame with a 'label' column; its rows are written tab-separated
      and without header as fastText training input.
    test: DataFrame written the same way and used for evaluation.
    wordNgrams: value passed to fastText's `wordNgrams` option.
    pretrained_vectors: path of the .vec file passed as `pretrainedVectors`.
      The model is trained with dim=300, so the vectors must be
      300-dimensional.
    random_state: optional int seed. When given, every sampling/shuffling step
      uses a distinct seed derived from it, which makes the drawn subsets
      reproducible. None (default) keeps the draws unseeded. fastText training
      itself is not seeded here.

  Returns:
    A list with one entry per repetition; each entry is a list holding, for
    every value of `no_samples_per_class` in order, the second element of the
    tuple returned by `model.test` (presumably precision@1 — confirm against
    the fastText documentation).
  """
  # save test file which will be used for all tests
  test.to_csv(TEST_FASTTEXT_FILE_PATH, header=False, index=False, encoding='utf-8', sep='\t', quoting=csv.QUOTE_NONE)

  full = []
  for repeat in range(repetitions):
    iteration = []
    for step, n_samples in enumerate(no_samples_per_class):
      # Derive a different seed for every (repeat, setting) pair; reusing one
      # seed would make all repetitions draw identical subsets.
      if random_state is None:
        seed = None
      else:
        seed = random_state + repeat * len(no_samples_per_class) + step
      train_work = train.groupby(['label']).sample(n=n_samples, replace=True, random_state=seed)
      # Shuffle so that rows are no longer grouped by label in the file.
      train_work = train_work.sample(frac=1, random_state=seed)
      train_work.to_csv(TRAIN_FASTTEXT_FILE_PATH, header=False, index=False, encoding='utf-8', sep='\t', quoting=csv.QUOTE_NONE)
      model = ft.train_supervised(input=TRAIN_FASTTEXT_FILE_PATH, dim=300, lr=1.0, epoch=25, pretrainedVectors=pretrained_vectors, wordNgrams=wordNgrams)
      result = model.test(TEST_FASTTEXT_FILE_PATH)
      print("repeat: ", repeat," sample: ", n_samples,' wordNgrams: ', wordNgrams, ' results: ', result)
      iteration.append(result[1])
    print(iteration)
    full.append(iteration)
  return full


In [None]:
# Raw corpus, unigram features.
trainDF = load_corpora_to_dataframe(CORPORA_RAW)

# 80/20 train/test split with a fixed seed, so the split is the same on every run.
train, test = model_selection.train_test_split(
    trainDF,
    test_size=0.2,
    random_state=42,
)

results = process_benchmark(train, test, wordNgrams=1)

repeat:  0  sample:  1  wordNgrams:  1  results:  (1377, 0.12345679012345678, 0.12345679012345678)
repeat:  0  sample:  3  wordNgrams:  1  results:  (1377, 0.3137254901960784, 0.3137254901960784)
repeat:  0  sample:  5  wordNgrams:  1  results:  (1377, 0.3805374001452433, 0.3805374001452433)
repeat:  0  sample:  8  wordNgrams:  1  results:  (1377, 0.4596949891067538, 0.4596949891067538)
repeat:  0  sample:  10  wordNgrams:  1  results:  (1377, 0.5519244734931009, 0.5519244734931009)
repeat:  0  sample:  20  wordNgrams:  1  results:  (1377, 0.6289034132171387, 0.6289034132171387)
repeat:  0  sample:  30  wordNgrams:  1  results:  (1377, 0.6949891067538126, 0.6949891067538126)
repeat:  0  sample:  60  wordNgrams:  1  results:  (1377, 0.7639796659404503, 0.7639796659404503)
repeat:  0  sample:  100  wordNgrams:  1  results:  (1377, 0.7981118373275236, 0.7981118373275236)
repeat:  0  sample:  200  wordNgrams:  1  results:  (1377, 0.8148148148148148, 0.8148148148148148)
[0.12345679012345678

In [None]:
# Display the samples-per-class grid; these values are used as the column
# labels of the result tables.
no_samples_per_class

[1, 3, 5, 8, 10, 20, 30, 60, 100, 200]

In [None]:

# Tabulate the scores: one row per repetition, one column per
# samples-per-class setting.
df = pd.DataFrame(results, columns=no_samples_per_class)
df

Unnamed: 0,1,3,5,8,10,20,30,60,100,200
0,0.123457,0.313725,0.380537,0.459695,0.551924,0.628903,0.694989,0.76398,0.798112,0.814815
1,0.100944,0.302832,0.386347,0.488017,0.519971,0.631808,0.672476,0.758896,0.791576,0.822803


In [None]:
# Summary statistics across repetitions for each samples-per-class setting.
df.describe()

Unnamed: 0,1,3,5,8,10,20,30,60,100,200
count,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
mean,0.1122,0.308279,0.383442,0.473856,0.535948,0.630356,0.683733,0.761438,0.794844,0.818809
std,0.015919,0.007703,0.004108,0.020027,0.022595,0.002054,0.015919,0.003595,0.004622,0.005649
min,0.100944,0.302832,0.380537,0.459695,0.519971,0.628903,0.672476,0.758896,0.791576,0.814815
25%,0.106572,0.305556,0.38199,0.466776,0.527959,0.62963,0.678105,0.760167,0.79321,0.816812
50%,0.1122,0.308279,0.383442,0.473856,0.535948,0.630356,0.683733,0.761438,0.794844,0.818809
75%,0.117829,0.311002,0.384895,0.480937,0.543936,0.631082,0.689361,0.762709,0.796478,0.820806
max,0.123457,0.313725,0.386347,0.488017,0.551924,0.631808,0.694989,0.76398,0.798112,0.822803


In [None]:
# RAW 1-2grams
# trainDF = load_corpora_to_dataframe(CORPORA_RAW)
# train, test = model_selection.train_test_split(trainDF, test_size=0.2, random_state=42)

# results = process_benchmark(train, test, 2)

In [None]:
# df = pd.DataFrame(results, columns=no_samples_per_class)
# df

In [None]:
# df.describe()

In [None]:
# Lemmatized corpus, unigram features.
trainDF = load_corpora_to_dataframe(CORPORA_LEMMATIZED)

# 80/20 train/test split with a fixed seed, so the split is the same on every run.
train, test = model_selection.train_test_split(
    trainDF,
    test_size=0.2,
    random_state=42,
)

results = process_benchmark(train, test, wordNgrams=1)

repeat:  0  sample:  1  wordNgrams:  1  results:  (1377, 0.2403776325344953, 0.2403776325344953)
repeat:  0  sample:  3  wordNgrams:  1  results:  (1377, 0.3609295570079884, 0.3609295570079884)
repeat:  0  sample:  5  wordNgrams:  1  results:  (1377, 0.46042120551924476, 0.46042120551924476)
repeat:  0  sample:  8  wordNgrams:  1  results:  (1377, 0.5969498910675382, 0.5969498910675382)
repeat:  0  sample:  10  wordNgrams:  1  results:  (1377, 0.6368917937545389, 0.6368917937545389)
repeat:  0  sample:  20  wordNgrams:  1  results:  (1377, 0.756717501815541, 0.756717501815541)
repeat:  0  sample:  30  wordNgrams:  1  results:  (1377, 0.766158315177923, 0.766158315177923)
repeat:  0  sample:  60  wordNgrams:  1  results:  (1377, 0.8351488743645606, 0.8351488743645606)
repeat:  0  sample:  100  wordNgrams:  1  results:  (1377, 0.8489469862018881, 0.8489469862018881)
repeat:  0  sample:  200  wordNgrams:  1  results:  (1377, 0.8678286129266521, 0.8678286129266521)
[0.2403776325344953, 0.3

In [None]:
# Tabulate the scores: one row per repetition, one column per
# samples-per-class setting.
df = pd.DataFrame(results, columns=no_samples_per_class)
df

Unnamed: 0,1,3,5,8,10,20,30,60,100,200
0,0.240378,0.36093,0.460421,0.59695,0.636892,0.756718,0.766158,0.835149,0.848947,0.867829
1,0.1939,0.38199,0.499637,0.567175,0.597676,0.728395,0.783588,0.816993,0.847495,0.87146


In [None]:
# Summary statistics across repetitions for each samples-per-class setting.
df.describe()

Unnamed: 0,1,3,5,8,10,20,30,60,100,200
count,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
mean,0.217139,0.37146,0.480029,0.582062,0.617284,0.742556,0.774873,0.826071,0.848221,0.869644
std,0.032865,0.014892,0.02773,0.021054,0.02773,0.020027,0.012324,0.012838,0.001027,0.002568
min,0.1939,0.36093,0.460421,0.567175,0.597676,0.728395,0.766158,0.816993,0.847495,0.867829
25%,0.205519,0.366195,0.470225,0.574619,0.60748,0.735476,0.770516,0.821532,0.847858,0.868736
50%,0.217139,0.37146,0.480029,0.582062,0.617284,0.742556,0.774873,0.826071,0.848221,0.869644
75%,0.228758,0.376725,0.489833,0.589506,0.627088,0.749637,0.77923,0.83061,0.848584,0.870552
max,0.240378,0.38199,0.499637,0.59695,0.636892,0.756718,0.783588,0.835149,0.848947,0.87146


In [None]:
# Lemmas 1-2grams
# trainDF = load_corpora_to_dataframe(CORPORA_LEMMATIZED)
# train, test = model_selection.train_test_split(trainDF, test_size=0.2, random_state=42)

# results = process_benchmark(train, test, 2)

In [None]:
# df = pd.DataFrame(results, columns=no_samples_per_class)
# df

In [None]:
# df.describe()

In [None]:



# import matplotlib.pyplot as plt
# plt.figure(figsize=(8, 6))
# df.boxplot()
# model.get_dimension()



# model.get_nearest_neighbors('rok')
# model.get_word_vector( "rok")
