<a href="https://colab.research.google.com/github/bogus1aw/text-classification-benchmark/blob/main/M_TD_IDF_PolEmo2_0_logisticregresion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TF-IDF benchmark for PolEmo 2.0 dataset https://clarin-pl.eu/dspace/handle/11321/710

gdrive mounted manually form GUI

In [9]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

import matplotlib.pyplot as plt

import datetime
import time

timestamp = datetime.datetime.now().replace(microsecond=0).isoformat().replace(':', '-')
metrice_path = '/content/drive/MyDrive/metrics/tf-idf_PolEmo2.0_logisticregression_raw' + timestamp + '.txt'

# 1. Function definitions

In [10]:
def load_corpora_to_dataframe(corpora):
  data = open(corpora).read()
  labels, texts = [], []
  for i, line in enumerate(data.split("\n")):
      content = line.split()
      if len(content) > 0: 
        labels.append(content[-1])
        texts.append(" ".join(content[:-1]))

  # create a dataframe using texts and lables
  trainDF = pandas.DataFrame()
  trainDF['text'] = texts
  trainDF['label'] = labels
  return trainDF

In [11]:
def calculate_feature_vectors_tfifd(max_features, train_dataframe, valid_x, valid_y):
  # word level tf-idf
  tfidf_vect = TfidfVectorizer(analyzer='word', max_features=max_features)
  tfidf_vect.fit(train_dataframe['text']) 
  xtrain_tfidf =  tfidf_vect.transform(train_dataframe['text'])
  xvalid_tfidf =  tfidf_vect.transform(valid_x)
  # ngram level tf-idf 
  tfidf_vect_ngram = TfidfVectorizer(analyzer='word', ngram_range=(1,2 ), max_features=max_features)
  tfidf_vect_ngram.fit(train_dataframe['text'])
  xtrain_tfidf_ngram = tfidf_vect_ngram.transform(train_dataframe['text'])
  xvalid_tfidf_ngram = tfidf_vect_ngram.transform(valid_x)
  return xtrain_tfidf, xvalid_tfidf, xtrain_tfidf_ngram, xvalid_tfidf_ngram

In [12]:
def clasification_benchmark(txt_description, classifier, train_y, feature_vector_train, feature_vector_valid):
  classifier.fit(feature_vector_train, train_y)
  predictions = classifier.predict(feature_vector_valid)
  accuracy = metrics.accuracy_score(valid_y, predictions)
  print(txt_description, ' ', accuracy)
  return accuracy

In [13]:
def calculate_classifiers_accuracy(max_feature, train_dataframe, train_y, valid_x, valid_y):

  xtrain_tfidf, xvalid_tfidf, xtrain_tfidf_ngram, xvalid_tfidf_ngram = calculate_feature_vectors_tfifd(max_feature, train_dataframe, valid_x, valid_y)
  acc_arr = []
  acc_arr.append(clasification_benchmark('LR, word,       ', linear_model.LogisticRegression(), train_y, xtrain_tfidf, xvalid_tfidf)) 
  acc_arr.append(clasification_benchmark('LR, (1-2)gram   ', linear_model.LogisticRegression(), train_y, xtrain_tfidf_ngram, xvalid_tfidf_ngram))
  return acc_arr

# 2. Constants and paths defintion

In [14]:
fig_path = '/content/drive/MyDrive/figures/'
max_feature = 20000
no_samples_per_class = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 400, 800, 1200]
 

# 3. Benchmarks
## 3.1 Raw corpora benchmark

In [15]:
def benchamrk(CORPORA_TRAIN, CORPORA_TEST) :
  train = load_corpora_to_dataframe(CORPORA_TRAIN)
  test = load_corpora_to_dataframe(CORPORA_TEST)

  # test dataset will be always the same for ale tests 
  # train, test = model_selection.train_test_split(trainDF, test_size=0.2, random_state=42)
  valid_x = test['text']
  valid_y = test['label']

  encoder = preprocessing.LabelEncoder()
  valid_y = encoder.fit_transform(valid_y)

  accurracy_matrix = []

  for n_sample in no_samples_per_class:
    print(n_sample)
    dataset_fraction = train.groupby(['label']).sample(n=n_sample, replace=True)

    train_y = encoder.fit_transform(dataset_fraction['label'])
    accurracy_per_run = calculate_classifiers_accuracy(max_feature, dataset_fraction, train_y, valid_x, valid_y)
    accurracy_matrix.append([n_sample] + accurracy_per_run)
    with open(metrice_path, 'a') as f:
        f.write(str([n_sample] + accurracy_per_run))

  df = pandas.DataFrame(accurracy_matrix)
  return df

In [16]:
CORPORA_TRAIN = '/content/drive/MyDrive/master_datasets/dataset_conll/all.text.train.txt'
CORPORA_TEST = '/content/drive/MyDrive/master_datasets/dataset_conll/all.text.test.txt'
df = benchamrk(CORPORA_TRAIN, CORPORA_TEST)
df

10


NameError: ignored

In [None]:
CORPORA_TRAIN = '/content/drive/MyDrive/master_datasets/dataset_conll/all.sentence.train.txt'
CORPORA_TEST = '/content/drive/MyDrive/master_datasets/dataset_conll/all.sentence.test.txt'
df = benchamrk(CORPORA_TRAIN, CORPORA_TEST)
df

In [None]:
CORPORA_TRAIN = '/content/drive/MyDrive/master_datasets/dataset_conll/hotels.text.train.txt'
CORPORA_TEST = '/content/drive/MyDrive/master_datasets/dataset_conll/hotels.text.test.txt'
df = benchamrk(CORPORA_TRAIN, CORPORA_TEST)
df

In [None]:
CORPORA_TRAIN = '/content/drive/MyDrive/master_datasets/dataset_conll/hotels.sentence.train.txt'
CORPORA_TEST = '/content/drive/MyDrive/master_datasets/dataset_conll/hotels.sentence.test.txt'
df = benchamrk(CORPORA_TRAIN, CORPORA_TEST)
df

In [None]:
CORPORA_TRAIN = '/content/drive/MyDrive/master_datasets/dataset_conll/Nhotels.text.train.txt'
CORPORA_TEST = '/content/drive/MyDrive/master_datasets/dataset_conll/hotels.text.test.txt'
df = benchamrk(CORPORA_TRAIN, CORPORA_TEST)
df

In [None]:
CORPORA_TRAIN = '/content/drive/MyDrive/master_datasets/dataset_conll/Nhotels.sentence.train.txt'
CORPORA_TEST = '/content/drive/MyDrive/master_datasets/dataset_conll/hotels.sentence.test.txt'
df = benchamrk(CORPORA_TRAIN, CORPORA_TEST)
df