<a href="https://colab.research.google.com/github/bogus1aw/text-classification-benchmark/blob/main/M_TD_IDF_PolEmo2_0_logisticregresion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TF-IDF benchmark for PolEmo 2.0 dataset https://clarin-pl.eu/dspace/handle/11321/710

Google Drive is mounted manually from the Colab GUI before running the notebook.

In [1]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

import matplotlib.pyplot as plt

import datetime
import time

# Filesystem-safe start-of-run timestamp: second resolution, ISO-like layout,
# with '-' in place of ':' inside the time part.
timestamp = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M-%S')

# 1. Function definitions

In [2]:
def load_corpora_to_dataframe(corpora):
  """Read a labelled corpus file into a DataFrame.

  Every non-blank line is split on whitespace: the last token is taken as the
  label and the remaining tokens, re-joined with single spaces, as the text.

  Args:
    corpora: path of the corpus file to read.

  Returns:
    pandas.DataFrame with the columns 'text' and 'label', one row per
    non-blank line, in file order.
  """
  labels, texts = [], []
  # The context manager closes the handle deterministically, and the explicit
  # encoding keeps non-ASCII characters from depending on the machine's locale.
  with open(corpora, encoding='utf-8') as corpus_file:
    for line in corpus_file:
      content = line.split()
      if content:  # skip blank / whitespace-only lines
        labels.append(content[-1])
        texts.append(" ".join(content[:-1]))

  # create a dataframe using texts and labels
  return pandas.DataFrame({'text': texts, 'label': labels})

In [3]:
def calculate_feature_vectors_tfifd(max_features, train_dataframe, valid_x, valid_y):
  """Build word-level and (1-2)-gram TF-IDF matrices for the train and validation texts.

  Both vectorizers are fitted on train_dataframe['text'] only and are then
  applied to those training texts and to valid_x.

  Args:
    max_features: forwarded to TfidfVectorizer as max_features.
    train_dataframe: frame whose 'text' column holds the training documents.
    valid_x: validation documents.
    valid_y: not used by this function.

  Returns:
    Tuple (xtrain_tfidf, xvalid_tfidf, xtrain_tfidf_ngram, xvalid_tfidf_ngram).
  """
  train_texts = train_dataframe['text']

  def fit_and_project(vectorizer):
    # Fit on the training texts only, then vectorize both splits.
    vectorizer.fit(train_texts)
    return vectorizer.transform(train_texts), vectorizer.transform(valid_x)

  # word level tf-idf
  word_train, word_valid = fit_and_project(
      TfidfVectorizer(analyzer='word', max_features=max_features))
  # ngram level tf-idf
  ngram_train, ngram_valid = fit_and_project(
      TfidfVectorizer(analyzer='word', ngram_range=(1, 2), max_features=max_features))
  return word_train, word_valid, ngram_train, ngram_valid

In [4]:
def clasification_benchmark(txt_description, classifier, train_y, valid_y, feature_vector_train, feature_vector_valid):
  """Fit a classifier, score it on the validation features and print the score.

  Args:
    txt_description: text printed in front of the accuracy value.
    classifier: object exposing fit(X, y) and predict(X).
    train_y: training targets.
    valid_y: validation targets the predictions are compared against.
    feature_vector_train: training feature matrix.
    feature_vector_valid: validation feature matrix.

  Returns:
    The value of metrics.accuracy_score(valid_y, predictions).
  """
  classifier.fit(feature_vector_train, train_y)
  score = metrics.accuracy_score(valid_y, classifier.predict(feature_vector_valid))
  print(txt_description, ' ', score)
  return score

In [5]:
def calculate_classifiers_accuracy(max_feature, train_dataframe, train_y, valid_x, valid_y):
  """Benchmark logistic regression on word-level and (1-2)-gram TF-IDF features.

  Args:
    max_feature: vocabulary cap forwarded to the TF-IDF feature builder.
    train_dataframe: frame whose 'text' column holds the training documents.
    train_y: training targets.
    valid_x: validation documents.
    valid_y: validation targets.

  Returns:
    List of two accuracies: [word-level, (1-2)-gram].
  """
  word_train, word_valid, ngram_train, ngram_valid = calculate_feature_vectors_tfifd(
      max_feature, train_dataframe, valid_x, valid_y)

  # (printed description, training matrix, validation matrix) per configuration.
  runs = (
      ('LR, word,       ', word_train, word_valid),
      ('LR, (1-2)gram   ', ngram_train, ngram_valid),
  )
  # A fresh LogisticRegression is created for every configuration.
  return [
      clasification_benchmark(description, linear_model.LogisticRegression(), train_y, valid_y, x_train, x_valid)
      for description, x_train, x_valid in runs
  ]

# 2. Constants and paths definition

In [6]:
# Upper bound on the TF-IDF vocabulary size.
max_feature = 20000

# Training-set sizes (rows drawn per class) to benchmark:
# 10..100 in steps of 10, followed by a coarser tail up to 1200.
no_samples_per_class = list(range(10, 101, 10)) + [120, 140, 160, 180, 200, 400, 800, 1200]

# Locations on the mounted Google Drive.
dataset_path = '/content/drive/MyDrive/master_datasets/dataset_conll/'
fig_path = '/content/drive/MyDrive/figures/'
# The run timestamp is embedded in the metrics file name.
metrice_path = '/content/drive/MyDrive/metrics/tf-idf_PolEmo2.0_logisticregression_raw' + timestamp + '.txt'
 

# 3. Benchmarks
## 3.1 Raw corpora benchmark

In [10]:
def benchamrk(CORPORA_TRAIN, CORPORA_TEST, index, random_state=None):
  """Measure accuracy against training-set size for one train/test corpus pair.

  For every size in no_samples_per_class, that many rows per label are drawn
  (with replacement) from the training corpus, the classifiers are trained on
  the draw and evaluated on the full test corpus. Each result is also appended
  to the file at metrice_path, one record per line.

  Args:
    CORPORA_TRAIN: training corpus file name, relative to dataset_path.
    CORPORA_TEST: test corpus file name, relative to dataset_path.
    index: labels for the accuracies returned by
      calculate_classifiers_accuracy; they become the row labels of the result.
    random_state: optional seed forwarded to the per-label sampling so a run
      can be reproduced. The default (None) keeps the sampling unseeded.

  Returns:
    pandas.DataFrame with one row per entry of index and one column per
    training-set size.
  """
  train = load_corpora_to_dataframe(dataset_path + CORPORA_TRAIN)
  test = load_corpora_to_dataframe(dataset_path + CORPORA_TEST)

  # The test set is the same for every training-set size.
  valid_x = test['text']

  # Fit the encoder once on the labels of both splits. Refitting it separately
  # on the test labels and on each training draw would assign different
  # integers to the same label whenever the two label sets differ.
  encoder = preprocessing.LabelEncoder()
  encoder.fit(pandas.concat([train['label'], test['label']]))
  valid_y = encoder.transform(test['label'])

  accurracy_matrix = []

  for n_sample in no_samples_per_class:
    print(n_sample)
    # replace=True allows n_sample to exceed the rows available for a label.
    dataset_fraction = train.groupby(['label']).sample(n=n_sample, replace=True, random_state=random_state)

    train_y = encoder.transform(dataset_fraction['label'])
    accurracy_per_run = calculate_classifiers_accuracy(max_feature, dataset_fraction, train_y, valid_x, valid_y)
    accurracy_matrix.append(accurracy_per_run)
    with open(metrice_path, 'a') as f:
      # Terminate each record so successive appends do not run together.
      f.write(str([n_sample] + accurracy_per_run) + '\n')

  # Rows of accurracy_matrix are training-set sizes; transpose so that the
  # sizes become columns and the entries of index become rows.
  df = pandas.DataFrame(accurracy_matrix, columns=index, index=no_samples_per_class).T
  return df

In [11]:
# (corpus file prefix, row-label prefix) for every benchmarked domain.
domains = [('all', 'MDT-A'),
           ('hotels', 'SDT-H'),
           ('medicine', 'SDT-M'),
           ('products', 'SDT-P'),
           ('reviews', 'SDT-R')]


# Collect one result frame per domain and concatenate once at the end:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# and it copies the accumulated frame on every call.
domain_frames = []

for domian, ix_name in domains:
  print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
  print('%%%%%%%%  ' + domian)
  print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
  CORPORA_TRAIN = domian + '.text.train.txt'
  CORPORA_TEST = domian + '.text.test.txt'
  domain_frames.append(benchamrk(CORPORA_TRAIN, CORPORA_TEST, index=[ix_name + '_R_1g', ix_name + '_R_2g']))

df = pandas.concat(domain_frames)

df

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%  all
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
10
LR, word,          0.551219512195122
LR, (1-2)gram      0.5536585365853659
20
LR, word,          0.5573170731707318
LR, (1-2)gram      0.5487804878048781
30
LR, word,          0.6158536585365854
LR, (1-2)gram      0.6292682926829268
40
LR, word,          0.6707317073170732
LR, (1-2)gram      0.6780487804878049
50
LR, word,          0.6609756097560976
LR, (1-2)gram      0.6573170731707317
60
LR, word,          0.6743902439024391
LR, (1-2)gram      0.6658536585365854
70
LR, word,          0.6804878048780488
LR, (1-2)gram      0.6878048780487804
80
LR, word,          0.6926829268292682
LR, (1-2)gram      0.6780487804878049
90
LR, word,          0.6926829268292682
LR, (1-2)gram      0.6951219512195121
100
LR, word,          0.675609756097561
LR, (1-2)gram      0.6902439024390243
120
LR, word,          0.6853658536585366
LR, (1-2)gram      0.6829268292682927
140
LR, word,          0.6817073170731708
LR,

Unnamed: 0,10,20,30,40,50,60,70,80,90,100,120,140,160,180,200,400,800,1200
MDT-A_R_1g,0.55122,0.557317,0.615854,0.670732,0.660976,0.67439,0.680488,0.692683,0.692683,0.67561,0.685366,0.681707,0.709756,0.704878,0.703659,0.729268,0.753659,0.752439
MDT-A_R_2g,0.553659,0.54878,0.629268,0.678049,0.657317,0.665854,0.687805,0.678049,0.695122,0.690244,0.682927,0.69878,0.729268,0.707317,0.717073,0.745122,0.780488,0.768293
SDT-H_R_1g,0.582278,0.643038,0.678481,0.686076,0.724051,0.713924,0.718987,0.696203,0.716456,0.736709,0.731646,0.736709,0.749367,0.764557,0.782278,0.777215,0.802532,0.805063
SDT-H_R_2g,0.574684,0.622785,0.698734,0.691139,0.767089,0.731646,0.756962,0.713924,0.724051,0.746835,0.741772,0.744304,0.764557,0.787342,0.802532,0.794937,0.807595,0.812658
SDT-M_R_1g,0.614679,0.681957,0.697248,0.743119,0.685015,0.715596,0.746177,0.724771,0.70948,0.743119,0.755352,0.712538,0.752294,0.75841,0.749235,0.776758,0.779817,0.798165
SDT-M_R_2g,0.617737,0.691131,0.681957,0.755352,0.666667,0.712538,0.767584,0.730887,0.712538,0.749235,0.776758,0.743119,0.7737,0.779817,0.7737,0.788991,0.795107,0.816514
SDT-P_R_1g,0.541667,0.6875,0.708333,0.645833,0.729167,0.625,0.541667,0.666667,0.645833,0.666667,0.75,0.708333,0.666667,0.708333,0.729167,0.75,0.791667,0.791667
SDT-P_R_2g,0.5,0.645833,0.729167,0.583333,0.708333,0.583333,0.5625,0.708333,0.729167,0.75,0.729167,0.708333,0.708333,0.75,0.770833,0.75,0.708333,0.729167
SDT-R_R_1g,0.64,0.5,0.52,0.62,0.68,0.64,0.62,0.62,0.58,0.68,0.68,0.76,0.72,0.68,0.72,0.7,0.62,0.66
SDT-R_R_2g,0.62,0.5,0.52,0.6,0.64,0.68,0.68,0.66,0.6,0.72,0.68,0.74,0.74,0.68,0.68,0.68,0.7,0.66
