<a href="https://colab.research.google.com/github/bogus1aw/text-classification-benchmark/blob/main/M_TD_IDF_wiki_34.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TF-IDF benchmark for wiki dataset https://clarin-pl.eu/dspace/handle/11321/738

Google Drive is mounted manually from the GUI.

In [None]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

import matplotlib.pyplot as plt

import datetime
import time

# Filename-friendly run timestamp: ISO 8601 at second precision with the
# colons of the time part swapped for dashes.
_now = datetime.datetime.now()
timestamp = _now.isoformat(timespec='seconds').replace(':', '-')


# 1. Function definitions

In [None]:
def load_corpora_to_dataframe(corpora):
  """Load a labelled corpus file into a DataFrame.

  Every non-blank line of the file is split on whitespace: the first token
  becomes the label and the remaining tokens, re-joined with single spaces,
  become the text.

  Args:
    corpora: path to the corpus text file.

  Returns:
    pandas.DataFrame with the columns 'text' and 'label', one row per
    non-blank line of the file.
  """
  labels, texts = [], []
  # The context manager closes the handle; the explicit encoding keeps the
  # result independent of the platform default
  # (assumes the corpus is stored as UTF-8 -- TODO confirm).
  with open(corpora, encoding='utf-8') as corpus_file:
    for line in corpus_file:
      content = line.split()
      if not content:
        # Blank lines (e.g. the one produced by a trailing newline) carry no
        # label; indexing content[0] on them would raise IndexError.
        continue
      labels.append(content[0])
      texts.append(" ".join(content[1:]))

  # create a dataframe using texts and labels
  return pandas.DataFrame({'text': texts, 'label': labels})

In [None]:
def calculate_feature_vectors_tfifd(max_features):
  """Build word-level and (1-2)-gram TF-IDF features for both data splits.

  Reads the module-level ``train_x`` and ``valid_x`` text collections.
  Each vectorizer is fitted on the training split only and then applied to
  the validation split, so the vocabulary and IDF weights never see
  validation documents (fitting on the whole dataset leaks validation
  statistics into the benchmark).

  Args:
    max_features: value passed to ``TfidfVectorizer(max_features=...)``.

  Returns:
    Tuple ``(xtrain_tfidf, xvalid_tfidf, xtrain_tfidf_ngram,
    xvalid_tfidf_ngram)``: the transformed training and validation data for
    the word-level vectorizer, followed by the same pair for the
    ``ngram_range=(1, 2)`` vectorizer.
  """
  # word level tf-idf
  tfidf_vect = TfidfVectorizer(analyzer='word', max_features=max_features)
  xtrain_tfidf = tfidf_vect.fit_transform(train_x)
  xvalid_tfidf = tfidf_vect.transform(valid_x)
  # ngram level tf-idf
  tfidf_vect_ngram = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), max_features=max_features)
  xtrain_tfidf_ngram = tfidf_vect_ngram.fit_transform(train_x)
  xvalid_tfidf_ngram = tfidf_vect_ngram.transform(valid_x)
  return xtrain_tfidf, xvalid_tfidf, xtrain_tfidf_ngram, xvalid_tfidf_ngram

In [None]:
def clasification_benchmark(txt_description, classifier, feature_vector_train, feature_vector_valid):
  """Train a classifier and report its accuracy on the validation features.

  Reads the module-level ``train_y`` and ``valid_y`` label arrays.

  Args:
    txt_description: text printed in front of the accuracy value.
    classifier: estimator object exposing ``fit`` and ``predict``.
    feature_vector_train: features passed to ``fit`` together with ``train_y``.
    feature_vector_valid: features passed to ``predict``.

  Returns:
    The result of ``metrics.accuracy_score`` for ``valid_y`` against the
    predictions made on ``feature_vector_valid``.
  """
  classifier.fit(feature_vector_train, train_y)
  score = metrics.accuracy_score(valid_y, classifier.predict(feature_vector_valid))
  print(txt_description, ' ', score)
  return score

In [None]:
def calculate_classifiers_accuracy(feature_no_list):
  """Benchmark NB, SVM and LR classifiers for each requested feature count.

  For every entry of ``feature_no_list`` the TF-IDF features are rebuilt via
  ``calculate_feature_vectors_tfifd`` and six benchmarks are run through
  ``clasification_benchmark``, each with a freshly constructed classifier.

  Args:
    feature_no_list: iterable of feature counts to evaluate.

  Returns:
    A list with one row per feature count. Each row is
    ``[max_feature, NB word, NB (1-2)grams, SVM word, SVM (1-2)grams,
    LR word, LR (1-2)gram]`` where the last six entries are accuracies.
  """
  accuracy_matrix = []

  for max_feature in feature_no_list:
    print(max_feature)
    word_train, word_valid, ngram_train, ngram_valid = calculate_feature_vectors_tfifd(max_feature)

    # (description, classifier factory, training features, validation features)
    # in the order the accuracies appear in the output row.
    benchmarks = (
        ('NB,  word,      ', naive_bayes.MultinomialNB, word_train, word_valid),
        ('NB,  (1-2)grams ', naive_bayes.MultinomialNB, ngram_train, ngram_valid),
        ('SVM, word,      ', svm.SVC, word_train, word_valid),
        ('SVM, (1-2)grams ', svm.SVC, ngram_train, ngram_valid),
        ('LR, word,       ', linear_model.LogisticRegression, word_train, word_valid),
        ('LR, (1-2)gram   ', linear_model.LogisticRegression, ngram_train, ngram_valid),
    )

    row = [max_feature]  # the feature count leads each output row
    for description, make_classifier, train_features, valid_features in benchmarks:
      row.append(clasification_benchmark(description, make_classifier(), train_features, valid_features))
    accuracy_matrix.append(row)

  return accuracy_matrix

# 2. Constants and path definitions

In [None]:
# Directory under which figure files are saved.
fig_path = '/content/drive/MyDrive/figures/'

# Feature counts to benchmark.
feature_no_list = [
    1000,
    5000,
    10000,
    15000,
    20000,
    40000,
]

# raw corpora
raw_corpora = '/content/drive/MyDrive/master_datasets/wiki_preprocessed/wikiInOneFileDataset.txt'
save_heat_map_path_raw = fig_path + 'wiki_tfidf_classfiers_features_number_heatmap.svg'
# lemmatized corpora
lemmatized_corpora = '/content/drive/MyDrive/master_datasets/wiki_preprocessed/wikiInOneFileDataset_lemmas.txt'
save_heat_map_path_lemmas = fig_path + 'wiki_tfidf_classfiers_features_number_heatmap_lemmas.svg'

# Backward-compatible alias. This name used to be assigned twice in a row, so
# the raw-corpus path was silently overwritten and only the lemmatized path
# survived; the alias keeps that final value while the two names above make
# both paths available.
save_heat_map_path = save_heat_map_path_lemmas

# 3. Benchmarks
## 3.1 Raw corpora benchmark

In [None]:
# Load the raw corpus; each row holds one document's text and its label.
trainDF = load_corpora_to_dataframe(raw_corpora)

# split the dataset into training and validation datasets
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'], test_size=0.2, random_state=42)

# label encode the target variable
# The encoder is fitted once on the full label column and then only applied
# to each split. Refitting it on the validation labels would build a second,
# independent mapping whose integer codes differ from the training ones
# whenever the two splits do not contain the same set of classes.
encoder = preprocessing.LabelEncoder()
encoder.fit(trainDF['label'])
train_y = encoder.transform(train_y)
valid_y = encoder.transform(valid_y)

# One row per feature count: the count followed by six accuracies.
accurracy_matrix_raw = calculate_classifiers_accuracy(feature_no_list)
df = pandas.DataFrame(accurracy_matrix_raw, 
                      columns=('no_featues', 'NB,  word,      ', 'NB,  (1-2)grams ', 'SVM, word,      ', 'SVM, (1-2)grams ', 'LR, word,       ', 'LR, (1-2)gram   '))
df

1000
NB,  word,         0.6971677559912854
NB,  (1-2)grams    0.6884531590413944
SVM, word,         0.739288307915759
SVM, (1-2)grams    0.7334785766158315
LR, word,          0.7378358750907771
LR, (1-2)gram      0.7334785766158315
5000
NB,  word,         0.8053740014524329
NB,  (1-2)grams    0.7988380537400145
SVM, word,         0.8583877995642701
SVM, (1-2)grams    0.8562091503267973
LR, word,          0.8612926652142339
LR, (1-2)gram      0.8533042846768337
10000
NB,  word,         0.8126361655773421
NB,  (1-2)grams    0.8097312999273784
SVM, word,         0.8634713144517067
SVM, (1-2)grams    0.8540305010893247
LR, word,          0.869281045751634
LR, (1-2)gram      0.8634713144517067
15000
NB,  word,         0.8191721132897604
NB,  (1-2)grams    0.8155410312273057
SVM, word,         0.8671023965141612
SVM, (1-2)grams    0.8641975308641975
LR, word,          0.878721859114016
LR, (1-2)gram      0.8671023965141612
20000
NB,  word,         0.8206245461147422
NB,  (1-2)grams    0.8206

Unnamed: 0,no_featues,"NB, word,","NB, (1-2)grams","SVM, word,","SVM, (1-2)grams","LR, word,","LR, (1-2)gram"
0,1000,0.697168,0.688453,0.739288,0.733479,0.737836,0.733479
1,5000,0.805374,0.798838,0.858388,0.856209,0.861293,0.853304
2,10000,0.812636,0.809731,0.863471,0.854031,0.869281,0.863471
3,15000,0.819172,0.815541,0.867102,0.864198,0.878722,0.867102
4,20000,0.820625,0.820625,0.86565,0.862019,0.874365,0.873638
5,40000,0.824256,0.825708,0.860566,0.860566,0.879448,0.879448


In [None]:
# fig = df.plot.line(figsize=(14,8), xlabel='No. of features', ylabel='Accuracy').get_figure()

In [None]:
# fig.savefig(fig_path + f'wiki_tfidf_classfiers_features_number_accuracy_chart{timestamp}.svg')

## 3.2 Lemmatized corpora benchmark

In [None]:
# Load the lemmatized corpus; each row holds one document's text and its label.
trainDF = load_corpora_to_dataframe(lemmatized_corpora)

# split the dataset into training and validation datasets
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'], test_size=0.2, random_state=42)

# label encode the target variable
# The encoder is fitted once on the full label column and then only applied
# to each split. Refitting it on the validation labels would build a second,
# independent mapping whose integer codes differ from the training ones
# whenever the two splits do not contain the same set of classes.
encoder = preprocessing.LabelEncoder()
encoder.fit(trainDF['label'])
train_y = encoder.transform(train_y)
valid_y = encoder.transform(valid_y)

# One row per feature count: the count followed by six accuracies.
accurracy_matrix_lemmas = calculate_classifiers_accuracy(feature_no_list)
df = pandas.DataFrame(accurracy_matrix_lemmas, 
                      columns=('no_featues' ,'NB,  word,      ', 'NB,  (1-2)grams ', 'SVM, word,      ', 'SVM, (1-2)grams ', 'LR, word,       ', 'LR, (1-2)gram   '))
df

1000
NB,  word,         0.813362381989833
NB,  (1-2)grams    0.8053740014524329
SVM, word,         0.8496732026143791
SVM, (1-2)grams    0.8402323892519971
LR, word,          0.8540305010893247
LR, (1-2)gram      0.8460421205519245
5000
NB,  word,         0.8533042846768337
NB,  (1-2)grams    0.85039941902687
SVM, word,         0.8896151053013798
SVM, (1-2)grams    0.8859840232389252
LR, word,          0.8997821350762527
LR, (1-2)gram      0.8946986201888163
10000
NB,  word,         0.8576615831517792
NB,  (1-2)grams    0.859840232389252
SVM, word,         0.8946986201888163
SVM, (1-2)grams    0.8976034858387799
LR, word,          0.9026870007262164
LR, (1-2)gram      0.9034132171387074
15000
NB,  word,         0.8641975308641975
NB,  (1-2)grams    0.8627450980392157
SVM, word,         0.8946986201888163
SVM, (1-2)grams    0.8961510530137982
LR, word,          0.9041394335511983
LR, (1-2)gram      0.906318082788671
20000
NB,  word,         0.859840232389252
NB,  (1-2)grams    0.8663761

Unnamed: 0,no_featues,"NB, word,","NB, (1-2)grams","SVM, word,","SVM, (1-2)grams","LR, word,","LR, (1-2)gram"
0,1000,0.813362,0.805374,0.849673,0.840232,0.854031,0.846042
1,5000,0.853304,0.850399,0.889615,0.885984,0.899782,0.894699
2,10000,0.857662,0.85984,0.894699,0.897603,0.902687,0.903413
3,15000,0.864198,0.862745,0.894699,0.896151,0.904139,0.906318
4,20000,0.85984,0.866376,0.895425,0.893246,0.904866,0.906318
5,40000,0.856935,0.862019,0.891794,0.895425,0.899056,0.904139


In [None]:
# fig =  df.plot.line(figsize=(14,8), xlabel='No. of features', ylabel='Accuracy').get_figure()

In [None]:
# fig.savefig(fig_path + 'wiki_tfidf_classfiers_features_number_accuracy_chart_lemmas.svg')

In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# fig, ax = plt.subplots(figsize=(14,8))

# sns.heatmap(df, annot=True, cmap="Blues")
# plt.savefig(save_heat_map_path)
# plt.xlabel('Classifier and features vector type')
# plt.ylabel('No. of features')