<a href="https://colab.research.google.com/github/bogus1aw/text-classification-benchmark/blob/main/M_TD_IDF_wiki_34.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TF-IDF benchmark for wiki dataset https://clarin-pl.eu/dspace/handle/11321/738

gdrive mounted manually form GUI

In [1]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

import matplotlib.pyplot as plt

import datetime
import time

timestamp = datetime.datetime.now().replace(microsecond=0).isoformat().replace(':', '-')


# 1. Function definitions

In [2]:
def load_corpora_to_dataframe(corpora):
  data = open(corpora).read()
  labels, texts = [], []
  for i, line in enumerate(data.split("\n")):
      content = line.split()
      labels.append(content[0])
      texts.append(" ".join(content[1:]))

  # create a dataframe using texts and lables
  trainDF = pandas.DataFrame()
  trainDF['text'] = texts
  trainDF['label'] = labels
  return trainDF

In [3]:
def calculate_feature_vectors_tfifd(max_features):
  # word level tf-idf
  tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'(?u)\b\w+\b', max_features=max_features)
  tfidf_vect.fit(trainDF['text']) 
  xtrain_tfidf =  tfidf_vect.transform(train_x)
  xvalid_tfidf =  tfidf_vect.transform(valid_x)
  # ngram level tf-idf 
  tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'(?u)\b\w+\b', ngram_range=(1,2 ), max_features=max_features)
  tfidf_vect_ngram.fit(trainDF['text'])
  xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
  xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)
  return xtrain_tfidf, xvalid_tfidf, xtrain_tfidf_ngram, xvalid_tfidf_ngram

In [4]:
def clasification_benchmark(txt_description, classifier, feature_vector_train, feature_vector_valid):
  classifier.fit(feature_vector_train, train_y)
  predictions = classifier.predict(feature_vector_valid)
  accuracy = metrics.accuracy_score(valid_y, predictions)
  print(txt_description, ' ', accuracy)
  return accuracy

In [5]:
def calculate_classifiers_accuracy(feature_no_list):
  accuracy_matrix = []

  for max_feature in feature_no_list:
    print(max_feature)
    xtrain_tfidf, xvalid_tfidf, xtrain_tfidf_ngram, xvalid_tfidf_ngram = calculate_feature_vectors_tfifd(max_feature)
    acc_list = []
    acc_list.append(max_feature) # add max_feature number to output array
    acc_list.append(clasification_benchmark('NB,  word,      ', naive_bayes.MultinomialNB(), xtrain_tfidf, xvalid_tfidf)) 
    acc_list.append(clasification_benchmark('NB,  (1-2)grams ', naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, xvalid_tfidf_ngram)) 
    acc_list.append(clasification_benchmark('SVM, word,      ', svm.SVC(), xtrain_tfidf, xvalid_tfidf)) 
    acc_list.append(clasification_benchmark('SVM, (1-2)grams ', svm.SVC(), xtrain_tfidf_ngram, xvalid_tfidf_ngram)) 
    acc_list.append(clasification_benchmark('LR, word,       ', linear_model.LogisticRegression(), xtrain_tfidf, xvalid_tfidf)) 
    acc_list.append(clasification_benchmark('LR, (1-2)gram   ', linear_model.LogisticRegression(), xtrain_tfidf_ngram, xvalid_tfidf_ngram))
    accuracy_matrix.append(acc_list)
  return accuracy_matrix

# 2. Constants and paths defintion

In [6]:
fig_path = '/content/drive/MyDrive/figures/'
feature_no_list = [1000, 
                   5000, 
                   10000,
                   15000,
                   20000,
                   40000,
                   ]
 

# raw corpora
raw_corpora = '/content/drive/MyDrive/master_datasets/wiki_preprocessed/wikiInOneFileDataset.txt'
save_heat_map_path = fig_path + 'wiki_tfidf_classfiers_features_number_heatmap.svg'
# lemmatized corpora
lemmatized_corpora = '/content/drive/MyDrive/master_datasets/wiki_preprocessed/wikiInOneFileDataset_lemmas.txt'
save_heat_map_path = fig_path + 'wiki_tfidf_classfiers_features_number_heatmap_lemmas.svg'

# 3. Benchmarks

In [None]:
# count unique tokens
trainDF = load_corpora_to_dataframe(raw_corpora)
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'(?u)\b\w+\b')
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'(?u)\b\w+\b', ngram_range=(1,2 ))
tfidf_vect.fit(trainDF['text'])
tfidf_vect_ngram.fit(trainDF['text'])
print('(raw dataset) number of unique tokens for:')
print('monograms', len(tfidf_vect.get_feature_names()))
print('(1-2)grams', len(tfidf_vect_ngram.get_feature_names()))

In [None]:
trainDF = load_corpora_to_dataframe(lemmatized_corpora)
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'(?u)\b\w+\b')
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'(?u)\b\w+\b', ngram_range=(1,2 ))
tfidf_vect.fit(trainDF['text'])
tfidf_vect_ngram.fit(trainDF['text'])
print('(lemmatized dataset) number of unique tokens for:')
print('monograms', len(tfidf_vect.get_feature_names()))
print('(1-2)grams', len(tfidf_vect_ngram.get_feature_names()))

## 3.1 Raw corpora benchmark

In [7]:
trainDF = load_corpora_to_dataframe(raw_corpora)

# split the dataset into training and validation datasets
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'], test_size=0.2, random_state=42)

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)
accurracy_matrix_raw = calculate_classifiers_accuracy(feature_no_list)
df = pandas.DataFrame(accurracy_matrix_raw, 
                      columns=('no_featues', 'NB,  word,      ', 'NB,  (1-2)grams ', 'SVM, word,      ', 'SVM, (1-2)grams ', 'LR, word,       ', 'LR, (1-2)gram   '))
df

1000
NB,  word,         0.6971677559912854
NB,  (1-2)grams    0.6811909949164852
SVM, word,         0.7436456063907044
SVM, (1-2)grams    0.7312999273783588
LR, word,          0.7378358750907771
LR, (1-2)gram      0.7269426289034132
5000
NB,  word,         0.7923021060275962
NB,  (1-2)grams    0.7799564270152506
SVM, word,         0.859840232389252
SVM, (1-2)grams    0.8467683369644154
LR, word,          0.8612926652142339
LR, (1-2)gram      0.8474945533769063
10000
NB,  word,         0.8053740014524329
NB,  (1-2)grams    0.8046477850399419
SVM, word,         0.8700072621641249
SVM, (1-2)grams    0.8612926652142339
LR, word,          0.869281045751634
LR, (1-2)gram      0.8634713144517067
15000
NB,  word,         0.8119099491648512
NB,  (1-2)grams    0.8046477850399419
SVM, word,         0.8678286129266521
SVM, (1-2)grams    0.8641975308641975
LR, word,          0.8743645606390704
LR, (1-2)gram      0.8678286129266521
20000
NB,  word,         0.813362381989833
NB,  (1-2)grams    0.8090

Unnamed: 0,no_featues,"NB, word,","NB, (1-2)grams","SVM, word,","SVM, (1-2)grams","LR, word,","LR, (1-2)gram"
0,1000,0.697168,0.681191,0.743646,0.7313,0.737836,0.726943
1,5000,0.792302,0.779956,0.85984,0.846768,0.861293,0.847495
2,10000,0.805374,0.804648,0.870007,0.861293,0.869281,0.863471
3,15000,0.81191,0.804648,0.867829,0.864198,0.874365,0.867829
4,20000,0.813362,0.809005,0.864198,0.861293,0.872912,0.872186
5,40000,0.814815,0.81772,0.862745,0.857662,0.875091,0.875817


In [8]:
# fig = df.plot.line(figsize=(14,8), xlabel='No. of features', ylabel='Accuracy').get_figure()

In [9]:
# fig.savefig(fig_path + f'wiki_tfidf_classfiers_features_number_accuracy_chart{timestamp}.svg')

## 3.2 Lemmatized corpora benchamrk

In [10]:
trainDF = load_corpora_to_dataframe(lemmatized_corpora)

# split the dataset into training and validation datasets
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'], test_size=0.2, random_state=42)

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)
accurracy_matrix_lemmas = calculate_classifiers_accuracy(feature_no_list)
df = pandas.DataFrame(accurracy_matrix_lemmas, 
                      columns=('no_featues' ,'NB,  word,      ', 'NB,  (1-2)grams ', 'SVM, word,      ', 'SVM, (1-2)grams ', 'LR, word,       ', 'LR, (1-2)gram   '))
df

1000
NB,  word,         0.8061002178649237
NB,  (1-2)grams    0.7879448075526507
SVM, word,         0.8533042846768337
SVM, (1-2)grams    0.8358750907770516
LR, word,          0.85039941902687
LR, (1-2)gram      0.8395061728395061
5000
NB,  word,         0.8438634713144517
NB,  (1-2)grams    0.8380537400145244
SVM, word,         0.8925199709513435
SVM, (1-2)grams    0.8859840232389252
LR, word,          0.8961510530137982
LR, (1-2)gram      0.8939724037763254
10000
NB,  word,         0.8518518518518519
NB,  (1-2)grams    0.8518518518518519
SVM, word,         0.8961510530137982
SVM, (1-2)grams    0.8939724037763254
LR, word,          0.8990559186637618
LR, (1-2)gram      0.9019607843137255
15000
NB,  word,         0.8554829339143064
NB,  (1-2)grams    0.8518518518518519
SVM, word,         0.8961510530137982
SVM, (1-2)grams    0.8954248366013072
LR, word,          0.8997821350762527
LR, (1-2)gram      0.8997821350762527
20000
NB,  word,         0.8547567175018156
NB,  (1-2)grams    0.854

Unnamed: 0,no_featues,"NB, word,","NB, (1-2)grams","SVM, word,","SVM, (1-2)grams","LR, word,","LR, (1-2)gram"
0,1000,0.8061,0.787945,0.853304,0.835875,0.850399,0.839506
1,5000,0.843863,0.838054,0.89252,0.885984,0.896151,0.893972
2,10000,0.851852,0.851852,0.896151,0.893972,0.899056,0.901961
3,15000,0.855483,0.851852,0.896151,0.895425,0.899782,0.899782
4,20000,0.854757,0.854757,0.89833,0.894699,0.899782,0.897603
5,40000,0.849673,0.852578,0.893972,0.894699,0.89833,0.89833


In [11]:
# fig =  df.plot.line(figsize=(14,8), xlabel='No. of features', ylabel='Accuracy').get_figure()

In [12]:
# fig.savefig(fig_path + 'wiki_tfidf_classfiers_features_number_accuracy_chart_lemmas.svg')

In [13]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# fig, ax = plt.subplots(figsize=(14,8))

# sns.heatmap(df, annot=True, cmap="Blues")
# plt.savefig(save_heat_map_path)
# plt.xlabel('Classifier and featres vector type')
# plt.ylabel('No. of features')