In [None]:
import re
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix, f1_score

In [None]:
!pip install datasets
import datasets

In [None]:
# Load the "irony" configuration of the tweet_eval dataset through the
# Hugging Face `datasets` library. The splits read from it later in the
# notebook are "train", "validation" and "test".
dataset = datasets.load_dataset("tweet_eval", "irony")

Reusing dataset tweet_eval (/root/.cache/huggingface/datasets/tweet_eval/irony/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


In [None]:
# Materialise each dataset split as its own pandas DataFrame.
df_train, df_val, df_test = (
    dataset[split_name].to_pandas()
    for split_name in ("train", "validation", "test")
)

In [None]:
def transform_text(text):
  """Normalise a raw string into lowercase, space-separated alphanumeric tokens.

  Steps, in order:
    1. lowercase the text;
    2. turn every whitespace run (tabs, newlines, ...) into one space, so
       words separated by non-space whitespace are not glued together when
       the remaining special characters are deleted;
    3. delete every character that is not an ASCII letter, digit or space;
    4. replace each stop word of the hard-coded list with a space;
    5. collapse repeated spaces and trim both ends.

  Args:
    text: the string to clean.

  Returns:
    The cleaned string (possibly empty).
  """
  text = text.lower()
  text = re.sub(r'\s+', ' ', text)
  text = re.sub(r'[^A-Za-z0-9 ]+', '', text)
  # Stop words are matched as whole tokens (\b) and replaced by a space.
  # Matching the surrounding whitespace and substituting '' would merge the
  # neighbouring words ("this is great" -> "thisgreat") and would miss stop
  # words that are adjacent to each other or at the ends of the string.
  text = re.sub(r'\b(?:a|is|be|will|the|was|were|have|has|are|been|s|ll)\b', ' ', text)
  return re.sub(r' +', ' ', text).strip()

def create_documents_list(l):
  """Tokenise every string of ``l`` on single space characters.

  Empty tokens (caused by leading, trailing or repeated spaces) are dropped.

  Args:
    l: iterable of strings.

  Returns:
    A list with one list of non-empty tokens per input string, in input order.
  """
  return [[token for token in sentence.split(' ') if token] for sentence in l]

In [None]:
# Overwrite the text column of every split with its cleaned version.
df_train["text"] = df_train["text"].apply(transform_text)
df_test["text"] = df_test["text"].apply(transform_text)
df_val["text"] = df_val["text"].apply(transform_text)

In [None]:
# Tokenise the cleaned text of each split into a list of token lists.
documents_list_train, documents_list_val, documents_list_test = (
    create_documents_list(frame["text"].tolist())
    for frame in (df_train, df_val, df_test)
)

In [None]:

def compute_idf(documents_list):
  """Compute the inverse document frequency of every word of a corpus.

  idf(w) = log(N / df(w)), where N is the number of documents and df(w) is
  the number of documents that contain ``w`` at least once (natural log).

  Args:
    documents_list: list of documents, each one a list of string tokens.

  Returns:
    dict mapping every word occurring in the corpus to its idf value;
    an empty dict when the corpus contains no words.
  """
  n_documents = len(documents_list)
  # Document frequencies are accumulated in one pass over the corpus rather
  # than rescanning every document for every vocabulary word.
  document_frequency = dict()
  for document in documents_list:
    for w in set(document):  # set(): a word counts once per document
      document_frequency[w] = document_frequency.get(w, 0) + 1
  return {
      w: np.log(n_documents / n_containing)
      for w, n_containing in document_frequency.items()
  }


def compute_tf(document):
  """Compute the relative term frequency of every token of one document.

  Args:
    document: list of string tokens.

  Returns:
    dict mapping each distinct token to (occurrences / len(document)),
    keyed in order of first occurrence; an empty dict for an empty document.
  """
  n_words = len(document)
  # Count occurrences in a single pass instead of rescanning the document
  # once per token.
  counts = dict()
  for w in document:
    counts[w] = counts.get(w, 0) + 1
  return {w: count / n_words for w, count in counts.items()}


In [None]:
def compute_tfidf(documents_list, idf, type='pandas'):
  """Build the document-term TF-IDF matrix of a list of tokenised documents.

  Row ``i`` describes ``documents_list[i]``; the columns are the keys of
  ``idf`` in sorted order. Each cell holds tf(word, document) * idf[word].
  Tokens that are not keys of ``idf`` are ignored, and words absent from a
  document stay at 0.

  Args:
    documents_list: list of documents, each one a list of string tokens.
    idf: dict mapping word -> idf value (as produced by compute_idf).
    type: 'pandas' returns a DataFrame whose column labels are the words;
      any other value returns the bare numpy array.

  Returns:
    A (len(documents_list), len(idf)) pandas DataFrame or numpy array.
  """
  # Sorted so that the column order is deterministic for a given vocabulary.
  col_names = sorted(idf)
  # word -> column position, so each token costs one dict lookup instead of
  # linear scans of the vocabulary list.
  col_index = {w: j for j, w in enumerate(col_names)}
  result = np.zeros((len(documents_list), len(col_names)))
  # tqdm wraps the list itself (not the enumerate object) so that it can read
  # the length and display real progress.
  for i, d in enumerate(tqdm(documents_list)):
    for w, tf in compute_tf(d).items():
      j = col_index.get(w)
      if j is not None:
        result[i, j] = tf * idf[w]
  if type == 'pandas':
    return pd.DataFrame(result, columns=col_names)
  return result

In [None]:
# IDF weights are estimated from the training documents only; the same
# weights are passed to compute_tfidf for every split.
idf = compute_idf(documents_list_train)

HBox(children=(FloatProgress(value=0.0, max=10026.0), HTML(value='')))




In [None]:
# TF-IDF matrices (numpy arrays) for the three splits, all weighted with the
# same idf dictionary.
tfidf_train, tfidf_test, tfidf_val = (
    compute_tfidf(split_documents, idf, type='numpy')
    for split_documents in (documents_list_train, documents_list_test, documents_list_val)
)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [None]:
# Fit a logistic-regression classifier (default hyper-parameters) on the
# hand-rolled TF-IDF features of the training split.
X = tfidf_train
y = df_train.label.values
clf = LogisticRegression()
clf.fit(X, y )

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Train performance

In [None]:
# F1 score on the same data the classifier was fitted on.
y_hat = clf.predict(X)
f1_score(y,y_hat)

0.9708535739070089

Validation performance

In [None]:
# F1 score on the validation split.
y_hat_val = clf.predict(tfidf_val)
f1_score(df_val.label,y_hat_val)

0.6254002134471719

Test performance

In [None]:
# F1 score on the test split.
y_hat_test = clf.predict(tfidf_test)
f1_score(df_test.label,y_hat_test)

0.5890804597701149

## How to do it with sklearn

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# scikit-learn's TF-IDF vectorizer with its default settings.
tfidf = TfidfVectorizer()

In [None]:
# Fit the vectorizer on the training text only. df_train.text was already
# overwritten with its transform_text-cleaned version earlier in the notebook.
tfidf.fit(df_train.text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [None]:
# Vectorize the train and test text with the fitted vectorizer
# (the validation split is not transformed here).
tfidf_train_sklearn= tfidf.transform(df_train.text)
tfidf_test_sklearn= tfidf.transform(df_test.text)

In [None]:
# Shape of the scikit-learn training matrix (documents x vocabulary terms).
tfidf_train_sklearn.shape

(2862, 9998)

In [None]:
# Second logistic-regression classifier (default hyper-parameters), fitted on
# the scikit-learn TF-IDF features of the training split.
clf2 = LogisticRegression()
clf2.fit(tfidf_train_sklearn, df_train.label )

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Test-split predictions of the scikit-learn pipeline.
# NOTE(review): this rebinds y_hat, which earlier held clf's training-set
# predictions; a distinct name would keep the two results apart.
y_hat = clf2.predict(tfidf_test_sklearn)

In [None]:
# F1 score of clf2 on the test split.
f1_score(df_test.label,y_hat)

0.5664233576642336