In [4]:
!pip install numpy git+https://github.com/makcedward/nlpaug.git
!pip install transformers
!pip install tqdm

Collecting git+https://github.com/makcedward/nlpaug.git
  Cloning https://github.com/makcedward/nlpaug.git to /tmp/pip-req-build-zk_m2flr
  Running command git clone -q https://github.com/makcedward/nlpaug.git /tmp/pip-req-build-zk_m2flr
Building wheels for collected packages: nlpaug
  Building wheel for nlpaug (setup.py) ... [?25l[?25hdone
  Created wheel for nlpaug: filename=nlpaug-1.1.3-cp37-none-any.whl size=837629 sha256=faffa85e8b7d249889efb6f4799e46b39f971b5a447d63e0d8c9ccc6edf456b9
  Stored in directory: /tmp/pip-ephem-wheel-cache-ktmwfvak/wheels/2b/ef/30/a4e22f9a97373c9ab6763670c94aa5e111b0b956983f3892a4
Successfully built nlpaug


In [5]:
import pandas as pd
import numpy as np
import random
import re
from tqdm import tqdm

In [6]:
################################
# text processing and cleaning #
################################

def is_ascii(w):
  try:
    w.encode("ascii")
    return True
  except UnicodeEncodeError:
    return False

def encode_taxonomy(taxonomy):
  """
  Label encode taxonomy for training distilbert.

  Parameters:
  -----------
  taxonomy: int, taxonomy class
  
  Returns:
  --------
  encoding: int, encoded taxonomy class
  """
  encoding_dict = {2: 0, 9: 1, 59: 2, 70: 3, 173: 4, 268: 5, 273: 6, 280: 7}
  encoding = encoding_dict[taxonomy]

  return encoding


def text_cleaning(text):
  """
  Clean text from symbols, punctuation, etc.

  Parameters:
  -----------
  text: string, text data
  
  Returns:
  --------
  cleaned_text: string, cleaned text data
  """
  # remove string formatting '\n' or '\t'
  tmp_text = re.sub(r'\n+', '. ', text)
  tmp_text = re.sub(r'\t+', '. ', text)
  # remove words with non-ascii characters
  tmp_text = " ".join([word for word in tmp_text.split() if is_ascii(word)])
  # remove email address
  tmp_text = " ".join([word for word in tmp_text.split() if not word.startswith("@")])
  # remove urls
  tmp_text = re.sub(r'http\S+', '', tmp_text, flags=re.MULTILINE)
  tmp_text = re.sub(r'www\S+', '', tmp_text, flags=re.MULTILINE)
  # remove punctuation but . (to split sentences)
  cleaned_text = re.sub('[^A-Za-z.,]+', ' ', tmp_text)
  # lowercase
  cleaned_text = cleaned_text.lower()

  return cleaned_text


def text_preprocessing_a1(text):
  """
  Approach 1: Join first 2 sentences with last 2 sentences of the text.
  
  Parameters:
  -----------
  text: string, text data
  
  Returns:
  --------
  preprocessed_text: string, preprocessed text data
  """
  # sentence tokenize based on '. '
  sentences = text.split('. ')
  # get 2 first and 2 last sentences
  if len(sentences) >= 4:
    preprocessed_text = ". ".join(text.split('. ')[:2] + text.split('. ')[-2:])
    return preprocessed_text
  # if there are not 4 sentences, return full text
  else:
    preprocessed_text = text
    return preprocessed_text


def preprocessing_a1(df):
  """
  Cleaning and preprocessing following approach 1.

  Parameters:
  -----------
  df: Pandas DataFrame, df with `title`, `description` and `taxonomy`
  
  Returns:
  --------
  preprocessed_df: Pandas DataFrame, df with `text` (preprocessed text) and `taxonomy`
  """
  # drop rows with missing descriptions and drop title
  cleaned_df = df.dropna(axis=0).drop('title', axis=1)
  # clean description
  cleaned_df['description'] = cleaned_df['description'].map(text_cleaning)
  # preprocess description
  preprocessed_df = cleaned_df.copy()
  preprocessed_df['description'] = cleaned_df['description'].map(text_preprocessing_a1)
  preprocessed_df = preprocessed_df.rename(columns={'description': 'text'})
  # keep examples with less than 210 tokens
  preprocessed_df = preprocessed_df[preprocessed_df['text'].map(lambda x: len(x.split())) < 210]

  # encode taxonomy
  preprocessed_df['taxonomy'] = preprocessed_df['taxonomy'].map(encode_taxonomy)

  return preprocessed_df

In [7]:
# train data
TRAIN_FILE = '/content/drive/MyDrive/Colab Notebooks/PFM/data/train_v0.csv'
train = pd.read_csv(TRAIN_FILE, index_col=0, encoding='utf-8')

In [8]:
train.head()

Unnamed: 0,title,description,taxonomy
10525,Bernina Accessoireag,Meet the Bernina accessories:\n* By a Bernina ...,273
19671,ABC iPad - Like a computer and even more,Every Thursday and Sunday from 11 to 12\nA cou...,70
37172,Meisterkonzert,,173
43220,GZA,The legendary founder and leader of Wu-Tang Cl...,173
6905,Little BIG City,Best of Berlin - from 'CafÃ© Achteck' over the...,273


In [9]:
# preprocess train data
preprocessed_train = preprocessing_a1(train).reset_index(drop=True)

In [10]:
preprocessed_train.head()

Unnamed: 0,text,taxonomy
0,meet the bernina accessories by a bernina expe...,6
1,every thursday and sunday from to a course to ...,3
2,the legendary founder and leader of wu tang cl...,4
3,best of berlin from achteck over the wall to t...,6
4,with helena attlee and rachel roddy.,6


## Data Augmentation

Low performers:

- Class 2 (Taxonomy 59), 3(Taxonomy 70), 5(Taxonomy 268) and 7(Taxonomy 280).

Increase all these classes by 25% of their examples.

In [11]:
import nltk
nltk.download("wordnet")
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords

import nlpaug.augmenter.word as naw

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [12]:
stopwords_list = stopwords.words('english')

In [13]:
def augment_text_data_contextual(train_data, classes):
  """Augment data using word substitution by contextual word embeddings with
  DistilBERT"""
  # initialize augmenter
  aug = naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased', action="insert")
  # create empty lists
  aug_texts = []
  labels = []
  # iterate over classes to be augmented
  for class_label in tqdm(classes, total=len(classes)):
    # take subset of class_label
    subset = train_data.loc[train_data['taxonomy'] == class_label]
    indexes = list(subset.index)
    n_samples = int(round(subset.shape[0] * 0.75, 0))
    # generate synthetic examples
    for i in range(n_samples):
      idx = indexes[random.randint(0, len(indexes)-1)]
      indexes.remove(idx)
      text = subset.loc[idx, 'text']
      aug_text = aug.augment(text)
      label = subset.loc[idx, 'taxonomy']
      # append
      aug_texts.append(aug_text)
      labels.append(label)
  
  return aug_texts, labels
      

In [14]:
def augment_text_data_synonym(train_data, classes):
  """Augment data using synonym substitution by wordnet"""
  # initialize augmenter
  aug = naw.SynonymAug(aug_src='wordnet', aug_p=0.15, stopwords=stopwords_list)
  # create empty lists
  aug_texts = []
  labels = []
  # iterate over classes to be augmented
  for class_label in tqdm(classes, total=len(classes)):
    # take subset of class_label
    subset = train_data.loc[train_data['taxonomy'] == class_label]
    indexes = list(subset.index)
    n_samples = int(round(subset.shape[0] * 0.75, 0))
    # generate synthetic examples
    for i in range(n_samples):
      idx = indexes[random.randint(0, len(indexes)-1)]
      indexes.remove(idx)
      text = subset.loc[idx, 'text']
      aug_text = aug.augment(text, num_thread=4)
      label = subset.loc[idx, 'taxonomy']
      # append
      aug_texts.append(aug_text)
      labels.append(label)
  
  return aug_texts, labels

In [19]:
def augment_text_data_embedding(train_data, classes):
  """Augment data using word substitution by word2vec similarity"""
  # initialize augmenter
  aug = naw.WordEmbsAug(
    model_type='word2vec',
    action="substitute",
    aug_p=0.15,
    stopwords=stopwords_list
  )
  # create empty lists
  aug_texts = []
  labels = []
  # iterate over classes to be augmented
  for class_label in tqdm(classes, total=len(classes)):
    # take subset of class_label
    subset = train_data.loc[train_data['taxonomy'] == class_label]
    indexes = list(subset.index)
    n_samples = int(round(subset.shape[0] * 0.75, 0))
    # generate synthetic examples
    for i in range(n_samples):
      idx = indexes[random.randint(0, len(indexes)-1)]
      indexes.remove(idx)
      text = subset.loc[idx, 'text']
      aug_text = aug.augment(text, num_thread=4)
      label = subset.loc[idx, 'taxonomy']
      # append
      aug_texts.append(aug_text)
      labels.append(label)
  
  return aug_texts, labels

In [16]:
classes_to_aug = [2, 3, 5, 7]

In [None]:
# contextual
aug_texts, labels = augment_text_data_contextual(train_data=preprocessed_train, classes=classes_to_aug)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




 75%|███████▌  | 3/4 [5:18:53<1:51:50, 6710.88s/it]

In [None]:
new_train_data = pd.DataFrame([aug_texts, labels]).T.rename(columns={0: 'text', 1: 'taxonomy'})
new_train_data.to_csv("/content/drive/MyDrive/Colab Notebooks/PFM/data/new_train_v2.csv", index=False)

In [None]:
pd.read_csv("/content/drive/MyDrive/Colab Notebooks/PFM/data/new_train.csv")

Unnamed: 0,text,taxonomy
0,the big festival of the book contest for the y...,2
1,substratum is an architectural firm principall...,2
2,nape ridge is the term used to express on the ...,2
3,dane mitchell s new experimental exhibition ir...,2
4,nowadays it generally is free and requires dec...,2
...,...,...
3310,everywhere we make candle holders.,7
3311,use of a variety of media materials to collage...,7
3312,tickets for la gallina concepcion de los huevo...,7
3313,"village runs to a tight schedule, and they nev...",7


In [27]:
# synonym
aug_texts, labels = augment_text_data_synonym(train_data=preprocessed_train, classes=classes_to_aug)

100%|██████████| 4/4 [16:55<00:00, 253.89s/it]


In [28]:
new_train_data = pd.DataFrame([aug_texts, labels]).T.rename(columns={0: 'text', 1: 'taxonomy'})
new_train_data.to_csv("/content/drive/MyDrive/Colab Notebooks/PFM/data/new_train_synonym.csv", index=False)

In [17]:
# word embedding augmenter
import gzip
with gzip.open("/content/drive/MyDrive/Colab Notebooks/PFM/data/GoogleNews-vectors-negative300.bin.gz", "rb") as f:
  model = f.read()
  f.close()

In [21]:
aug_texts, labels = augment_text_data_embedding(train_data=preprocessed_train, classes=classes_to_aug)

IsADirectoryError: ignored