In [9]:
from collections import Counter
import numpy as np
import os
import pandas as pd
from sklearn.linear_model import LogisticRegression
import torch.nn as nn
import torch
from torch_rnn_classifier import TorchRNNClassifier
from torch_tree_nn import TorchTreeNN
import sst
import utils
import spacy 
import random
import re
import time
from typing import List, Union
from faker import Faker
import multiprocessing as mp

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Relative location of the SST sentiment data, joined portably with os.path.
SST_HOME = os.path.join('data', 'sentiment')
# NOTE(review): hard-coded absolute path into one user's home directory; it is
# not referenced by any visible cell — confirm whether it is still needed or
# should be derived from SST_HOME.
data_path = '/home/americanthinker/notebooks/pytorch/cs224u/data/sentiment/'

In [4]:
# Load the SST training split from SST_HOME. include_subtrees=False is passed to
# sst.train_reader (presumably restricting the frame to full sentences —
# confirm against the sst module).
sst_train = sst.train_reader(SST_HOME, include_subtrees=False)

In [5]:
# Class balance of the training labels.
sst_train['label'].value_counts()

positive    3610
negative    3310
neutral     1624
Name: label, dtype: int64

In [6]:
# Summarize the training frame's columns, dtypes and non-null counts.
sst_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8544 entries, 0 to 318573
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   example_id  8544 non-null   object
 1   sentence    8544 non-null   object
 2   label       8544 non-null   object
 3   is_subtree  8544 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 333.8+ KB


### 1. Data Augmentation

#### Part A: Replacing Named Entities

In [91]:
# Keep the first five training sentences as a small sample for the NER experiments.
sentences = sst_train['sentence'].iloc[:5].tolist()

In [92]:
# Display the sample sentences.
sentences

["The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .",
 "The gorgeously elaborate continuation of `` The Lord of the Rings '' trilogy is so huge that a column of words can not adequately describe co-writer\\/director Peter Jackson 's expanded vision of J.R.R. Tolkien 's Middle-earth .",
 'Singer\\/composer Bryan Adams contributes a slew of songs -- a few potential hits , a few more simply intrusive to the story -- but the whole package certainly captures the intended , er , spirit of the piece .',
 "You 'd think by now America would have had enough of plucky British eccentrics with hearts of gold .",
 'Yet the act is still charming here .']

In [7]:
# Load spaCy's 'en_core_web_trf' pipeline once; the augmentation functions in
# this notebook read this module-level `nlp` object.
nlp = spacy.load('en_core_web_trf')

In [94]:
# Run the spaCy pipeline over each sample sentence, one Doc per sentence.
docs = list(map(nlp, sentences))

In [111]:
# Sanity check: run NER on the first sample sentence and report whether any
# entity was labelled PERSON.
test = nlp(sentences[0])
test_labels = list(map(lambda ent: ent.label_, test.ents))
test_labels
print("found" if "PERSON" in test_labels else "Does not exist")


['PERSON', 'DATE', 'WORK_OF_ART', 'PERSON', 'PERSON', 'PERSON']

found


In [8]:
def ner_sentence_generator(df: pd.DataFrame, n_augments: Union[int, None] = None, labels: Union[str, List[str]] = 'neutral') -> pd.DataFrame:
    """Append name-swapped copies of sentences that mention a person.

    For each requested label, the sentences carrying that label are run
    through the module-level spaCy pipeline ``nlp``. Every sentence in which
    spaCy finds at least one PERSON entity yields one new row whose PERSON
    tokens are each replaced by a random Faker first name.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame; must contain 'sentence' and 'label' columns. It is not
        modified.
    n_augments : int or None
        If truthy, only the first ``n_augments`` sentences of each label are
        examined. ``None`` (or 0) examines all of them.
    labels : str or list of str
        Label value(s) whose sentences should be augmented.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` followed by the augmented rows, with a fresh
        0..n-1 index. Augmented rows have ``example_id='augmented'`` and
        ``is_subtree=0``. When nothing is augmented (including an empty
        ``labels`` list) the result is ``df`` with its index reset.
    """
    if isinstance(labels, str):
        labels = [labels]

    faker = Faker()
    augmented_rows = []

    for label in labels:
        sentences = df.loc[df['label'] == label, 'sentence'].tolist()
        if n_augments:
            sentences = sentences[:n_augments]

        # nlp.pipe batches the texts through the pipeline and yields the docs
        # in input order.
        for doc in nlp.pipe(sentences):
            if not any(ent.label_ == 'PERSON' for ent in doc.ents):
                continue

            # Rebuild the sentence token by token instead of str.replace on the
            # raw text: replace() rewrites every substring occurrence, which can
            # corrupt unrelated words and previously inserted names.
            # first_name() is used because name().split()[0] may be an
            # honorific such as "Dr.".
            new_sentence = ''.join(
                faker.first_name() + token.whitespace_ if token.ent_type_ == 'PERSON' else token.text_with_ws
                for token in doc
            )
            augmented_rows.append(
                {'example_id': 'augmented', 'sentence': new_sentence, 'label': label, 'is_subtree': 0}
            )

    if not augmented_rows:
        return df.reset_index(drop=True)

    # A single concat with ignore_index=True keeps the index unique regardless
    # of how many labels were requested.
    return pd.concat([df, pd.DataFrame(augmented_rows)], ignore_index=True)
    

In [33]:
def multiprocess_ner_sentence_generator(df: pd.DataFrame) -> pd.DataFrame:
    """Build name-swapped copies of the 'neutral' sentences in ``df``.

    Single-argument variant (presumably meant to be mapped over DataFrame
    chunks by ``parallelize_dataframe`` — confirm against the caller). The
    'neutral' sentences are run through the module-level spaCy pipeline
    ``nlp``; each sentence containing at least one PERSON entity yields one
    new row whose PERSON tokens are each replaced by a random Faker first name.

    Parameters
    ----------
    df : pd.DataFrame
        Frame (or chunk) with 'sentence' and 'label' columns. Not modified.

    Returns
    -------
    pd.DataFrame
        Only the augmented rows, with ``example_id='augmented'``,
        ``label='neutral'`` and ``is_subtree=0``. Empty (no columns) when no
        sentence qualifies.
    """
    faker = Faker()
    sentences = df.loc[df['label'] == 'neutral', 'sentence'].tolist()

    row_dicts = []
    # nlp.pipe batches the texts through the pipeline and yields the docs in
    # input order.
    for doc in nlp.pipe(sentences):
        if not any(ent.label_ == 'PERSON' for ent in doc.ents):
            continue

        # Rebuild the sentence token by token instead of str.replace on the raw
        # text: replace() rewrites every substring occurrence, which can
        # corrupt unrelated words and previously inserted names. first_name()
        # is used because name().split()[0] may be an honorific such as "Dr.".
        new_sentence = ''.join(
            faker.first_name() + token.whitespace_ if token.ent_type_ == 'PERSON' else token.text_with_ws
            for token in doc
        )
        row_dicts.append(
            {'example_id': 'augmented', 'sentence': new_sentence, 'label': 'neutral', 'is_subtree': 0}
        )

    return pd.DataFrame(row_dicts)
    

In [46]:
def parallelize_dataframe(df, func, n_cores=mp.cpu_count()):
    """Apply ``func`` to row-wise chunks of ``df`` in a process pool.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to split along its rows.
    func : callable
        Picklable function taking one DataFrame chunk and returning a pandas
        object that ``pd.concat`` can combine.
    n_cores : int
        Number of chunks and of worker processes. The default is the CPU
        count captured when this function was defined.

    Returns
    -------
    pd.DataFrame
        ``pd.concat`` of the per-chunk results, in chunk order.
    """
    # Split row *positions* and slice with iloc. Calling np.array_split on the
    # DataFrame itself goes through DataFrame.swapaxes, which pandas has
    # deprecated. Chunk sizes are the same as np.array_split would produce.
    row_groups = np.array_split(np.arange(len(df)), n_cores)
    chunks = [df.iloc[rows] for rows in row_groups]

    # The context manager tears the pool down even when func raises in a
    # worker, so no processes are leaked.
    with mp.Pool(n_cores) as pool:
        results = pool.map(func, chunks)

    return pd.concat(results)

In [57]:
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    Irregular forms ("won't", "can't", and their treebank-tokenised spellings
    "wo n't" / "ca n't") are expanded first, case-insensitively, keeping a
    leading capital ("Won't" -> "Will not"). The generic suffix rules are then
    applied in order, case-sensitively.

    The suffix rules are purely textual: "'s" always becomes " is" (also for
    possessives) and "'d" always becomes " would". A suffix that is already
    preceded by a space (e.g. "does n't") keeps that space, so the result
    contains a double space.

    Parameters
    ----------
    phrase : str
        Text to expand.

    Returns
    -------
    str
        The text with contractions expanded.
    """
    def _keep_initial_case(expansion):
        # Build a re.sub callback that capitalizes the expansion when the
        # matched contraction itself starts with an uppercase letter.
        def _replace(match):
            return expansion.capitalize() if match.group(0)[0].isupper() else expansion
        return _replace

    # specific: these must run before the generic "n't" rule, which would
    # otherwise leave "wo not" / "ca not" behind.
    irregular_rules = (
        (r"won't|\bwo n't", "will not"),
        (r"can't|\bca n't", "can not"),
    )
    for pattern, expansion in irregular_rules:
        phrase = re.sub(pattern, _keep_initial_case(expansion), phrase, flags=re.IGNORECASE)

    # general: order matters ("n't" has to be consumed before the bare "'t").
    suffix_rules = (
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
    for pattern, expansion in suffix_rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

In [22]:
# NOTE(review): `aug` is not defined in any visible cell — presumably the frame
# returned by ner_sentence_generator; confirm it exists on a fresh kernel run.
# Takes the sentence of the augmented row labelled 8820, expands "n't" by hand
# and collapses the resulting double space.
aug[aug['example_id'] == 'augmented'].loc[8820].values[1].replace("n't", " not").replace('  ', ' ')

'Jeanette gives a good performance in a film that does not merit it .'