In [1]:
!pip install checklist





# Generating Data

In [2]:
# Based on tutorial at: https://github.com/marcotcr/checklist/blob/master/notebooks/tutorials/1.%20Generating%20data.ipynb
import checklist
from checklist.editor import Editor
from checklist.perturb import Perturb

In [3]:
# Basic template: every value in `adj` fills the {adj} slot once.
# The `a:` prefix lets the editor choose "a"/"an" to match the filled-in word,
# instead of hard-coding "a" (which produced "a awesome" and "a excellent").
editor = Editor()
ret = editor.template('This is {a:adj} movie.', adj=['good', 'great', 'awesome', 'excellent'])
ret.data

['This is a good movie.',
 'This is a great movie.',
 'This is an awesome movie.',
 'This is an excellent movie.']

In [4]:
# Template over a dict: both the 'question' and the 'context' strings are filled
# from `adj`, and `labels` is a template of its own that is filled from `badj`.
# NOTE(review): the hard-coded articles do not adapt to the inserted word
# (first label renders as "No, this is bad movie."). checklist's `{a:adj}` /
# `{a:badj}` placeholders presumably fix this; confirm and re-run before changing.
ret = editor.template({'question': 'Is this a {adj} movie?',
                       'context': 'This is a {adj} movie.' },
                      labels='No, this is {badj} movie.',
                      adj=['good', 'great', 'awesome', 'excellent'],
                      badj=['bad', 'not great', 'not awesome', 'not excellent'])

In [5]:
# Show the first generated example (a question/context dict) and its label;
# the bare print() only adds a trailing blank line.
print(ret.data[0])
print(ret.labels[0])
print()

{'question': 'Is this a good movie?', 'context': 'This is a good movie.'}
No, this is bad movie.



In [6]:
# Lexicons usage: list the names of the lexicons that ship with the editor.
# These names can be used directly as template placeholders (e.g. {female}, {country}).
print(list(editor.lexicons.keys()))

['male', 'female', 'first_name', 'first_pronoun', 'last_name', 'country', 'nationality', 'city', 'religion', 'religion_adj', 'sexual_adj', 'country_city', 'male_from', 'female_from', 'last_from']


In [7]:
# No values are passed for {female} or {country}: both are filled from the
# editor's lexicons of the same name. Display only the first four results.
ret = editor.template('{female} is not same as  {country}')
ret.data[0:4]

['Mary is not same as  China',
 'Elizabeth is not same as  China',
 'Margaret is not same as  China',
 'Sarah is not same as  China']

In [8]:
# {male1} and {male2} are two separate slots that, per the output, are both filled with male names.
# NOTE(review): remove_duplicates=True presumably drops examples where both
# slots receive the same name; confirm against the checklist documentation.
ret = editor.template('{male1} is not friends with {male2}', remove_duplicates=True)
ret.data[0:4]

['William is not friends with John',
 'James is not friends with John',
 'David is not friends with John',
 'Robert is not friends with John']

# Perturbing Data

In [9]:
# Based on https://github.com/marcotcr/checklist/blob/master/notebooks/tutorials/2.%20Perturbing%20data.ipynb

In [10]:
import re
def change_professions(x, *args, **kwargs):
    """Return copies of ``x`` with each known profession swapped for the others.

    For every profession that occurs in ``x`` as a whole word, one new string is
    produced per alternative profession. Only the profession word is replaced;
    the rest of the sentence (including any preceding article) is left as is.
    Extra positional and keyword arguments are accepted and ignored.

    Returns an empty list when ``x`` mentions none of the professions.
    """
    professions = ('doctor', 'nurse', 'engineer', 'lawyer')
    perturbed = []
    for current in professions:
        # Word boundaries keep e.g. "nurses" from matching "nurse".
        pattern = re.compile(r'\b%s\b' % current)
        if pattern.search(x) is None:
            continue
        for replacement in professions:
            if replacement != current:
                perturbed.append(pattern.sub(replacement, x))
    return perturbed

In [11]:
# Sample sentences for the perturbation demos. Only the last three mention a
# profession that change_professions knows about (nurse, engineer, lawyer).
data = ['John is a very smart person, he lives in Ireland.',
        'Mark Stewart was born and raised in Chicago',
        'Luke Smith has 3 sisters.',
        'Ram is not a nurse.',
        'Sita is an engineer.',
        'My brother Ramesh used to be a lawyer.']

In [12]:
# Run change_professions over every sentence. As the output shows, sentences for
# which it returns nothing are left out, and with keep_original=True each group
# starts with the unmodified sentence followed by its perturbed variants.
ret = Perturb.perturb(data, change_professions, keep_original=True)
ret.data

[['Ram is not a nurse.',
  'Ram is not a doctor.',
  'Ram is not a engineer.',
  'Ram is not a lawyer.'],
 ['Sita is an engineer.',
  'Sita is an doctor.',
  'Sita is an nurse.',
  'Sita is an lawyer.'],
 ['My brother Ramesh used to be a lawyer.',
  'My brother Ramesh used to be a doctor.',
  'My brother Ramesh used to be a nurse.',
  'My brother Ramesh used to be a engineer.']]

In [13]:

def change_professions_withmeta(x, meta=False, *args, **kwargs):
    """Swap each known profession in ``x`` for the others, optionally with metadata.

    For every profession that occurs in ``x`` as a whole word, one new string is
    produced per alternative profession. Only the profession word is replaced;
    the rest of the sentence is left as is. Extra positional and keyword
    arguments are accepted and ignored.

    Returns the list of perturbed strings, or, when ``meta`` is truthy, a tuple
    ``(strings, pairs)`` where ``pairs[i]`` is the ``(old, new)`` profession
    pair that produced ``strings[i]``. Both lists are empty when ``x`` mentions
    none of the professions.
    """
    professions = ('doctor', 'nurse', 'engineer', 'lawyer')
    texts = []
    swaps = []
    for current in professions:
        # Word boundaries keep e.g. "nurses" from matching "nurse".
        pattern = re.compile(r'\b%s\b' % current)
        if not pattern.search(x):
            continue
        for replacement in professions:
            if replacement == current:
                continue
            texts.append(pattern.sub(replacement, x))
            swaps.append((current, replacement))
    return (texts, swaps) if meta else texts

In [14]:
# Direct call with meta=True: returns the perturbed sentences together with the
# (old, new) profession pair behind each one.
change_professions_withmeta(data[4], True)

(['Sita is an doctor.', 'Sita is an nurse.', 'Sita is an lawyer.'],
 [('engineer', 'doctor'), ('engineer', 'nurse'), ('engineer', 'lawyer')])

In [15]:
# Same perturbation, but asking for metadata as well. Per the output, ret.meta
# lines up with ret.data entry for entry, with None standing in for each kept
# original sentence.
ret = Perturb.perturb(data, change_professions_withmeta, keep_original=True, meta=True)
print('Data')
print(ret.data)
print('Metadata')
print(ret.meta)

Data
[['Ram is not a nurse.', 'Ram is not a doctor.', 'Ram is not a engineer.', 'Ram is not a lawyer.'], ['Sita is an engineer.', 'Sita is an doctor.', 'Sita is an nurse.', 'Sita is an lawyer.'], ['My brother Ramesh used to be a lawyer.', 'My brother Ramesh used to be a doctor.', 'My brother Ramesh used to be a nurse.', 'My brother Ramesh used to be a engineer.']]
Metadata
[[None, ('nurse', 'doctor'), ('nurse', 'engineer'), ('nurse', 'lawyer')], [None, ('engineer', 'doctor'), ('engineer', 'nurse'), ('engineer', 'lawyer')], [None, ('lawyer', 'doctor'), ('lawyer', 'nurse'), ('lawyer', 'engineer')]]


In [16]:
!pip install spacy



In [18]:
# Some Perturb functions work on spaCy-parsed text rather than raw strings,
# so load the en_core_web_sm pipeline once and reuse it as `nlp`.
import spacy
nlp = spacy.load("en_core_web_sm")
# Alternative way to load the model, through its own package:
# import en_core_web_sm
# nlp = en_core_web_sm.load()

In [19]:
# Parse all sample sentences with spaCy up front; the parsed docs are what
# gets passed to Perturb.punctuation.
pdata = list(nlp.pipe(data))

In [25]:
# Punctuation: per the output, each sentence is paired with a variant whose
# final period is removed (or added when the original had none).
ret = Perturb.perturb(pdata, Perturb.punctuation)
ret.data[:4]

[['John is a very smart person, he lives in Ireland.',
  'John is a very smart person, he lives in Ireland'],
 ['Mark Stewart was born and raised in Chicago',
  'Mark Stewart was born and raised in Chicago.'],
 ['Luke Smith has 3 sisters.', 'Luke Smith has 3 sisters'],
 ['Ram is not a nurse.', 'Ram is not a nurse']]

In [26]:
# Typos: show the original sentence next to a copy with a typo injected
# (in the captured output two adjacent characters were swapped).
# NOTE(review): the typo position looks random; confirm whether a seed
# should be set so the output is reproducible on re-run.
data[0], Perturb.add_typos(data[0])

('John is a very smart person, he lives in Ireland.',
 'John is  avery smart person, he lives in Ireland.')

In [28]:
# Contractions: contract() takes a plain string and shortens what it can
# (here "is not" becomes "isn't"); shown next to the original for comparison.
data[3], Perturb.contract(data[3])

('Ram is not a nurse.', "Ram isn't a nurse.")

In [31]:
# contractions() returns variants in both directions: per the output, one with
# existing contractions expanded and one with possible contractions applied.
Perturb.contractions('What\'s the matter? You are not happy')

['What is the matter? You are not happy',
 "What's the matter? You're not happy"]

In [33]:
# Negation: remove_negation is given a spaCy-parsed doc (hence nlp(t)), not a raw
# string. Print each sentence followed by its un-negated version; per the
# output the verb form is adjusted too ("didn't play" -> "played").
for t in ['This is not good', 'He didn\'t play the guitar', 'He doesn\'t play anything', 'She wasn\'t sad']:
    print(t)
    print(Perturb.remove_negation(nlp(t)))
    print()

This is not good
This is good

He didn't play the guitar
He played the guitar

He doesn't play anything
He plays anything

She wasn't sad
She was sad

