In [1]:
# the following file contains the code for the BASE model

In [64]:
# import dependencies
import pandas as pd
import numpy as np
import re
import enchant
import sys
from collections import Counter

In [65]:
def prepare_data(file_path):
    # load dataframe
    df = pd.read_csv(file_path, encoding="latin1")
    
    # drop nan
    df = df.dropna()
    
    padding_length = 4
    sentence_length = 75
    splits = split_in_sentences(sentence_length, padding_length, df)
    output = filter_splits(splits, False)
    output['Word'] = output['Word']
    output = output.dropna()
    return output


# filter the sentences
def filter_splits(d, f=True):
    if f:
        split_tag = d.groupby('Split #')['Tag'].apply(list)
        to_remove = []
        for key, value in split_tag.items():
            tags_in_split = list(set(value))
            if len(tags_in_split) == 1 and tags_in_split[0] == 'O':
                d = d[d['Split #'] != key]
        
    return d


# split the dataset into sentences
def split_in_sentences(sen_len, pad_len, d):
    doc_words = d.groupby('File_id')['Word'].apply(list)
    doc_tags = d.groupby('File_id')['Tag'].apply(list)
    
    splits = []
    cur_counter = 0
    
    for i in range(len(doc_words)):
        cur_doc = list(doc_words)[i]
        cur_tags = list(doc_tags)[i]
        cur_counter += 1
        for j in range(len(cur_doc)):
            splits.append('split ' + str(cur_counter))
            if ((j % sen_len) == 0 and j != 0):  
                cur_counter += 1
    
    d['Split #'] = splits
    return d

In [66]:
f_path = './data/final/dataset.csv'
d = prepare_data(f_path)
eng_dict = enchant.Dict("en_US")

In [67]:
words = d['Word'].values
tags = d['Tag'].values

NATIONALITIES = (pd.read_csv('./nationalities.csv', encoding="latin1"))['Nationality'].values

wordset = set(words)

In [68]:
# check the entries for the base model
def validate(text):
    classifications = []
    
    if check_email(text):
        classifications.append('EMAIL')
    
    elif check_address(text):
        classifications.append('ADDRESS')
        
    elif check_phone(text):
        classifications.append('PHONE')
        
    elif check_name(text):
        classifications.append('NAME')
        
    elif check_religion(text):
        classifications.append('RELIGION')
        
    elif check_gender(text):
        classifications.append('GENDER')
        
    elif check_age(text):
        classifications.append('AGE')
        
    elif check_dob(text):
        classifications.append('DOB')
    
    elif check_ethnicity(text):
        classifications.append('ETHNICITY')
    
    elif check_nationality(text):
        classifications.append('NATIONALITY')
        
    else:
        classifications.append('O')
        
    return classifications[0]
        
    
def check_age(text):
    AGE = r"\d{1,3}\syears|\d{1,3}\syear"
    
    if re.match(AGE, text):
        return True
    else:
        return False
    
def check_email(text):
    EMAIL = r"(?:[a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`{|}~-]+)*|\"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"
    
    if re.match(EMAIL, text):
        return True
    else:
        return False
    
def check_phone(text):
    PHONE = r"^([+]?[\s0-9]+)?(\d{3}|[(]?[0-9]+[)])?([-]?[\s]?[0-9])+$"
    if re.match(PHONE, text):
        return True
    else:
        return False

def check_name(text):
    if text[0].isupper():
        if (not eng_dict.check(text.lower())):
            return True
    return False


def check_religion(text):
    religions = ['christianity', 'islam', 'judiasm', 'buddhism', 'christian', 'jew', 'muslim', 'buddhist']
    if text.lower() in religions:
        return True
    return False


def check_gender(text):
    genders = ['male', 'female']
    if text.lower() in genders:
        return True
    return False

def check_address(text):
    ADDRESS = r".*street.*|.*road.*|.*lane.*|.*avenue.*|.*drive.*|.*boulevard.*"
    if re.match(ADDRESS, text.lower()):
        return True
    else:
        return False

def check_dob(text):
    DOB = r"(?:\d{1,2}[-/\s]\d{1,2}[-/\s]'?\d{2,4})|(?:\d{2,4}[-/\s]\d{1,2}[-/\s]\d{1,2})|(?:(?:January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sept|Sep|Oct|Nov|Dec)[-/\s,]*?\d{1,2}(?:\s)*(?:rd|th|st)?(?:\s)*[-/,]?(?:\s)*'?\d{2,4})|(?:\d{1,2}(?:\s)*(?:rd|th|st)?(?:\s)*(?:January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sept|Sep|Oct|Nov|Dec)(?:\s)*?[-/,]?(?:\s)*'?\d{2,4})"    
    if re.match(DOB, text):
        return True
    else:
        return False

    
def check_ethnicity(text):
    ethnicity = ['caucasian', 'african american', 'asian', 'hispanic', 'native american']
    if text.lower() in ethnicity:
        return True
    return False    


def check_nationality(text):
    if text in NATIONALITIES:
        return True
    return False 

In [69]:
output = []
for i in range(len(words)):
    output.append(validate(words[i]))

In [70]:
print(len(output), len(tags))

160125 160125


In [71]:
mod_tags = tags

for j in range(len(mod_tags)):
    if mod_tags[j] != 'O':
        mod_tags[j] = mod_tags[j].split('-')[1]

y_true = tags

final_y = []
final_pred = []
for j in range(len(y_true)):
    if y_true[j] != "O":
        final_y.append(y_true[j])
        final_pred.append(output[j])
        

['O' 'O' 'B-NAME' ... 'O' 'O' 'O']


In [72]:
from sklearn.metrics import precision_recall_fscore_support

In [73]:
precision_recall_fscore_support(final_y, final_pred, average='micro')

(0.46424192889252147, 0.46424192889252147, 0.46424192889252147, None)

In [74]:
precision_recall_fscore_support(final_y, final_pred, average='macro')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(0.643636220230971, 0.42443314035328245, 0.41454750093695075, None)

In [45]:
print(final_y)

['NAME']


              precision    recall  f1-score   support

     ADDRESS       1.00      0.02      0.04       818
         AGE       0.00      0.00      0.00        17
         DOB       1.00      0.02      0.04       154
       EMAIL       1.00      0.95      0.97       204
   ETHNICITY       0.00      0.00      0.00         7
      GENDER       1.00      1.00      1.00        29
        NAME       0.50      0.80      0.62       487
 NATIONALITY       1.00      0.13      0.24        67
           O       0.00      0.00      0.00         0
       PHONE       0.58      0.74      0.65       662
    RELIGION       1.00      1.00      1.00         2

    accuracy                           0.46      2447
   macro avg       0.64      0.42      0.41      2447
weighted avg       0.78      0.46      0.42      2447

