Create test data in two new languages for the language-dependent classes of Mimesis and Faker.

Use Chinese and Italian so that the test data contains one language that is very different from, and one that is similar to, the data the model was trained on.

In [6]:
import pandas as pd
import random
import string
from mimesis import Fieldset
from faker import Faker
random.seed(42)

# Mimesis

In [7]:
# Candidate Mimesis fields, grouped as personal / non-personal.
personal_attributes = [
    "address",
    "academic_degree",
    "blood_type",
    "email",
    "first_name",
    "full_name",
    "last_name",
    "gender",
    "language",
    "nationality",
    "occupation",
    "phone_number",
    "political_views",
    "title",
    "worldview",
    "credit_card_number",
]

non_personal_attributes = [
    "company",
    "dish",
    "drink",
    "answer",
    "color",
    "isbn",
    "duration",
    "programming_language",
    "system_quality_attribute",
    "version",
    "float_number",
    "integer_number",
    "user_agent",
    "graphics",
    "cpu",
    "phone_model",
    "manufacturer",
    "resolution",
    "word",
    "measure_unit",
    "city",
]

In [8]:
# Print every field whose first seeded sample is identical for German, Italian and Chinese.
for p in personal_attributes + non_personal_attributes:
    # New fieldsets for every field, so each field is sampled from a freshly seeded generator.
    fieldset_de, fieldset_it, fieldset_zh = (
        Fieldset(locale=code, seed=42) for code in ("de", "it", "zh")
    )
    de, it, zh = fieldset_de(p, i=1), fieldset_it(p, i=1), fieldset_zh(p, i=1)
    same_in_all_locales = de == it and de == zh
    if same_in_all_locales:
        print(p)

blood_type
email
credit_card_number
isbn
duration
programming_language
system_quality_attribute
version
float_number
integer_number
user_agent
graphics
cpu
phone_model
manufacturer
resolution
measure_unit


Remove all classes printed above, because their values are identical in all three locales.

In [9]:
# Mimesis fields that remain after removing the fields printed by the equality check.
personal_attributes_language = [
    "address",
    "academic_degree",
    "first_name",
    "full_name",
    "last_name",
    "gender",
    "language",
    "nationality",
    "occupation",
    "phone_number",
    "political_views",
    "title",
    "worldview",
]

non_personal_attributes_language = [
    "company",
    "dish",
    "drink",
    "answer",
    "color",
    "word",
    "city",
]

In [10]:
# Show one seeded sample per locale for every remaining field, for manual inspection.
for p in personal_attributes_language + non_personal_attributes_language:
    fieldsets = [Fieldset(locale=code, seed=42) for code in ("de", "it", "zh")]
    fieldset_de, fieldset_it, fieldset_zh = fieldsets
    de, it, zh = [sample_field(p, i=1) for sample_field in fieldsets]
    print(p, de, it, zh)

address ['Carl-Zuckmayer-Brücke 1310'] ['1310 Via Cartari'] ['汉兴街道1310号']
academic_degree ['Bachelor'] ['Dottorato'] ['博士']
first_name ['Alder'] ['Alessio'] ['文渲']
full_name ['Alder Fleischer'] ['Alessio Sangermano'] ['文渲 阚']
last_name ['Winkler'] ['Pannone'] ['充']
gender ['Weiblich'] ['Neutro'] ['男性']
language ['Somali'] ['Curdo'] ['祖鲁语']
nationality ['Armenier'] ['Iowa'] ['津巴布韦']
occupation ['Buchhalter / Buchhalterin'] ['Casaro'] ['印刷设计与制版人员']
phone_number ['(07211) 7550026'] ['+39 072 117 550'] ['+86 721-17550026']
political_views ['Liberal'] ['Comunista'] ['共産']
title ['Herr'] ['Sig.'] ['先生']
worldview ['Buddhismus'] ['Ateo'] ['无神论']
company ['Vector Informatik'] ['Breda'] ['上海纺织控股']
dish ['Pinkel mit Grünkohl'] ['Stracciatella'] ['辣子鸡']
drink ['Zitronenmelissesirup'] ['Cirò'] ['鲜榨汁']
answer ['Vielleicht'] ['Forse'] ['不知道']
color ['Dunkelblau'] ['Viola'] ['柠檬黄']
word ['neue'] ['porta'] ['汲取']
city ['Pohlheim'] ['Caltanissetta'] ['娄底市 ']


Also remove classes whose values are very similar across the languages, such as `phone_number`.

In [11]:
# Final Mimesis fields: the previous selection without "phone_number".
personal_attributes_language = [
    "address",
    "academic_degree",
    "first_name",
    "full_name",
    "last_name",
    "gender",
    "language",
    "nationality",
    "occupation",
    "political_views",
    "title",
    "worldview",
]

non_personal_attributes_language = [
    "company",
    "dish",
    "drink",
    "answer",
    "color",
    "word",
    "city",
]

In [12]:
# Seeded Mimesis fieldsets for the two test languages (Italian and Chinese).
FIELDSET_IT, FIELDSET_ZH = (
    Fieldset(locale=language_code, seed=42) for language_code in ("it", "zh")
)

def generate_random_string(length):
    """Return a random string of `length` ASCII letters and digits.

    Characters are drawn one at a time with `random.choice`, so the result
    depends on the current state of the global `random` generator.
    """
    alphabet = string.ascii_letters + string.digits
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return "".join(picked)
    
# Build the Mimesis test frame: for every (class, language) pair two columns of
# 100 generated values each, named "<class>_<language>_1" and "<class>_<language>_2".
# `personal_type` and `original_class` record, per column and in column order,
# the personal/non-personal label and the "<class>_<language>" name.
generated_columns = {}
personal_type = []
original_class = []
attribute_groups = [
    (personal_attributes_language, "personal"),
    (non_personal_attributes_language, "non-personal"),
]
for attribute_group, personal in attribute_groups:
    for cla in attribute_group:
        for fieldset, lan in ((FIELDSET_IT, "it"), (FIELDSET_ZH, "zh")):
            for i in range(1, 3):
                generated_columns[f"{cla}_{lan}_{i}"] = fieldset(cla, i=100)
                personal_type.append(personal)
                original_class.append(f"{cla}_{lan}")
# Create the frame once instead of concatenating column by column inside the loop.
df = pd.DataFrame(generated_columns)

# Rename half of the columns: every "<class>_<language>_2" column gets a random
# name of 5 to 20 letters/digits. The random calls happen in the same order as
# before (length first, then the string), so seeded runs give the same names.
random_names = {}
for p in personal_attributes_language + non_personal_attributes_language:
    for lan in ["it", "zh"]:
        random_length = random.randint(5, 20)
        random_names[f"{p}_{lan}_2"] = generate_random_string(random_length)
df = df.rename(columns=random_names)

# Append the label row and the class row below the 100 value rows so that they
# stay attached to their columns, then shuffle the column order.
labels = pd.DataFrame([personal_type], columns=df.columns)
classes = pd.DataFrame([original_class], columns=df.columns)
df = pd.concat([df, labels, classes]).reset_index(drop=True)
df = df.sample(frac=1, axis=1, random_state=42).reset_index(drop=True)

def extract_label(df, n_rows=100):
    """Split a combined frame into data values, labels and class names.

    The frame is expected to contain `n_rows` rows of values, followed by one
    row holding the label of every column and one row holding the class of
    every column.

    Parameters
    ----------
    df : pd.DataFrame
        Combined frame with at least `n_rows + 2` rows.
    n_rows : int, default 100
        Number of value rows that precede the label row.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame, pd.DataFrame)
        The first `n_rows` rows of `df`; a one-column frame "label" with one
        entry per column of `df`; a one-column frame "class" with one entry per
        column of `df`. Both one-column frames have a fresh 0..k-1 index.

    Raises
    ------
    IndexError
        If `df` has fewer than `n_rows + 2` rows.
    """
    values = df.iloc[:n_rows, :]
    df_labels = df.iloc[n_rows, :].to_frame(name="label").reset_index(drop=True)
    df_classes = df.iloc[n_rows + 1, :].to_frame(name="class").reset_index(drop=True)
    return values, df_labels, df_classes

# Split the shuffled Mimesis frame into values, personal labels and class names.
(df_final, final_labels, final_classes) = extract_label(df)

# Faker

In [13]:
# Candidate Faker providers, grouped as personal / profile / non-personal.
personal_attributes = [
    "address",
    "iban",
    "credit_card_number",
    "email",
    "job",
    "first_name",
    "last_name",
    "name",
    "phone_number",
    "ssn",
    "passport_number",
]

# Fields that are read from the dict returned by `fake.profile()`.
profile_attributes = [
    "sex",
    "blood_group",
    "current_location",
]

non_personal_attributes = [
    "color",
    "ean",
    "credit_card_provider",
    "company",
    "currency",
    "url",
    "isbn13",
    "pyint",
    "pyfloat",
    "date",
    "swift",
]

In [14]:
def return_value(cla, lan, profile=False):
    """Generate a single Faker value for `cla` in locale `lan`.

    Faker is re-seeded with 42 on every call and a new `Faker(lan)` instance
    is created, so each call starts from the same seed.

    With `profile=True` the value is the `cla` entry of `fake.profile()`;
    otherwise it is the result of calling the method named `cla`.
    """
    Faker.seed(42)
    fake = Faker(lan)
    if profile:
        return fake.profile()[cla]
    return getattr(fake, cla)()

# Print every Faker class whose seeded value is identical for "de", "it" and "zh".
# Profile fields first ...
for cla in profile_attributes:
    de, it, zh = (return_value(cla, locale, profile=True) for locale in ("de", "it", "zh"))
    same_in_all_locales = de == it and de == zh
    if same_in_all_locales:
        print(cla)

# ... then the regular provider methods.
for cla in personal_attributes + non_personal_attributes:
    de, it, zh = (return_value(cla, locale) for locale in ("de", "it", "zh"))
    same_in_all_locales = de == it and de == zh
    if same_in_all_locales:
        print(cla)

credit_card_number
passport_number
color
ean
credit_card_provider
currency
isbn13
pyint
pyfloat
date


Drop the classes printed above and the classes that were already generated with Mimesis.

In [15]:
# Faker classes kept for the per-locale inspection below.
personal_attributes_language = [
    "iban",
    "email",
    "phone_number",
    "ssn",
]

profile_attributes_language = [
    "sex",
    "blood_group",
    "current_location",
]

non_personal_attributes_language = [
    "url",
    "swift",
]

In [16]:
def return_value(cla, lan, profile=False):
    """Generate a single Faker value for `cla` in locale `lan`.

    NOTE: same logic as the `return_value` defined in the previous Faker cell.

    Faker is re-seeded with 42 and a new `Faker(lan)` instance is created on
    every call. With `profile=True` the value is taken from `fake.profile()[cla]`,
    otherwise from calling the method named `cla`.
    """
    Faker.seed(42)
    fake = Faker(lan)
    generated = fake.profile()[cla] if profile else getattr(fake, cla)()
    return generated

# Show one seeded value per locale for every remaining Faker class, for manual inspection.
# Profile fields first ...
for cla in profile_attributes_language:
    de, it, zh = (return_value(cla, locale, profile=True) for locale in ("de", "it", "zh"))
    print(cla, de, it, zh)

# ... then the regular provider methods.
for cla in personal_attributes_language + non_personal_attributes_language:
    de, it, zh = (return_value(cla, locale) for locale in ("de", "it", "zh"))
    print(cla, de, it, zh)

sex F M F
blood_group B- O+ A-
current_location (Decimal('-15.4094795'), Decimal('-96.527578')) (Decimal('-88.2555025'), Decimal('-94.283560')) (Decimal('-32.2021525'), Decimal('0.707823'))
iban DE41104332181960013389 IT86U1043321819600133890838 GB56UDAX4332181960013
email barthjolanta@example.com asmundolucia@example.com changlei@example.org
phone_number (01043) 32181 +39 0131433210 15510433218
ssn 043-32-1819 SMNGDI31D05H914B 150822194909150434
url http://www.hoefig.net/ http://www.leone.com/ http://www.liangkong.cn/
swift AXIHDEOI AXIHITOI AXIHGBOI


In [17]:
# Final Faker classes used for the test data.
personal_attributes_language = [
    "iban",
    "email",
    "phone_number",
    "ssn",
]

non_personal_attributes_language = [
    "url",
    "swift",
]

In [18]:
def generate_random_string(length):
    """Return a random string of `length` characters.

    The alphabet is ASCII letters plus digits. Each character is drawn with
    `random.choice`, i.e. from the global `random` generator.
    """
    alphabet = string.ascii_letters + string.digits
    result = ""
    for _ in range(length):
        result += random.choice(alphabet)
    return result

def extract_label(df, n_rows=100):
    """Split a combined frame into data values, labels and class names.

    The frame is expected to contain `n_rows` rows of values, followed by one
    row holding the label of every column and one row holding the class of
    every column.

    Parameters
    ----------
    df : pd.DataFrame
        Combined frame with at least `n_rows + 2` rows.
    n_rows : int, default 100
        Number of value rows that precede the label row.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame, pd.DataFrame)
        The first `n_rows` rows of `df`; a one-column frame "label" with one
        entry per column of `df`; a one-column frame "class" with one entry per
        column of `df`. Both one-column frames have a fresh 0..k-1 index.

    Raises
    ------
    IndexError
        If `df` has fewer than `n_rows + 2` rows.
    """
    values = df.iloc[:n_rows, :]
    df_labels = df.iloc[n_rows, :].to_frame(name="label").reset_index(drop=True)
    df_classes = df.iloc[n_rows + 1, :].to_frame(name="class").reset_index(drop=True)
    return values, df_labels, df_classes

# Seed Faker, then create one generator per test language (Italian and Chinese).
Faker.seed(42)
FAKE_IT, FAKE_ZH = Faker("it"), Faker("zh")

# Build the Faker test frame: for every (class, language) pair two columns of
# 100 generated values each, named "<class>_<language>_1" and "<class>_<language>_2".
# `personal_type` and `original_class` record, per column and in column order,
# the personal/non-personal label and the "<class>_<language>" name.
generated_columns = {}
personal_type, original_class = [], []
attribute_groups = [
    (personal_attributes_language, "personal"),
    (non_personal_attributes_language, "non-personal"),
]
for attribute_group, personal in attribute_groups:
    for cla in attribute_group:
        for fake, lan in ((FAKE_IT, "it"), (FAKE_ZH, "zh")):
            for i in range(1, 3):
                generated_columns[f"{cla}_{lan}_{i}"] = [
                    getattr(fake, cla)() for _ in range(100)
                ]
                personal_type.append(personal)
                original_class.append(f"{cla}_{lan}")
# Create the frame once instead of concatenating column by column inside the loop.
df = pd.DataFrame(generated_columns)

# Rename half of the columns: every "<class>_<language>_2" column gets a random
# name of 5 to 20 letters/digits. The random calls happen in the same order as
# before (length first, then the string), so seeded runs give the same names.
random_names = {}
for p in personal_attributes_language + non_personal_attributes_language:
    for lan in ["it", "zh"]:
        random_length = random.randint(5, 20)
        random_names[f"{p}_{lan}_2"] = generate_random_string(random_length)
df = df.rename(columns=random_names)

# Append the label row and the class row below the 100 value rows so that they
# stay attached to their columns, then shuffle the column order.
labels = pd.DataFrame([personal_type], columns=df.columns)
classes = pd.DataFrame([original_class], columns=df.columns)
df = pd.concat([df, labels, classes]).reset_index(drop=True)
df = df.sample(frac=1, axis=1, random_state=42).reset_index(drop=True)

In [19]:
# Split the shuffled Faker frame into values, personal labels and class names.
(df_final2, final_labels2, final_classes2) = extract_label(df)

Personal mapping

In [20]:
# Write the combined Mimesis + Faker test set: values side by side, labels and
# classes stacked in the same column order (Mimesis columns first, then Faker).
test_values = pd.concat([df_final, df_final2], axis=1)
test_values.to_csv("test.csv", index=False)

test_labels_personal = pd.concat([final_labels, final_labels2])
test_labels_personal.to_csv("test_labels_personal.csv", index=False)

test_classes = pd.concat([final_classes, final_classes2])
test_classes.to_csv("test_classes.csv", index=False)

# One entry per column naming the library that generated it.
dataset_origin = ["mimesis"] * df_final.shape[1] + ["faker"] * df_final2.shape[1]
test_dataset = pd.DataFrame(dataset_origin).rename(columns={0: "dataset"})
test_dataset.to_csv("test_dataset.csv", index=False)

PII mapping

In [21]:
# Manual class -> "pii" / "non-pii" assignment. Keys are class names; some keys
# are comma-separated combinations of several classes.
_PII = "pii"
_NON_PII = "non-pii"

manual_mapping = {
    # Capitalised class names, including comma-separated combinations.
    "CCN": _PII,
    "Date": _NON_PII,
    "Date,NIN": _PII,
    "Email": _PII,
    "Email,NIN": _PII,
    "Email,Phone_number": _PII,
    "GPE": _NON_PII,
    "Gender": _NON_PII,
    "Geolocation": _PII,
    "IBAN": _PII,
    "ID_Card": _PII,
    "NIN": _PII,
    "NIN,Date": _PII,
    "NIN,Email": _PII,
    "NIN,Phone_number": _PII,
    "Nationality": _NON_PII,
    "Organization": _NON_PII,
    "Passport": _PII,
    "Phone_number": _PII,
    "Phone_number,Email": _PII,
    "Phone_number,NIN": _PII,
    "Race": _NON_PII,
    "Religion": _NON_PII,
    "SWIFT/BIC": _NON_PII,
    "Sexuality": _NON_PII,
    # Lower-case class names (these include the Mimesis and Faker classes used in this notebook).
    "academic_degree": _NON_PII,
    "address": _PII,
    "answer": _NON_PII,
    "blood_type": _NON_PII,
    "city": _NON_PII,
    "color": _NON_PII,
    "company": _NON_PII,
    "cpu": _NON_PII,
    "credit_card_number": _PII,
    "credit_card_provider": _NON_PII,
    "currency": _NON_PII,
    "current_location": _PII,
    "date": _NON_PII,
    "dish": _NON_PII,
    "drink": _NON_PII,
    "duration": _NON_PII,
    "ean": _NON_PII,
    "email": _PII,
    "first_name": _NON_PII,
    "float_number": _NON_PII,
    "full_name": _PII,
    "gender": _NON_PII,
    "graphics": _NON_PII,
    "iban": _PII,
    "integer_number": _NON_PII,
    "isbn": _NON_PII,
    "isbn13": _NON_PII,
    "job": _NON_PII,
    "language": _NON_PII,
    "last_name": _NON_PII,
    "manufacturer": _NON_PII,
    "measure_unit": _NON_PII,
    "name": _PII,
    "nationality": _NON_PII,
    "occupation": _NON_PII,
    "passport_number": _PII,
    "phone_model": _NON_PII,
    "phone_number": _PII,
    "political_views": _NON_PII,
    "programming_language": _NON_PII,
    "pyfloat": _NON_PII,
    "pyint": _NON_PII,
    "resolution": _NON_PII,
    "ssn": _PII,
    "swift": _NON_PII,
    "system_quality_attribute": _NON_PII,
    "title": _NON_PII,
    "url": _NON_PII,
    "user_agent": _NON_PII,
    "version": _NON_PII,
    "word": _NON_PII,
    "worldview": _NON_PII,
}

In [22]:
# Derive the pii / non-pii label of every test column from its class name.
labels_pii = pd.concat([final_classes, final_classes2]).copy()
# Class names look like "<class>_<language>" with a two-letter language code, so
# dropping the last three characters leaves the key for `manual_mapping`.
# A class without an entry raises KeyError.
labels_pii.iloc[:, 0] = [
    manual_mapping[class_name[:-3]] for class_name in labels_pii.iloc[:, 0]
]
labels_pii.rename(columns={"class": "label"}).to_csv("test_labels_pii.csv", index=False)

Multiclass mapping

In [23]:
# Lower-case class name -> label used for the multiclass task.
mapping_multiclass = {
    "ccn": "credit_card_number",
    "current_location": "longitude_and_latitude",
    "geolocation": "longitude_and_latitude",
    "name": "full_name",
    "isbn13": "isbn",
    # Rename some classes so that GPT can understand them.
    "nin": "national_identification_number",
    "ssn": "national_identification_number",
    "pyfloat": "float_number",
    "pyint": "integer_number",
    "swift": "SWIFT/BIC code",
    "swift/bic": "SWIFT/BIC code",
    "address": "full_address",
    "ean": "EAN_code",
    "occupation": "job",
    "organization": "company",
    "organization,phone_number": "company,phone_number",
    "passport": "passport_number",
    # Values in these columns contain both classes.
    "religion": "religion/worldview",
    "worldview": "religion/worldview",
    "academic_degree": "academic_degree/title",
    "title": "academic_degree/title",
    "blood_type": "blood_group",
    "sex": "gender",
}

def convert_classes(data_classes, mapping=None):
    """Translate "<class>_<language>" names into multiclass labels.

    For every entry of `data_classes["class"]` the last three characters (the
    "_<language>" suffix) are dropped and the rest is lower-cased. The result
    is then renamed with `mapping`:

    * if the whole name is a key of `mapping`, its value is used;
    * otherwise the name is split on "," and every part that is a key of
      `mapping` is replaced by its value; other parts are kept unchanged.

    Parts are matched as whole tokens, never as substrings. A combined label
    such as "first_name,email" is therefore left alone even though "name" is a
    key, and a replacement is never re-processed by a later key.

    Parameters
    ----------
    data_classes : pd.DataFrame
        Frame with a "class" column of strings.
    mapping : dict, optional
        Lower-case class name -> label. Defaults to `mapping_multiclass`.

    Returns
    -------
    pd.DataFrame
        One column "label" with one row per input row.
    """
    if mapping is None:
        mapping = mapping_multiclass

    def _rename(name):
        # Try the whole name first so that keys which themselves contain a
        # comma (e.g. "organization,phone_number") still apply.
        if name in mapping:
            return mapping[name]
        return ",".join(mapping.get(part, part) for part in name.split(","))

    new_classes = [_rename(raw[:-3].lower()) for raw in data_classes["class"]]
    return pd.DataFrame({"label": new_classes})

# Convert the class names of all test columns (Mimesis first, then Faker) and save them.
all_test_classes = pd.concat([final_classes, final_classes2])
multiclass_labels = convert_classes(all_test_classes)
multiclass_labels.to_csv("test_labels_multiclass.csv", index=False)