In [1]:
import pandas as pd
import random
from sklearn.model_selection  import StratifiedShuffleSplit
import string
from mimesis.locales import Locale
from mimesis import Fieldset

# Mimesis

In [2]:
# Show every locale that mimesis exposes through its Locale enum.
print([*Locale])

[<Locale.AR_AE: 'ar-ae'>, <Locale.AR_DZ: 'ar-dz'>, <Locale.AR_EG: 'ar-eg'>, <Locale.AR_JO: 'ar-jo'>, <Locale.AR_OM: 'ar-om'>, <Locale.AR_SY: 'ar-sy'>, <Locale.AR_YE: 'ar-ye'>, <Locale.CS: 'cs'>, <Locale.DA: 'da'>, <Locale.DE: 'de'>, <Locale.DE_AT: 'de-at'>, <Locale.DE_CH: 'de-ch'>, <Locale.EL: 'el'>, <Locale.EN: 'en'>, <Locale.EN_AU: 'en-au'>, <Locale.EN_CA: 'en-ca'>, <Locale.EN_GB: 'en-gb'>, <Locale.ES: 'es'>, <Locale.ES_MX: 'es-mx'>, <Locale.ET: 'et'>, <Locale.FA: 'fa'>, <Locale.FI: 'fi'>, <Locale.FR: 'fr'>, <Locale.HU: 'hu'>, <Locale.HR: 'hr'>, <Locale.IS: 'is'>, <Locale.IT: 'it'>, <Locale.JA: 'ja'>, <Locale.KK: 'kk'>, <Locale.KO: 'ko'>, <Locale.NL: 'nl'>, <Locale.NL_BE: 'nl-be'>, <Locale.NO: 'no'>, <Locale.PL: 'pl'>, <Locale.PT: 'pt'>, <Locale.PT_BR: 'pt-br'>, <Locale.RU: 'ru'>, <Locale.SK: 'sk'>, <Locale.SV: 'sv'>, <Locale.TR: 'tr'>, <Locale.UK: 'uk'>, <Locale.ZH: 'zh'>]


Use French, German and English

I picked some classes that clearly belong to either personal or non-personal data even without the information from the column name (most classes are from mimesis.Person). Half of the column names are replaced with random strings to ensure that the column name is not decisive for the model's prediction.
  
--> 16 classes for personal attributes, 21 for non-personal

For some classes, an additional restriction applies: all values in the whole dataset must be unique. This should ensure that the predictions of the BERT model do not depend on specific words for these classes.

In [3]:
# mimesis field names whose values count as personal data.
personal_attributes = [
    "address",
    "academic_degree",
    "blood_type",
    "email",
    "first_name",
    "full_name",
    "last_name",
    "gender",
    "language",
    "nationality",
    "occupation",
    "phone_number",
    "political_views",
    "title",
    "worldview",
    "credit_card_number",
]

# mimesis field names whose values count as non-personal data.
non_personal_attributes = [
    "company",
    "dish",
    "drink",
    "answer",
    "color",
    "isbn",
    "duration",
    "programming_language",
    "system_quality_attribute",
    "version",
    "float_number",
    "integer_number",
    "user_agent",
    "graphics",
    "cpu",
    "phone_model",
    "manufacturer",
    "resolution",
    "word",
    "measure_unit",
    "city",
]

# Subsets of the lists above that are generated with unique values only.
personal_attributes_unique = [
    "address",
    "email",
    "full_name",
    "phone_number",
    "credit_card_number",
]

non_personal_attributes_unique = [
    "isbn",
    "version",
    "float_number",
]

In [6]:
# Draw 100,000 values per attribute with a German fieldset and report every
# attribute that yields more than 50,000 distinct values.
fieldset = Fieldset(locale="de", seed=42)
for p in personal_attributes + non_personal_attributes:
    vals = fieldset(p, i=100000)
    distinct = len(set(vals))
    if distinct > 50000:
        print(p, distinct)

address 99703
email 99798
full_name 83704
phone_number 100000
credit_card_number 100000
isbn 99999
version 95172
float_number 100000


In [13]:
# Create the seeded fieldsets once as module-level constants so that repeated
# generation calls do not produce the same data again.
FIELDSET_EN, FIELDSET_FR, FIELDSET_DE = (
    Fieldset(locale=code, seed=42) for code in ("en", "fr", "de")
)
random.seed(42)
    

def generate_data(classes, num_col, personal_type, df, original_class, personal):
    """Append synthetic 100-value columns for every class in ``classes`` to ``df``.

    For each class and each variant in ``["en", "fr", "de", "mixed"]`` one column
    named ``{class}_{variant}_{i}`` is created for every ``i`` in
    ``range(1, num_col)`` (that is ``num_col - 1`` columns per variant).

    A class is treated as language independent when two freshly created,
    identically seeded English and French fieldsets return the same first value.
    Such classes are always drawn from ``FIELDSET_EN``. Otherwise the variant's
    own fieldset is used, and a "mixed" column is built from 34 English,
    33 French and 33 German values (in that order).

    Args:
        classes: Iterable of mimesis field names.
        num_col: Exclusive upper bound of the column counter ``i``.
        personal_type: List that receives ``personal`` once per created column
            (mutated in place).
        df: DataFrame the new columns are appended to (not mutated).
        original_class: List that receives ``{class}_{variant}`` once per
            created column (mutated in place).
        personal: Label stored in ``personal_type`` for every created column.

    Returns:
        Tuple ``(personal_type, df, original_class)`` where ``df`` is the input
        frame extended by the new columns.
    """
    locale_fieldsets = {"en": FIELDSET_EN, "fr": FIELDSET_FR, "de": FIELDSET_DE}
    # Collect the single-column frames and concatenate once at the end;
    # concatenating inside the loop copies the growing frame for every column.
    frames = []
    for cla in classes:
        probe_en = Fieldset(locale="en", seed=42)
        probe_fr = Fieldset(locale="fr", seed=42)
        language_independent = probe_en(cla, i=1) == probe_fr(cla, i=1)

        for lan in ["en", "fr", "de", "mixed"]:
            for i in range(1, num_col):
                if language_independent:
                    values = FIELDSET_EN(cla, i=100)
                elif lan != "mixed":
                    values = locale_fieldsets[lan](cla, i=100)
                else:
                    values = []
                    for lan2, fieldset in locale_fieldsets.items():
                        values.extend(fieldset(cla, i=33 if lan2 != "en" else 34))
                frames.append(pd.DataFrame({f"{cla}_{lan}_{i}": values}))
                personal_type.append(personal)
                original_class.append(f"{cla}_{lan}")

    if frames:
        df = pd.concat([df, *frames], axis=1)
    return personal_type, df, original_class


def _draw_unique_values(fieldset, cla, minimum, batch_size, exclude):
    """Collect at least ``minimum`` distinct values of ``cla`` that are not in ``exclude``.

    Values are drawn from ``fieldset`` in batches of ``batch_size`` until enough
    distinct ones exist, then sorted (for a deterministic starting order) and
    shuffled with the global ``random`` module.

    NOTE: the loop only ends once enough distinct values exist, so a field that
    cannot produce ``minimum`` distinct values would never terminate.
    """
    values = set()
    while len(values) < minimum:
        values.update(set(fieldset(cla, i=batch_size)) - exclude)
    values = sorted(values)
    random.shuffle(values)
    return values


def generate_data_unique(classes, num_col, personal_type, df, original_class, personal):
    """Append synthetic 100-value columns whose values are unique within each class.

    Column naming and counting follow ``{class}_{variant}_{i}`` for the variants
    "en", "fr", "de" and "mixed" with ``i`` in ``range(1, num_col)``.

    A class is treated as language independent when two freshly created,
    identically seeded English and French fieldsets return the same first value.
    For such a class one pool of distinct values is drawn from ``FIELDSET_EN``
    and cut into consecutive 100-value columns for all four variants.
    Otherwise a pool is drawn per language, excluding values already drawn for
    earlier languages of the same class. Each pool is cut into that language's
    columns, and the leftovers feed the "mixed" columns (34 English, 33 French
    and 33 German values per column).

    Args:
        classes: Iterable of mimesis field names.
        num_col: Exclusive upper bound of the column counter ``i``; also sizes
            the pools (``num_col * 4.5 * 100`` or ``num_col * 1.5 * 100``
            distinct values) and the draw batches (``num_col * 100``).
        personal_type: List that receives ``personal`` once per created column
            (mutated in place).
        df: DataFrame the new columns are appended to (not mutated).
        original_class: List that receives ``{class}_{variant}`` once per
            created column (mutated in place).
        personal: Label stored in ``personal_type`` for every created column.

    Returns:
        Tuple ``(personal_type, df, original_class)`` where ``df`` is the input
        frame extended by the new columns.
    """
    # Collect the single-column frames and concatenate once at the end;
    # concatenating inside the loop copies the growing frame for every column.
    frames = []
    for cla in classes:
        probe_en = Fieldset(locale="en", seed=42)
        probe_fr = Fieldset(locale="fr", seed=42)

        if probe_en(cla, i=1) == probe_fr(cla, i=1):
            values = _draw_unique_values(
                FIELDSET_EN, cla, (num_col * 4.5) * 100, num_col * 100, set()
            )
            start = 0
            for lan in ["en", "fr", "de", "mixed"]:
                for i in range(1, num_col):
                    frames.append(
                        pd.DataFrame({f"{cla}_{lan}_{i}": values[start:start + 100]})
                    )
                    start += 100
                    personal_type.append(personal)
                    original_class.append(f"{cla}_{lan}")
        else:
            all_values = set()
            leftovers = {}
            for fieldset, lan in zip(
                [FIELDSET_EN, FIELDSET_FR, FIELDSET_DE], ["en", "fr", "de"]
            ):
                values = _draw_unique_values(
                    fieldset, cla, (num_col * 1.5) * 100, num_col * 100, all_values
                )
                all_values = all_values.union(values)
                start = 0
                for i in range(1, num_col):
                    frames.append(
                        pd.DataFrame({f"{cla}_{lan}_{i}": values[start:start + 100]})
                    )
                    start += 100
                    personal_type.append(personal)
                    original_class.append(f"{cla}_{lan}")
                # Values not used by the single-language columns feed the mixed columns.
                leftovers[lan] = values[start:]

            for i in range(1, num_col):
                mixed = []
                for lan in ["en", "fr", "de"]:
                    num = 34 if lan == "en" else 33
                    mixed.extend(leftovers[lan][num * (i - 1):num * (i - 1) + num])
                frames.append(pd.DataFrame({f"{cla}_mixed_{i}": mixed}))
                personal_type.append(personal)
                original_class.append(f"{cla}_mixed")

    if frames:
        df = pd.concat([df, *frames], axis=1)
    return personal_type, df, original_class
    
    
def generate_random_string(length):
    """Return ``length`` characters drawn at random from ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    picked = [random.choice(alphabet) for _ in range(length)]
    return "".join(picked)
    
def rename_columns(df, num_personal, num_non_personal):
    """Replace every second generated column name with a random string.

    For each attribute and each variant ("en", "fr", "de", "mixed") the columns
    ``{attribute}_{variant}_{i}`` with ``i`` in ``range(1, num_col, 2)`` are
    renamed to a random alphanumeric string of 5 to 20 characters.

    Args:
        df: DataFrame whose columns are renamed; it is modified in place.
        num_personal: Exclusive upper bound of ``i`` for personal attributes.
        num_non_personal: Exclusive upper bound of ``i`` for non-personal
            attributes.

    Returns:
        The same ``df`` object with the renamed columns.
    """
    # Collect all replacements first and rename once; renaming column by column
    # rebuilds the full column index for every single name.
    replacements = {}
    for attributes, num_col in (
        (personal_attributes, num_personal),
        (non_personal_attributes, num_non_personal),
    ):
        for p in attributes:
            for lan in ["en", "fr", "de", "mixed"]:
                for i in range(1, num_col, 2):
                    random_length = random.randint(5, 20)
                    replacements[f"{p}_{lan}_{i}"] = generate_random_string(random_length)
    df.rename(columns=replacements, inplace=True)
    return df

def extract_label(df):
    """Split a frame into its 100 data rows, its label row and its class row.

    Row 100 holds the label of every column and row 101 its class. Both are
    returned as single-column frames ("label" / "class") with a fresh 0..n-1
    index, alongside the first 100 rows of ``df``.
    """
    label_row = df.iloc[100, :]
    class_row = df.iloc[101, :]
    values = df.iloc[:100, :]

    label_frame = label_row.rename("label").to_frame().reset_index(drop=True)
    class_frame = class_row.rename("class").to_frame().reset_index(drop=True)
    return values, label_frame, class_frame

def create_dataset():
    """Generate the full synthetic dataset and write it to CSV files.

    Columns are generated for all personal and non-personal attributes (the
    "unique" subsets via ``generate_data_unique``, the rest via
    ``generate_data``). Every second column name is then replaced with a random
    string and the columns are shuffled. Three files are written to the working
    directory: ``all.csv`` (100 value rows per column), ``all_labels.csv``
    (personal / non-personal per column) and ``all_classes.csv``
    (``{class}_{variant}`` per column).

    The generation functions iterate ``range(1, n)``, so the counts 250 and 191
    passed below yield 249 and 190 columns per attribute and variant.
    """
    personal_type = []
    original_class = []
    df = pd.DataFrame()

    # Filter the lists instead of taking a set difference: iterating a set of
    # strings follows an order that can differ between interpreter runs, and
    # since the seeded generators are shared, the generated values and the
    # column order would not be reproducible.
    personal_non_unique = [
        attribute for attribute in personal_attributes
        if attribute not in personal_attributes_unique
    ]
    non_personal_non_unique = [
        attribute for attribute in non_personal_attributes
        if attribute not in non_personal_attributes_unique
    ]

    # generate columns where unique-only values are not possible
    personal_type, df, original_class = generate_data(personal_non_unique,
                                                      250, personal_type, df, original_class, "personal")
    personal_type, df, original_class = generate_data(non_personal_non_unique,
                                                      191, personal_type, df, original_class, "non-personal")

    # generate columns where unique-only values are possible
    personal_type, df, original_class = generate_data_unique(personal_attributes_unique,
                                                      250, personal_type, df, original_class, "personal")
    personal_type, df, original_class = generate_data_unique(non_personal_attributes_unique,
                                                      191, personal_type, df, original_class, "non-personal")

    df = rename_columns(df, 250, 191)

    # Attach labels and classes as two extra rows (100 and 101) so they stay
    # aligned with their columns while the columns are shuffled.
    labels = pd.DataFrame(personal_type).T
    classes = pd.DataFrame(original_class).T
    labels.columns = df.columns
    classes.columns = df.columns
    df = pd.concat([df, labels]).reset_index(drop=True)
    df = pd.concat([df, classes]).reset_index(drop=True)
    df = df.sample(frac=1, axis=1, random_state=42).reset_index(drop=True)

    # save the data
    df_final, labels, classes = extract_label(df)
    df_final.to_csv("all.csv", index=False)
    labels.to_csv("all_labels.csv", index=False)
    classes.to_csv("all_classes.csv", index=False)

- Corrupt 50% of the column names for every class   
- Train-Val-Test-split (60/20/20)  -> like dessi
- try generating like dessi 18k/6k/6k columns
- Shuffle the dataset

Some classes, such as floats and integers, are language-independent.  
They are generated in the same way for every language.

In [14]:
# Spread 16000 columns over each attribute list and its 4 language variants:
# this gives 250 columns per personal attribute and about 191 per non-personal one.
for attribute_list in (personal_attributes, non_personal_attributes):
    print(16000 / len(attribute_list) / 4)

250.0
190.47619047619048


In [15]:
# Build the dataset; this writes all.csv, all_labels.csv and all_classes.csv.
create_dataset()

Train/Val/Test Split

In [16]:
# Reload the generated values together with their label and class tables.
df_all, df_all_labels, df_all_classes = (
    pd.read_csv(csv_path)
    for csv_path in ("all.csv", "all_labels.csv", "all_classes.csv")
)

  df_all = pd.read_csv("mimesis/all.csv")


In [17]:
# First split: 60% train / 40% temp, stratified by class. The samples are the
# columns of df_all, hence the transpose.
split1 = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=42)
train_idx, temp_idx = next(split1.split(df_all.T, df_all_classes["class"]))

train_data = df_all.iloc[:, train_idx]
train_classes = df_all_classes.iloc[train_idx]
train_labels = df_all_labels.iloc[train_idx]

temp_data = df_all.iloc[:, temp_idx]
temp_classes = df_all_classes.iloc[temp_idx]
temp_labels = df_all_labels.iloc[temp_idx]

# Second split: halve the temp part into validation and test. The returned
# indices are positions within the temp subset.
split2 = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
val_idx, test_idx = next(split2.split(temp_data.T, temp_classes))

val_data = temp_data.iloc[:, val_idx]
val_classes = temp_classes.iloc[val_idx]
val_labels = temp_labels.iloc[val_idx]

test_data = temp_data.iloc[:, test_idx]
test_classes = temp_classes.iloc[test_idx]
test_labels = temp_labels.iloc[test_idx]

print("Train data shape:", train_data.shape)
print("Validation data shape:", val_data.shape)
print("Test data shape:", test_data.shape)

Train data shape: (100, 19137)
Validation data shape: (100, 6379)
Test data shape: (100, 6380)


In [18]:
# Write each split's values, labels and classes to its own CSV file
# (the validation split is stored under the "dev" prefix).
split_outputs = [
    (train_data, "train.csv"),
    (train_labels, "train_labels.csv"),
    (train_classes, "train_classes.csv"),
    (val_data, "dev.csv"),
    (val_labels, "dev_labels.csv"),
    (val_classes, "dev_classes.csv"),
    (test_data, "test.csv"),
    (test_labels, "test_labels.csv"),
    (test_classes, "test_classes.csv"),
]
for frame, csv_path in split_outputs:
    frame.to_csv(csv_path, index=False)