In [1]:
import pandas as pd
from faker import Faker
from sklearn.model_selection  import StratifiedShuffleSplit
import random
import string

# Faker

Create data with personal-related classes in three different languages

https://faker.readthedocs.io/en/stable/providers.html   
https://faker.readthedocs.io/en/stable/communityproviders.html

In [2]:
fake = Faker()
getattr(fake, 'profile')()

{'job': 'Management consultant',
 'company': 'Humphrey-Bradford',
 'ssn': '423-98-3062',
 'residence': '71555 Matthew Walk Apt. 365\nPort Nicolemouth, MP 86774',
 'current_location': (Decimal('-19.343395'), Decimal('164.036090')),
 'blood_group': 'O+',
 'website': ['http://www.pollard-cannon.com/'],
 'username': 'johntrujillo',
 'name': 'Cody Buchanan',
 'sex': 'M',
 'address': '541 Robinson Circle Suite 554\nStephanieburgh, NH 74410',
 'mail': 'ethanwilson@yahoo.com',
 'birthdate': datetime.date(1918, 3, 10)}

In [3]:
personal_attributes = [
    "address", "iban","credit_card_number", "email",
    "job","first_name","last_name","name","phone_number",
    "ssn","passport_number"
]

profile_attributes = [
    "sex", "blood_group", "current_location"
]

non_personal_attributes = [
    "color","ean","credit_card_provider","company","currency",
    "url","isbn13","pyint","pyfloat", "date", "swift"
]

personal_attributes_unique = [
    "address","iban","credit_card_number","email",
    "name","phone_number", "ssn","passport_number"
]

profile_attributes_unique = ["current_location"]

non_personal_attributes_unique = [
    "color","ean", "isbn13", "pyfloat", "swift"
    ]

In [4]:
fake = Faker("de")
def number_unique_values(cla, profile = False):
    val = []
    for _ in range(100000):
        if profile:
            val.append(getattr(fake, "profile")()[cla])
        else:
            val.append(getattr(fake, cla)())
    if len(set(val)) > 50000:
        print(cla, "has enough unique values: ", len(set(val)))

for p in personal_attributes + non_personal_attributes:
    if p == "profile":
        for pp in profile_attributes:
            number_unique_values(pp, profile = True)
    else:
        number_unique_values(p)

address has enough unique values:  100000
iban has enough unique values:  100000
credit_card_number has enough unique values:  99999
email has enough unique values:  91461
name has enough unique values:  98431
phone_number has enough unique values:  100000
ssn has enough unique values:  99996
passport_number has enough unique values:  100000
color has enough unique values:  89503
ean has enough unique values:  100000
isbn13 has enough unique values:  99974
pyfloat has enough unique values:  100000
swift has enough unique values:  99996


In [5]:
Faker.seed(42)
random.seed(42)

FAKE_EN = Faker("en")
FAKE_FR = Faker("fr_FR")
FAKE_DE = Faker("de_DE")


def generate_data(classes, num_col, personal_type, df, original_class, personal):
    for cla in classes:
        if cla in profile_attributes:
            Faker.seed(42)
            f1 = Faker("en")
            val1 = getattr(f1, "profile")()[cla]
            Faker.seed(42)
            f2 = Faker("fr_FR")
            val2 = getattr(f2, "profile")()[cla]
        else:
            Faker.seed(42)
            f1 = Faker("en")
            val1 = getattr(f1, cla)()
            Faker.seed(42)
            f2 = Faker("fr_FR")
            val2 = getattr(f2, cla)()
        if val1 == val2:
            fake = FAKE_EN
            for lan in ["en","fr_FR","de_DE", "mixed"]:
                for i in range(1,num_col):                           
                    data = {
                            f"{cla}_{lan}_{i}": [],
                        }
                    for _ in range(100):
                        if cla in profile_attributes:
                            data[f"{cla}_{lan}_{i}"].append(getattr(fake, "profile")()[cla])
                        else:
                            data[f"{cla}_{lan}_{i}"].append(getattr(fake, cla)())
                    personal_type.append(personal)
                    original_class.append(f"{cla}_{lan}")
                    df = pd.concat([df, pd.DataFrame(data)], axis=1)
        else:
            for (fake, lan) in zip([FAKE_EN, FAKE_FR, FAKE_DE], ["en","fr_FR","de_DE"]):
                for i in range(1,num_col):                           
                    data = {
                            f"{cla}_{lan}_{i}": [],
                        }
                    for _ in range(100):
                        if cla in profile_attributes:
                            data[f"{cla}_{lan}_{i}"].append(getattr(fake, "profile")()[cla])
                        else:
                            data[f"{cla}_{lan}_{i}"].append(getattr(fake, cla)())
                    personal_type.append(personal)
                    original_class.append(f"{cla}_{lan}")
                    df = pd.concat([df, pd.DataFrame(data)], axis=1)
            for i in range(1,num_col):                           
                data = {
                        f"{cla}_mixed_{i}": [],
                    }
                for (fake, lan) in zip([FAKE_EN, FAKE_FR, FAKE_DE],["en","fr_FR","de_DE"]):
                    for _ in range(33 if lan != "en" else 34):
                        if cla in profile_attributes:
                            data[f"{cla}_mixed_{i}"].append(getattr(fake, "profile")()[cla])
                        else:
                            data[f"{cla}_mixed_{i}"].append(getattr(fake, cla)())
                personal_type.append(personal)
                original_class.append(f"{cla}_mixed")
                df = pd.concat([df, pd.DataFrame(data)], axis=1)
    return personal_type, df, original_class
  
def generate_data_unique(classes, num_col, personal_type, df, original_class, personal):
    mixed_data = dict()
    for cla in classes:
        if cla in profile_attributes_unique:
            Faker.seed(42)
            f1 = Faker("en")
            val1 = getattr(f1, "profile")()[cla]
            Faker.seed(42)
            f2 = Faker("fr_FR")
            val2 = getattr(f2, "profile")()[cla]
        else:
            Faker.seed(42)
            f1 = Faker("en")
            val1 = getattr(f1, cla)()
            Faker.seed(42)
            f2 = Faker("fr_FR")
            val2 = getattr(f2, cla)()
        if val1 == val2:
            values = set()
            num_generate = (num_col *4.5)
            while len(values) < num_generate*100:
                if cla not in profile_attributes_unique:
                    values.add(getattr(f1, cla)())
                else:
                    values.add(getattr(f1, "profile")()[cla])
            values = sorted(values)
            random.shuffle(values)
            for lan in ["en","fr_FR","de_DE", "mixed"]:
                for i in range(1,num_col):                           
                    data = {
                            f"{cla}_{lan}_{i}": [],
                        }
                    popped_elements = values[:100]
                    values = values[100:]
                    data[f"{cla}_{lan}_{i}"] = popped_elements
                    personal_type.append(personal)
                    original_class.append(f"{cla}_{lan}")
                    df = pd.concat([df, pd.DataFrame(data)], axis=1)
        else:
            all_values = set()
            for (fake, lan) in zip([FAKE_EN, FAKE_FR, FAKE_DE], ["en","fr_FR","de_DE"]):
                values = set()
                num_generate = (num_col *1.5)
                while len(values) < num_generate*100:
                    if cla not in profile_attributes_unique:
                        adding_val = getattr(fake, cla)()
                        if adding_val not in all_values:
                            values.add(adding_val)
                    else:
                        adding_val = getattr(fake, "profile")()[cla]
                        if adding_val not in all_values:
                            values.add(adding_val)
                values = sorted(values)
                random.shuffle(values)
                all_values = all_values.union(set(values))
                for i in range(1,num_col):                           
                    data = {
                            f"{cla}_{lan}_{i}": [],
                        }
                    popped_elements = values[:100]
                    values = values[100:]
                    data[f"{cla}_{lan}_{i}"] = popped_elements
                    personal_type.append(personal)
                    original_class.append(f"{cla}_{lan}")
                    df = pd.concat([df, pd.DataFrame(data)], axis=1)
                mixed_data[f'{lan}_{cla}'] = values
            for i in range(1,num_col):                           
                    data = {
                            f"{cla}_mixed_{i}": [],
                        }
                    for lan in ["en","fr_FR","de_DE"]:
                        if lan == "en":
                            num=34
                        else:
                            num=33
                        for v in mixed_data[f'{lan}_{cla}'][num*(i-1):num*(i-1)+num]:
                            data[f"{cla}_mixed_{i}"].append(v)
                    personal_type.append(personal)
                    original_class.append(f"{cla}_mixed")
                    df = pd.concat([df, pd.DataFrame(data)], axis=1)
    return personal_type, df, original_class
    
    
def generate_random_string(length):
    # Define the characters to use (letters and digits)
    characters = string.ascii_letters + string.digits
    return ''.join(random.choice(characters) for _ in range(length))

def extract_label(df):
    df_labels = df.iloc[100,:]
    df_classes = df.iloc[101,:]
    df = df.iloc[:100,:]
    df_labels = pd.DataFrame(df_labels)
    df_classes = pd.DataFrame(df_classes)
    df_labels = df_labels.rename(columns={df_labels.columns[0]: "label"}).reset_index(drop=True)
    df_classes = df_classes.rename(columns={df_classes.columns[0]: "class"}).reset_index(drop=True)
    return df, df_labels, df_classes

  
def rename_columns(df, num_personal, num_non_personal):
    for attributes in [personal_attributes, non_personal_attributes, profile_attributes]:
        for p in attributes:
            if attributes == personal_attributes or attributes == profile_attributes:
                num_col = num_personal
            else:
                num_col = num_non_personal
            for lan in ["en","fr_FR","de_DE", "mixed"]:
                for i in range(1,num_col, 2):
                    random_length = random.randint(5, 20)
                    random_string = generate_random_string(random_length)
                    df.rename(columns={f"{p}_{lan}_{i}": random_string}, inplace=True)
            for i in range(1, num_col, 2):
                random_length = random.randint(5, 20)
                random_string = generate_random_string(random_length)
                df.rename(columns={f"mixed_{p}_{i}": random_string}, inplace=True)
    return df




def create_dataset():
    personal_type = []
    original_class = []
    df = pd.DataFrame()

    personal_type, df, original_class = generate_data(set(personal_attributes + profile_attributes) - set(personal_attributes_unique + profile_attributes_unique), 286, personal_type, df, original_class, "personal")
    personal_type, df, original_class = generate_data(set(non_personal_attributes) - set(non_personal_attributes_unique), 364, personal_type, df, original_class, "non-personal")

    # generate columns were only unique values are possible
    personal_type, df, original_class = generate_data_unique(personal_attributes_unique + profile_attributes_unique, 
                                                      286, personal_type, df, original_class, "personal")
    personal_type, df, original_class = generate_data_unique(non_personal_attributes_unique,
                                                      364, personal_type, df, original_class, "non-personal")
    
    df = rename_columns(df, 286, 364)

    #shuffle the columns
    labels = pd.DataFrame(personal_type).T
    classes = pd.DataFrame(original_class).T
    labels.columns = df.columns
    classes.columns = df.columns
    df = pd.concat([df, labels]).reset_index(drop=True)
    df = pd.concat([df, classes]).reset_index(drop=True)
    df = df.sample(frac=1, axis=1, random_state=42).reset_index(drop=True)
    
    #save the data
    df_final, labels, classes = extract_label(df)
    df_final.to_csv("all.csv", index=False)
    labels.to_csv("all_labels.csv", index=False)
    classes.to_csv("all_classes.csv", index=False)

In [6]:
16000 / (len(personal_attributes) + len(profile_attributes))/4

285.7142857142857

In [7]:
16000 / len(non_personal_attributes) /4

363.6363636363636

In [8]:
create_dataset()

Train/Val/Test Split

In [12]:
df_all = pd.read_csv("all.csv")
df_all_labels = pd.read_csv("all_labels.csv")
df_all_classes = pd.read_csv("all_classes.csv")

  df_all = pd.read_csv("faker/all.csv")


In [13]:
split1 = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=42)
train_idx, temp_idx = next(split1.split(df_all.T, df_all_classes["class"]))
train_data = df_all.iloc[:, train_idx]
train_classes = df_all_classes.T[train_idx].T
train_labels = df_all_labels.T[train_idx].T
temp_data = df_all.iloc[:, temp_idx]  
temp_classes = df_all_classes.T[temp_idx].T
temp_labels = df_all_labels.T[temp_idx].T

split2 = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
val_idx, test_idx = next(split2.split(temp_data.T, temp_classes))

val_data = temp_data.iloc[:, val_idx]
val_classes = temp_classes.T.iloc[:,val_idx].T
val_labels = temp_labels.T.iloc[:,val_idx].T
test_data = temp_data.iloc[:, test_idx]
test_classes = temp_classes.T.iloc[:,test_idx].T
test_labels = temp_labels.T.iloc[:,test_idx].T


print("Train data shape:", train_data.shape)
print("Validation data shape:", val_data.shape)
print("Test data shape:", test_data.shape)

Train data shape: (100, 19159)
Validation data shape: (100, 6386)
Test data shape: (100, 6387)


In [14]:
train_data.to_csv("train.csv", index=False)
train_labels.to_csv("train_labels.csv", index=False)
train_classes.to_csv("train_classes.csv", index=False)

val_data.to_csv("dev.csv", index=False)
val_labels.to_csv("dev_labels.csv", index=False)
val_classes.to_csv("dev_classes.csv", index=False)

test_data.to_csv("test.csv", index=False)
test_labels.to_csv("test_labels.csv", index=False)
test_classes.to_csv("test_classes.csv", index=False)