This is a notebook to control if the new datasets only contain unique values for specific classes

In [1]:
import pandas as pd
import plotly.graph_objects as go

# DeSSI

In [2]:
def convert_dataframes(df, df_labels):
    df_labels = df_labels.T
    df_labels.columns = df.columns
    df = pd.concat([df, df_labels], ignore_index=True)
    return df

def load_data(path, labels=True):
    if labels == True:
        add = "labels"
    else:
        add = "classes"
    train = pd.read_csv(path + 'train.csv')
    train_labels = pd.read_csv(path + f'train_{add}.csv')
    dev = pd.read_csv(path + f'dev.csv')
    dev_labels = pd.read_csv(path + f'dev_{add}.csv')
    test = pd.read_csv(path + 'test.csv')
    test_labels = pd.read_csv(path + f'test_{add}.csv')
    train = convert_dataframes(train, train_labels)
    dev = convert_dataframes(dev, dev_labels)
    test = convert_dataframes(test, test_labels)
    return train, dev, test

In [7]:
train_dessi, dev_dessi, test_dessi = load_data('../../dessi-mf/dessi_unique/')

In [8]:
train_dev_dessi = train_dessi.merge(dev_dessi, left_index=True, right_index=True)

In [9]:
sol = dict()
for label in train_dev_dessi.loc[100].unique():
    s = train_dev_dessi.loc[100] == label
    train_dev = train_dev_dessi.iloc[:100,:]
    train_data = set(train_dev[s.index[s==True].tolist()].values.flatten()) 
    s2 = test_dessi.loc[100] == label
    
    has_value = []
    test = test_dessi.iloc[:100,:]
    for c in s2.index[s2==True].tolist():
        has_value.append(any(val in train_data for val in test[c].values))
    sol[label] = dict(pd.Series(has_value, dtype="object").value_counts(normalize=True))

In [10]:
pop_keys = []
for key, val in sol.items():
    if val == dict():
        pop_keys.append(key)
for key in pop_keys:
    sol.pop(key)
sol = {key: sol[key] for key in sorted(sol.keys(), reverse=True)}

In [11]:
fig = go.Figure()
fig.add_trace(go.Bar(y=list(sol.keys()), x=[a.get(False) for a in sol.values()], name="does not contain<br>element from train",
                      orientation='h', marker=dict(color="green")))
fig.add_trace(go.Bar(y=list(sol.keys()), x=[a.get(True) for a in sol.values()], name="contains element<br>from train", orientation='h', marker=dict(color="red")))
fig.update_layout(barmode='stack', title="Percentage of columns in DeSSI's test data per class that contain a value"
                  "<br>that also appears in the training data for the same class"
                  "<br><sup>classes which are allowed to contain values from the training data are marked red</sup>", 
                  width=1000, height=800)
fig.update_xaxes(title="Classes")
fig.update_yaxes(title="Percentage")

highlighted_classes = {"Nationality", "Religion", "Gender", "Sexuality", "Race"}  # Replace with your keys to highlight
custom_labels = [
    f"<b style='color:blue;'>{key}</b>" if key in highlighted_classes else key
    for key in sol.keys()
]

fig.update_yaxes(
    ticktext=custom_labels,
    tickvals=list(sol.keys()),  # Ensure alignment of ticks and custom labels
    title="Percentage"
)

fig.add_trace(
    go.Scatter(
        x=[None],  # Empty trace
        y=[None],
        mode='markers',
        marker=dict(color='blue', size=10),
        name="Classes allowed to contain training data"
    )
)

fig.show()

# Mimesis

In [13]:
train_mimesis, dev_mimesis, test_mimesis = load_data('../../dessi-mf/mimesis/', labels=False)


Columns (1855,4493,4607,5938,8673,9994,10789,10922,12402,14133,16114,16224,16975,17584,17907) have mixed types. Specify dtype option on import or set low_memory=False.



In [14]:
# adjust labels
def adjust_labels(data2):
    data = data2.copy()
    newlabels = []
    for i in range(len(data.columns)):
        if "mixed" in data.iloc[100,i] or "fr_FR" in data.iloc[100,i] or "de_DE" in data.iloc[100,i]:
            newlabels.append(data.iloc[100,i][:-6])
        elif "_en" == data.iloc[100,i][-3:] or "_fr" == data.iloc[100,i][-3:] or "_de" == data.iloc[100,i][-3:]:
            newlabels.append(data.iloc[100,i][:-3])
        else:
            newlabels.append(data.iloc[100,i])
    data.iloc[100,:] = newlabels
    return data

In [15]:
train_dev_mimesis = train_mimesis.merge(dev_mimesis, left_index=True, right_index=True)
train_dev_mimesis = adjust_labels(train_dev_mimesis)
test_mimesis = adjust_labels(test_mimesis)

In [16]:
sol = dict()
for label in train_dev_mimesis.loc[100].unique():
    s = train_dev_mimesis.loc[100] == label
    train_dev = train_dev_mimesis.iloc[:100,:]
    train_data = set(train_dev[s.index[s==True].tolist()].values.flatten()) 
    s2 = test_mimesis.loc[100] == label
    
    has_value = []
    test = test_mimesis.iloc[:100,:]
    for c in s2.index[s2==True].tolist():
        has_value.append(any(val in train_data for val in test[c].values))
    sol[label] = dict(pd.Series(has_value).value_counts(normalize=True))
sol = {key: sol[key] for key in sorted(sol.keys(), reverse=True)}

In [17]:
personal_attributes = [
    "address", "academic_degree", "blood_type", "email", "first_name", "full_name", "last_name", 
    "gender", "language", "nationality", "occupation",
    "phone_number", "political_views", "title", "worldview", 
    "credit_card_number"
]

non_personal_attributes = [
    "company", "dish", "drink", "answer", "color", "isbn", "duration", 
    "programming_language", "system_quality_attribute", "version", "float_number", 
    "integer_number", "user_agent", "graphics", "cpu", "phone_model", 
    "manufacturer", "resolution", "ility", "word", "measure_unit", 
    "city", "street_name",
]

personal_attributes_unique = [
            "address", "email", "full_name",
            "phone_number", "credit_card_number"
        ]

non_personal_attributes_unique = ["isbn", "version", "float_number"]

duplicates_allowed = set(personal_attributes + non_personal_attributes) - set(personal_attributes_unique) - set(non_personal_attributes_unique)

In [18]:
fig = go.Figure()
fig.add_trace(go.Bar(y=list(sol.keys()), x=[a.get(False) for a in sol.values()], name="does not contain<br>element from train",
                      orientation='h', marker=dict(color="green")))
fig.add_trace(go.Bar(y=list(sol.keys()), x=[a.get(True) for a in sol.values()], name="contains element<br>from train", orientation='h', marker=dict(color="red")))
fig.update_layout(barmode='stack', title="Percentage of columns in Mimesis test data per class that contain a value<br>that also appears in the training data for the same class", 
                  width=1000, height=1300)
fig.update_xaxes(title="Classes")
fig.update_yaxes(title="Percentage")

highlighted_classes = duplicates_allowed  # Replace with your keys to highlight
custom_labels = [
    f"<b style='color:blue;'>{key}</b>" if key in highlighted_classes else key
    for key in sol.keys()
]

fig.update_yaxes(
    ticktext=custom_labels,
    tickvals=list(sol.keys()),  # Ensure alignment of ticks and custom labels
    title="Percentage"
)

fig.add_trace(
    go.Scatter(
        x=[None],  # Empty trace
        y=[None],
        mode='markers',
        marker=dict(color='blue', size=10),
        name="Classes allowed to contain training data"
    )
)

fig.show()

# Faker

In [19]:
train_faker, dev_faker, test_faker = load_data('../../dessi-mf/faker/', labels=False)


Columns (673,711,1251,1458,1502,1734,1847,1856,1878,1937,2085,2498,2760,3043,3617,4024,4165,4409,4563,4708,4912,5353,5446,5724,6034,6381,6505,6781,7835,8237,8587,8891,9099,9104,9815,9853,10586,10948,11316,11571,13141,13847,13910,14134,14212,14381,14839,15295,15582,16355,16368,18340,18731) have mixed types. Specify dtype option on import or set low_memory=False.



In [20]:
train_dev_faker = train_faker.merge(dev_faker, left_index=True, right_index=True)
train_dev_faker = adjust_labels(train_dev_faker)
test_faker = adjust_labels(test_faker)

In [21]:
sol = dict()
for label in train_dev_faker.loc[100].unique():
    s = train_dev_faker.loc[100] == label
    train_dev = train_dev_faker.iloc[:100,:]
    train_data = set(train_dev[s.index[s==True].tolist()].values.flatten()) 
    s2 = test_faker.loc[100] == label
    
    has_value = []
    test = test_faker.iloc[:100,:]
    for c in s2.index[s2==True].tolist():
        has_value.append(any(val in train_data for val in test[c].values))
    sol[label] = dict(pd.Series(has_value).value_counts(normalize=True))
sol = {key: sol[key] for key in sorted(sol.keys(), reverse=True)}

In [22]:
personal_attributes = [
    "address", "iban","swift","credit_card_number", "email",
    "job","first_name","last_name","name","phone_number",
    "profile","ssn","passport_owner"
]

profile_attributes = [
    "sex", "blood_group", "current_location"
]

non_personal_attributes = [
    "color","ean","credit_card_provider","company","currency",
    "url","isbn13","pyint","pyfloat", "date"
]

personal_attributes_unique = [
    "address","iban","swift","credit_card_number","email",
    "name","phone_number", "ssn","passport_owner"
]

profile_attributes_unique = ["current_location"]

non_personal_attributes_unique = [
    "color","ean", "isbn13", "pyfloat"
    ]

duplicates_allowed = set(personal_attributes + non_personal_attributes + profile_attributes) - {"profile"}
duplicates_allowed = duplicates_allowed - set(personal_attributes_unique) - set(non_personal_attributes_unique) - set(profile_attributes_unique)

In [23]:
fig = go.Figure()
fig.add_trace(go.Bar(y=list(sol.keys()), x=[a.get(False) for a in sol.values()], name="does not contain<br>element from train",
                      orientation='h', marker=dict(color="green")))
fig.add_trace(go.Bar(y=list(sol.keys()), x=[a.get(True) for a in sol.values()], name="contains element<br>from train", orientation='h', marker=dict(color="red")))
fig.update_layout(barmode='stack', title="Percentage of columns in Faker test data per class that contain a value<br>that also appears in the training data for the same class", 
                  width=1000, height=900)
fig.update_xaxes(title="Classes")
fig.update_yaxes(title="Percentage")

highlighted_classes = duplicates_allowed  # Replace with your keys to highlight
custom_labels = [
    f"<b style='color:blue;'>{key}</b>" if key in highlighted_classes else key
    for key in sol.keys()
]

fig.update_yaxes(
    ticktext=custom_labels,
    tickvals=list(sol.keys()),  # Ensure alignment of ticks and custom labels
    title="Percentage"
)

fig.add_trace(
    go.Scatter(
        x=[None],  # Empty trace
        y=[None],
        mode='markers',
        marker=dict(color='blue', size=10),
        name="Classes allowed to contain training data"
    )
)

fig.show()

# Combination of all three Datasets

In [24]:
train_all, dev_all, test_all = load_data(f'../../dessi-mf/dessi-mf/', labels=False)


Columns (1064,1244,1756,1839,2016,2089,2405,2842,3037,3480,3495,4183,4278,5216,5978,6085,6208,6652,7602,7707,7722,7855,8474,9458,9821,10964,11572,12124,13546,14228,14630,14996,15060,16533,17405,18488,18831,19707,19792,20219,20440,20494,22031,22439,22659,22730,23028,24180,24227,25069,26422,27362,27917,28351,28586,30127,30376,30408,31655,31759,31867,32396,32561,33209,33682,34142,34173,34349,35084,36234,36248,36527,37252,37775,37879,38357,39070,39280,40053,40358,42044,42507,43106) have mixed types. Specify dtype option on import or set low_memory=False.



In [25]:
train_dev_all= train_all.merge(dev_all, left_index=True, right_index=True)
train_dev_all = adjust_labels(train_dev_all)
test_all = adjust_labels(test_all)

In [26]:
sol = dict()
for label in train_dev_all.loc[100].unique():
    s = train_dev_all.loc[100] == label
    train_dev = train_dev_all.iloc[:100,:]
    train_data = set(train_dev[s.index[s==True].tolist()].values.flatten()) 
    s2 = test_all.loc[100] == label
    
    has_value = []
    test = test_all.iloc[:100,:]
    for c in s2.index[s2==True].tolist():
        has_value.append(any(val in train_data for val in test[c].values))
    sol[label] = dict(pd.Series(has_value, dtype="object").value_counts(normalize=True))
sol = {key: sol[key] for key in sorted(sol.keys(), reverse=True)}

In [27]:
highlighted_classes = {"Nationality", "Religion", "Gender", "Sexuality", "Race"}
unique_dessi = set([a for a in train_dessi.iloc[100,:].value_counts().keys() if "," not in str(a)]) - highlighted_classes

unique_attributes = [
            "address", "email", "full_name",
            "phone_number", "credit_card_number",
            "isbn", "version", "float_number", "current_location", 
            "iban", "swift", "ssn", "passport_owner", "name", "ean", "pyfloat", "isbn13"]

unique_attributes = set(unique_attributes).union(unique_dessi)
duplicates_allowed = set(sol.keys()) - unique_attributes

In [28]:
sol.pop("Organization,Phone_number", None)
fig = go.Figure()
fig.add_trace(go.Bar(y=list(sol.keys()), x=[a.get(False) for a in sol.values()], name="does not contain<br>element from train",
                      orientation='h', marker=dict(color="green")))
fig.add_trace(go.Bar(y=list(sol.keys()), x=[a.get(True) for a in sol.values()], name="contains element<br>from train", orientation='h', marker=dict(color="red")))
fig.update_layout(barmode='stack', title="Percentage of columns in Faker test data per class that contain a value<br>that also appears in the training data for the same class", 
                  width=1000, height=1300)
fig.update_xaxes(title="Classes")
fig.update_yaxes(title="Percentage")

highlighted_classes = duplicates_allowed  # Replace with your keys to highlight
custom_labels = [
    f"<b style='color:blue;'>{key}</b>" if key in highlighted_classes else key
    for key in sol.keys()
]

fig.update_yaxes(
    ticktext=custom_labels,
    tickvals=list(sol.keys()),  # Ensure alignment of ticks and custom labels
    title="Percentage"
)

fig.add_trace(
    go.Scatter(
        x=[None],  # Empty trace
        y=[None],
        mode='markers',
        marker=dict(color='blue', size=10),
        name="Classes allowed to contain training data"
    )
)

fig.show()