# Test Microsoft Presidio on other datasets for binary classification

In [1]:
#!pip install presidio_structured

In [1]:
import pandas as pd
import ast
from presidio_analyzer import AnalyzerEngine
import json
from presidio_structured import PandasAnalysisBuilder

2025-03-22 15:37:23.099478: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-22 15:37:23.123908: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Datasets containing Mimesis and Faker classes in Chinese and Italian, plus Kaggle datasets

In [2]:
# Load each benchmark as a (data, labels) pair.
# Every label table is transposed on load; further down the notebook a copy of it is
# written out as the initial content of the corresponding results CSV.
data_languages = pd.read_csv("../../datasets/test_languages/test.csv")
labels_languages = pd.read_csv("../../datasets/test_languages/test_labels_personal.csv").T
data_kaggle = pd.read_csv("../../datasets/kaggle_datasets/all_datasets.csv")
labels_kaggle = pd.read_csv("../../datasets/kaggle_datasets/all_datasets_labels_personal.csv").T
data_openml = pd.read_csv("../../datasets/openml_datasets/all_datasets.csv")
labels_openml = pd.read_csv("../../datasets/openml_datasets/all_datasets_labels_personal.csv").T
data_openml_2 = pd.read_csv("../../datasets/openml_datasets_2/all_datasets.csv")
labels_openml_2 = pd.read_csv("../../datasets/openml_datasets_2/all_datasets_labels_personal.csv").T
data_medical = pd.read_csv("../../datasets/freiburg-medical/test.csv")
labels_medical = pd.read_csv("../../datasets/freiburg-medical/test_labels_personal.csv").T

In [3]:
# OpenMLAll data table and its transposed label table.
data_openmlall = pd.read_csv("../../datasets/OpenMLall_data/OpenMLAll_data.csv")
labels_openmlall = pd.read_csv("../../datasets/OpenMLall_data/OpenMLAll_data_labels.csv").T

In [4]:
# data_openmlall

In [5]:
# MIMIC data table and its transposed label table.
data_mimic = pd.read_csv('../../datasets/MIMICIIIandHonda/Mimic_for_Use.csv')
labels_mimic = pd.read_csv('../../datasets/MIMICIIIandHonda/Mimic_labels.csv').T

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,row_id,subject_id,hadm_id,icustay_id,dbsource,eventtype,prev_careunit,curr_careunit,...,location,locationcategory,first_careunit,last_careunit,first_wardid,last_wardid,drg_type,drg_code,drg_severity,drg_mortality
0,0,0,54440,10006.0,142345.0,206504.0,carevue,admit,,MICU,...,,,,,,,,,,
1,1,1,54441,10006.0,142345.0,,carevue,transfer,MICU,,...,,,,,,,,,,
2,2,2,54442,10006.0,142345.0,,carevue,discharge,,,...,,,,,,,,,,
3,3,3,54460,10011.0,105331.0,232110.0,carevue,admit,,MICU,...,,,,,,,,,,
4,4,4,54461,10011.0,105331.0,,carevue,discharge,MICU,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,195,195,54915,10119.0,157466.0,,carevue,admit,,,...,,,,,,,,,,
196,196,196,54916,10119.0,157466.0,247686.0,carevue,transfer,,SICU,...,,,,,,,,,,
197,197,197,54917,10119.0,157466.0,,carevue,transfer,SICU,,...,,,,,,,,,,
198,198,198,54918,10119.0,157466.0,,carevue,transfer,,,...,,,,,,,,,,


In [17]:
# Remove leftover index columns from earlier CSV round-trips.
# errors="ignore" keeps this cell idempotent: re-running it after the columns are
# already gone no longer raises a KeyError.
data_mimic = data_mimic.drop(columns=['Unnamed: 0','Unnamed: 0.1'], errors="ignore")

In [9]:
# data_mimic.to_csv('../../datasets/MIMICIIIandHonda/Mimic_for_Use.csv')

In [6]:
# Second MIMIC export and its transposed label table.
data_mimicNew = pd.read_csv('../../datasets/MIMICIIIandHonda/MIMIC_100_all_FIN2025.csv')
labels_mimicNew = pd.read_csv('../../datasets/MIMICIIIandHonda/fmimic_labels1.csv').T

Method 1: 
- Input: Each cell value of a row is prefixed with its column name, like a dictionary entry; the entries are separated by `, `.  
Example: "0: Email, 1: Phone_number, 2: Email, ..."
- Output: The detected classes are matched to the columns based on the position in the string where personal data is detected.

In [7]:
def convert_to_text_rowwise1(df, row, chunk_size=100):
    """Serialize one row of ``df`` as ``"<column>: <value>, "`` entries, split into chunks.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to read from.
    row : label
        Index label of the row to serialize (looked up with ``df.loc``).
    chunk_size : int, default 100
        Number of columns whose entries are concatenated into one text chunk.

    Returns
    -------
    (list[str], list[list[tuple[int, int]]])
        The text chunks, and for every chunk the inclusive ``(start, end)`` character
        span of each column entry. Spans keep counting across chunks, i.e. they are
        offsets into the concatenation of all chunks, not into the individual chunk.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    s_array = []
    string_position_array = []
    parts = []              # entries of the chunk currently being built
    string_position = []    # spans of the chunk currently being built
    end_index = -1          # so that the first entry starts at offset 0
    for e, c in enumerate(df.columns):
        # Read cell by cell: pulling the whole row out first could upcast mixed
        # dtypes and change how individual values are rendered by str().
        add_string = str(c) + ": " + str(df.loc[row, c]) + ", "
        parts.append(add_string)
        start_index = end_index + 1
        end_index = start_index + len(add_string) - 1
        string_position.append((start_index, end_index))
        if (e + 1) % chunk_size == 0:
            # Chunk is full: flush it. end_index is deliberately NOT reset.
            s_array.append("".join(parts))
            string_position_array.append(string_position)
            parts = []
            string_position = []
    if string_position:
        # Flush the trailing, partially filled chunk.
        s_array.append("".join(parts))
        string_position_array.append(string_position)
    return s_array, string_position_array

In [8]:
def apply_analyzer(a, file_path, convert_method, data, num_samp=100):                  # a is the row where the analysis starts (lets a run be resumed after a kernel crash)
    """Run the Presidio analyzer row by row and record the detections per column.

    Parameters
    ----------
    a : int
        First row index to analyse; rows ``a`` .. ``num_samp - 1`` are processed.
    file_path : str
        CSV file that is read at the start and rewritten after every analysed row.
    convert_method : callable
        Called as ``convert_method(data, i)``; must return a list of text chunks and,
        per chunk, a list of inclusive ``(start, end)`` spans (one span per column).
    data : pandas.DataFrame
        Table handed to ``convert_method``.
    num_samp : int, default 100
        Exclusive upper bound of the row indices to analyse.

    Results layout: for data row ``i``, results row ``2*i+1`` holds the column spans
    and results row ``2*i+2`` holds, per column, a list of ``"<ENTITY_TYPE>_<score>"``
    strings. Detections that reach beyond one column span are prefixed with
    ``"shared "`` and added to every column they touch.
    """
    results = pd.read_csv(file_path)
    analyzer = AnalyzerEngine(supported_languages=["en", "hr"])               # other nlp_engine possible, support english and croatian
    for i in range(a,num_samp):
        print(i)
        position_index = i*2+1
        entities_index = i*2+2
        data_string, string_position = convert_method(data, i)
        # Flatten the per-chunk span lists into one span per column.
        # NOTE(review): the row assignment presumably requires as many spans as
        # `results` has columns - confirm data and label tables are aligned.
        results.loc[position_index] = [item for sublist in string_position for item in sublist]
        empty_row = [[] for _ in range(results.shape[1])]
        results.loc[entities_index] = empty_row
        # Offset that converts chunk-relative analyzer positions into the
        # cross-chunk offsets used by the stored spans.
        last_stringposition = 0
        for e, substring in enumerate(data_string):
            results_analyzer = analyzer.analyze(text=substring, language="en")                # multiple languages could be a problem: analyze() accepts only one language
            column = 0
            # Process detections left to right so `column` only ever moves forward.
            results_analyzer.sort(key=lambda x: x.start)
            for result in results_analyzer:
                result_tuple = (result.start + last_stringposition, result.end + last_stringposition- 1)               # result.end gives the next character after the entity
                # NOTE(review): .iloc is used with the same numbers as the .loc labels
                # above, so row labels are assumed to equal row positions.
                string_tuple = results.iloc[position_index, column]
                while result_tuple[0] > string_tuple[1]:                    # detection starts after this column's span: advance to the next column
                    column += 1
                    string_tuple = results.iloc[position_index, column]
                if  string_tuple[0] <= result_tuple[0] and result_tuple[1] <= string_tuple[1]:      # detection lies completely inside one column span
                    results.iloc[entities_index,column].append(str(result.entity_type) + "_" + str(result.score))
                else:
                    counter = 0
                    while True:
                        results.iloc[entities_index,column+counter].append("shared " + str(result.entity_type) + "_" + str(result.score))             # detection stretches over several columns
                        counter += 1
                        # Stop at the last column of the results table ...
                        if column+counter >= results.shape[1]:
                            break
                        string_tuple = results.iloc[position_index, column+counter]
                        # ... or as soon as the next column starts after the detection ends.
                        if result_tuple[1] < string_tuple[0]:
                            break
            # The next chunk starts right after the last span of this chunk.
            last_stringposition = string_position[e][-1][1] + 1
        # Persist after every row so that an interrupted run can be resumed via `a`.
        results.to_csv(file_path, index=False)

In [9]:
#only execute first time
# Seed the results file with the transposed label table (this overwrites an existing file),
# then run Method 1 on rows 0-99 of the test_languages data.
results_rowwise0 = labels_languages.copy()
results_rowwise0.to_csv("binary_results/test_languages_res/results_rowwise1.csv", index=False)

apply_analyzer(0, "binary_results/test_languages_res/results_rowwise1.csv", convert_to_text_rowwise1, data_languages)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [10]:
#only execute first time
# Seed the results file with the transposed label table (this overwrites an existing file),
# then run Method 1 on rows 0-99 of the Kaggle data.
results_rowwise1 = labels_kaggle.copy()
results_rowwise1.to_csv("binary_results/kaggle/results_rowwise1.csv", index=False)

apply_analyzer(0, "binary_results/kaggle/results_rowwise1.csv", convert_to_text_rowwise1, data_kaggle)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [11]:
#only execute first time
# Seed the results file with the transposed label table (this overwrites an existing file),
# then run Method 1 on rows 0-99 of the OpenML data.
results_rowwise2 = labels_openml.copy()
results_rowwise2.to_csv("binary_results/openml/results_rowwise1.csv", index=False)

apply_analyzer(0, "binary_results/openml/results_rowwise1.csv", convert_to_text_rowwise1, data_openml)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [12]:
#only execute first time
# Seed the results file with the transposed label table (this overwrites an existing file),
# then run Method 1 on rows 0-99 of the second OpenML data set.
results_rowwise3 = labels_openml_2.copy()
results_rowwise3.to_csv("binary_results/openml_2/results_rowwise1.csv", index=False)

apply_analyzer(0, "binary_results/openml_2/results_rowwise1.csv", convert_to_text_rowwise1, data_openml_2)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [13]:
#only execute first time
# Seed the results file with the transposed label table (this overwrites an existing file),
# then run Method 1 on rows 0-99 of the medical data.
results_rowwise4 = labels_medical.copy()
results_rowwise4.to_csv("binary_results/medical/results_rowwise1.csv", index=False)

apply_analyzer(0, "binary_results/medical/results_rowwise1.csv", convert_to_text_rowwise1, data_medical)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [20]:
# Seed the results file with the transposed label table (this overwrites an existing file),
# then run Method 1 on rows 0-199 of the MIMIC data (num_samp=200).
results_rowwise1mim = labels_mimic.copy()
results_rowwise1mim.to_csv("binary_results/mimicIII/results_rowwise1.csv", index=False)

apply_analyzer(0, "binary_results/mimicIII/results_rowwise1.csv", convert_to_text_rowwise1, data_mimic, 200)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199


In [21]:
# Seed the results file with the transposed label table (this overwrites an existing file),
# then run Method 1 on rows 0-99 of the second MIMIC export.
results_rowwise1mimnew = labels_mimicNew.copy()
results_rowwise1mimnew.to_csv("binary_results/MIMICNew/results_rowwise1.csv", index=False)

apply_analyzer(0, "binary_results/MIMICNew/results_rowwise1.csv", convert_to_text_rowwise1, data_mimicNew)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [24]:
# Seed the results file with the transposed label table (this overwrites an existing file),
# then run Method 1 on rows 0-99 of the OpenMLAll data.
results_rowwise1openall = labels_openmlall.copy()
results_rowwise1openall.to_csv("binary_results/OpenMLall/results_rowwise1.csv", index=False)

apply_analyzer(0, "binary_results/OpenMLall/results_rowwise1.csv", convert_to_text_rowwise1, data_openmlall)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In all results DataFrames, the first row contains the ground-truth labels. The remaining rows alternate: one row holds the string position of the corresponding cell value, and the row beneath it holds the personal-data classes detected for that cell value.

Method 2: Only use the cell values without the column names
- Input: Only the cell values of a row, without the column names; the values are separated by `, `.  
Example: "Email, Phone_number, Email, ..."
- Output: The detected classes are matched to the columns based on the position in the string where personal data is detected.

In [25]:
def convert_to_text_rowwise2(df, row, chunk_size=100):
    """Serialize one row of ``df`` as ``"<value>, "`` entries (no column names), split into chunks.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to read from.
    row : label
        Index label of the row to serialize (looked up with ``df.loc``).
    chunk_size : int, default 100
        Number of columns whose entries are concatenated into one text chunk.

    Returns
    -------
    (list[str], list[list[tuple[int, int]]])
        The text chunks, and for every chunk the inclusive ``(start, end)`` character
        span of each cell entry. Spans keep counting across chunks, i.e. they are
        offsets into the concatenation of all chunks, not into the individual chunk.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    s_array = []
    string_position_array = []
    parts = []              # entries of the chunk currently being built
    string_position = []    # spans of the chunk currently being built
    end_index = -1          # so that the first entry starts at offset 0
    for e, c in enumerate(df.columns):
        # Read cell by cell: pulling the whole row out first could upcast mixed
        # dtypes and change how individual values are rendered by str().
        add_string = str(df.loc[row, c]) + ", "
        parts.append(add_string)
        start_index = end_index + 1
        end_index = start_index + len(add_string) - 1
        string_position.append((start_index, end_index))
        if (e + 1) % chunk_size == 0:
            # Chunk is full: flush it. end_index is deliberately NOT reset.
            s_array.append("".join(parts))
            string_position_array.append(string_position)
            parts = []
            string_position = []
    if string_position:
        # Flush the trailing, partially filled chunk.
        s_array.append("".join(parts))
        string_position_array.append(string_position)
    return s_array, string_position_array

In [26]:
#only execute first time
# Seed the results file with the transposed label table (this overwrites an existing file),
# then run Method 2 on rows 0-99 of the test_languages data.
results_rowwise21 = labels_languages.copy()
results_rowwise21.to_csv("binary_results/test_languages_res/results_rowwise2.csv", index=False)

apply_analyzer(0, "binary_results/test_languages_res/results_rowwise2.csv", convert_to_text_rowwise2, data_languages)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [27]:
#only execute first time
# Seed the results file with the transposed label table (this overwrites an existing file),
# then run Method 2 on rows 0-99 of the Kaggle data.
results_rowwise22 = labels_kaggle.copy()
results_rowwise22.to_csv("binary_results/kaggle/results_rowwise2.csv", index=False)

apply_analyzer(0, "binary_results/kaggle/results_rowwise2.csv", convert_to_text_rowwise2, data_kaggle)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [28]:
#only execute first time
# Seed the results file with the transposed label table (this overwrites an existing file),
# then run Method 2 on rows 0-99 of the OpenML data.
results_rowwise23 = labels_openml.copy()
results_rowwise23.to_csv("binary_results/openml/results_rowwise2.csv", index=False)

apply_analyzer(0, "binary_results/openml/results_rowwise2.csv", convert_to_text_rowwise2, data_openml)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [29]:
#only execute first time
# Seed the results file with the transposed label table (this overwrites an existing file),
# then run Method 2 on rows 0-99 of the second OpenML data set.
results_rowwise24 = labels_openml_2.copy()
results_rowwise24.to_csv("binary_results/openml_2/results_rowwise2.csv", index=False)

apply_analyzer(0, "binary_results/openml_2/results_rowwise2.csv", convert_to_text_rowwise2, data_openml_2)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [30]:
#only execute first time
# Seed the results file with the transposed label table (this overwrites an existing file),
# then run Method 2 on rows 0-99 of the medical data.
results_rowwise25 = labels_medical.copy()
results_rowwise25.to_csv("binary_results/medical/results_rowwise2.csv", index=False)

apply_analyzer(0, "binary_results/medical/results_rowwise2.csv", convert_to_text_rowwise2, data_medical)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [31]:
# Seed the results file with the transposed label table (this overwrites an existing file),
# then run Method 2 on rows 0-199 of the MIMIC data (num_samp=200).
results_rowwise1mim2 = labels_mimic.copy()
results_rowwise1mim2.to_csv("binary_results/mimicIII/results_rowwise2.csv", index=False)

apply_analyzer(0, "binary_results/mimicIII/results_rowwise2.csv", convert_to_text_rowwise2, data_mimic, 200)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199


In [32]:
# Seed the results file with the transposed label table (this overwrites an existing file),
# then run Method 2 on rows 0-99 of the second MIMIC export.
results_rowwise2mimnew = labels_mimicNew.copy()
results_rowwise2mimnew.to_csv("binary_results/MIMICNew/results_rowwise2.csv", index=False)

apply_analyzer(0, "binary_results/MIMICNew/results_rowwise2.csv", convert_to_text_rowwise2, data_mimicNew)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [33]:
# Seed the results file with the transposed label table (this overwrites an existing file),
# then run Method 2 on rows 0-99 of the OpenMLAll data.
results_rowwise2openall = labels_openmlall.copy()
results_rowwise2openall.to_csv("binary_results/OpenMLall/results_rowwise2.csv", index=False)

apply_analyzer(0, "binary_results/OpenMLall/results_rowwise2.csv", convert_to_text_rowwise2, data_openmlall)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


Method 3: 
- Input: Column-wise approach: the column name and all cell values of the column are passed to the analyzer; the delimiters are `: ` and `, `.  
Example: "0: Email, Email, Email, Email, ..."
- Output: All detected classes can be matched to the analyzed column. The string positions are not needed for this, but they are saved anyway for the cell values.

In [36]:
def convert_to_text_columnwise(df, column, n_rows=100):
    """Serialize the first ``n_rows`` values of one column as ``"<column>: v0, v1, ..., "``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to read from; rows are looked up by the labels ``0 .. n_rows - 1``.
    column : label
        Column to serialize; its name becomes the prefix of the text.
    n_rows : int, default 100
        Number of cell values to include.

    Returns
    -------
    (str, pandas.Series)
        The text, and an object Series indexed ``0 .. n_rows - 1`` whose entries are
        the inclusive ``(start, end)`` character span of each ``"<value>, "`` entry.
        The ``"<column>: "`` prefix is not covered by any span.

    Raises
    ------
    KeyError
        If one of the row labels ``0 .. n_rows - 1`` is missing from ``df``.
    """
    prefix = str(column) + ": "
    parts = [prefix]
    positions = []
    end_index = len(prefix) - 1     # the first value starts right after the prefix
    for r in range(n_rows):
        add_string = str(df.loc[r, column]) + ", "
        parts.append(add_string)
        start_index = end_index + 1
        end_index = start_index + len(add_string) - 1
        positions.append((start_index, end_index))
    # Build the Series once instead of enlarging it row by row with .loc.
    string_position = pd.Series(positions, index=range(n_rows), dtype="object")
    return "".join(parts), string_position

In [37]:
def apply_analyzer_columnwise(a, file_path, dessi):                  # a is the column where the analysis starts (lets a run be resumed after a kernel crash)
    """Run the Presidio analyzer column by column and record the detections per cell.

    Parameters
    ----------
    a : int
        Positional index of the first column to analyse; columns ``a`` up to the
        last column of the results file are processed.
    file_path : str
        Results CSV; read at the start, rewritten every 100 columns and at the end.
    dessi : pandas.DataFrame
        Data table; column ``i`` of it is analysed for results column ``i``.

    Results layout: for cell value ``r`` of a column, results row ``2*r+1`` holds
    its character span and results row ``2*r+2`` holds a list of
    ``"<ENTITY_TYPE>_<score>"`` strings. Detections that reach beyond one span are
    prefixed with ``"shared "`` and added to every cell value they touch.
    """
    results = pd.read_csv(file_path)
    analyzer = AnalyzerEngine(supported_languages=["en", "hr"])               # other nlp_engine possible, support english and croatian
    for i in range(a,results.shape[1]):
        print(i)
        column = dessi.columns[i]
        dessi_string, string_position = convert_to_text_columnwise(dessi, column)
        # Store the span of cell value r in the odd results row 2*r+1 (rows 1..199).
        # NOTE(review): (j-1)/2 is a float; this relies on pandas accepting an
        # integer-valued float as a label - confirm, or use integer division.
        for j in range(1,201,2):
            results.iloc[j, i] = string_position[(j-1)/2]
        results_analyzer = analyzer.analyze(text=dessi_string, language="en")                # multiple languages could be a problem: analyze() accepts only one language

        # Process detections left to right so `row` only ever moves forward.
        results_analyzer.sort(key=lambda x: x.start)
        row = 0
        for result in results_analyzer:
            result_tuple = (result.start, result.end - 1)               # result.end gives the next character after the entity
            string_tuple = results.iloc[row*2+1, i]
            while result_tuple[0] > string_tuple[1]:                    # detection starts after this cell value's span: advance to the next cell value
                row += 1
                string_tuple = results.iloc[row*2+1, i]
            if  string_tuple[0] <= result_tuple[0] and result_tuple[1] <= string_tuple[1]:      # detection lies completely inside one cell value's span
                # The stored cell is converted to text and parsed back into a list
                # before the new detection is appended.
                results.iloc[row*2+2, i] = ast.literal_eval(str(results.iloc[row*2+2, i])) + [str(result.entity_type) + "_" + str(result.score)]
            else:
                counter = 0
                while True:
                    results.iloc[(row+counter)*2+2, i] = ast.literal_eval(str(results.iloc[(row+counter)*2+2, i])) + ["shared " + str(result.entity_type) + "_" + str(result.score)]             # detection stretches over several cell values
                    counter += 1
                    # Stop after the last of the 100 cell values ...
                    if row+counter > 99:
                            break
                    string_tuple = results.iloc[(row+counter)*2+1, i]
                    # ... or as soon as the next cell value starts after the detection ends.
                    if result_tuple[1] < string_tuple[0]:
                        break
        # Checkpoint every 100 columns so that an interrupted run can be resumed via `a`.
        if i % 100 == 0:
            results.to_csv(file_path, index=False)
    results.to_csv(file_path, index=False)
            

In [38]:
# only execute first time
# Build the results file: the transposed label table followed by 200 rows of empty lists
# (two rows per analysed cell value: span and detections), then run Method 3 on the
# test_languages data.
results_columnwise1 = labels_languages.copy()
df_add = pd.DataFrame([[[] for _ in range(labels_languages.shape[1])] for _ in range(200)])
df_add.columns = results_columnwise1.columns
results_columnwise1 = pd.concat([results_columnwise1, df_add]).reset_index(drop=True)
results_columnwise1.to_csv("./binary_results/test_languages_res/results_columnwise.csv", index=False)

apply_analyzer_columnwise(0, "./binary_results/test_languages_res/results_columnwise.csv", data_languages)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [39]:
# only execute first time
# Build the results file: the transposed label table followed by 200 rows of empty lists
# (two rows per analysed cell value: span and detections), then run Method 3 on the
# Kaggle data.
results_columnwise2 = labels_kaggle.copy()
df_add = pd.DataFrame([[[] for _ in range(labels_kaggle.shape[1])] for _ in range(200)])
df_add.columns = results_columnwise2.columns
results_columnwise2 = pd.concat([results_columnwise2, df_add]).reset_index(drop=True)
results_columnwise2.to_csv("./binary_results/kaggle/results_columnwise.csv", index=False)

apply_analyzer_columnwise(0, "./binary_results/kaggle/results_columnwise.csv", data_kaggle)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245


In [40]:
# only execute first time
# Build the results file: the transposed label table followed by 200 rows of empty lists
# (two rows per analysed cell value: span and detections), then run Method 3 on the
# OpenML data.
results_columnwise3 = labels_openml.copy()
df_add = pd.DataFrame([[[] for _ in range(labels_openml.shape[1])] for _ in range(200)])
df_add.columns = results_columnwise3.columns
results_columnwise3 = pd.concat([results_columnwise3, df_add]).reset_index(drop=True)
results_columnwise3.to_csv("./binary_results/openml/results_columnwise.csv", index=False)

apply_analyzer_columnwise(0, "./binary_results/openml/results_columnwise.csv", data_openml)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141


In [41]:
# only execute first time
# Build the results file: the transposed label table followed by 200 rows of empty lists
# (two rows per analysed cell value: span and detections), then run Method 3 on the
# second OpenML data set.
results_columnwise4 = labels_openml_2.copy()
df_add = pd.DataFrame([[[] for _ in range(labels_openml_2.shape[1])] for _ in range(200)])
df_add.columns = results_columnwise4.columns
results_columnwise4 = pd.concat([results_columnwise4, df_add]).reset_index(drop=True)
results_columnwise4.to_csv("./binary_results/openml_2/results_columnwise.csv", index=False)

apply_analyzer_columnwise(0, "./binary_results/openml_2/results_columnwise.csv", data_openml_2)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115


In [42]:
# only execute first time
# Build the results file: the transposed label table followed by 200 rows of empty lists
# (two rows per analysed cell value: span and detections), then run Method 3 on the
# medical data.
results_columnwise5 = labels_medical.copy()
df_add = pd.DataFrame([[[] for _ in range(labels_medical.shape[1])] for _ in range(200)])
df_add.columns = results_columnwise5.columns
results_columnwise5 = pd.concat([results_columnwise5, df_add]).reset_index(drop=True)
results_columnwise5.to_csv("./binary_results/medical/results_columnwise.csv", index=False)

apply_analyzer_columnwise(0, "./binary_results/medical/results_columnwise.csv", data_medical)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31


In [43]:
# only execute first time
# Build the results file: the transposed label table followed by 200 rows of empty lists
# (two rows per analysed cell value: span and detections), then run Method 3 on the
# MIMIC data.
results_columnwisemim = labels_mimic.copy()
df_addmim = pd.DataFrame([[[] for _ in range(labels_mimic.shape[1])] for _ in range(200)])
df_addmim.columns = results_columnwisemim.columns
results_columnwisemim = pd.concat([results_columnwisemim, df_addmim]).reset_index(drop=True)
results_columnwisemim.to_csv("./binary_results/mimicIII/results_columnwise.csv", index=False)

apply_analyzer_columnwise(0, "./binary_results/mimicIII/results_columnwise.csv", data_mimic)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162


In [49]:
# only execute first time
# Build the results file: the transposed label table followed by 200 rows of empty lists
# (two rows per analysed cell value: span and detections), then run Method 3 on the
# second MIMIC export.
results_columnwisemimNew = labels_mimicNew.copy()
df_addmim = pd.DataFrame([[[] for _ in range(labels_mimicNew.shape[1])] for _ in range(200)])
df_addmim.columns = results_columnwisemimNew.columns
results_columnwisemimNew = pd.concat([results_columnwisemimNew, df_addmim]).reset_index(drop=True)
results_columnwisemimNew.to_csv("./binary_results/MIMICNew/results_columnwise.csv", index=False)

apply_analyzer_columnwise(0, "./binary_results/MIMICNew/results_columnwise.csv", data_mimicNew)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162


In [51]:
# labels_openmlall

In [52]:
# only execute first time
# Build the results file: the transposed label table followed by 200 rows of empty lists
# (two rows per analysed cell value: span and detections), then run Method 3 on the
# OpenMLAll data.
results_columnwiseopenall = labels_openmlall.copy()
df_addmim = pd.DataFrame([[[] for _ in range(labels_openmlall.shape[1])] for _ in range(200)])
df_addmim.columns = results_columnwiseopenall.columns
results_columnwiseopenall = pd.concat([results_columnwiseopenall, df_addmim]).reset_index(drop=True)
results_columnwiseopenall.to_csv("./binary_results/OpenMLall/results_columnwise.csv", index=False)

apply_analyzer_columnwise(0, "./binary_results/OpenMLall/results_columnwise.csv", data_openmlall)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257


# Presidio structured

In [54]:
def apply_presidio_structured(data, path, selection_strategy="most_common", a=0, numsamp=100):
    """Run Presidio's structured analysis over ``data`` in column batches and save the mapping.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns are analysed.
    path : str
        JSON file that receives the accumulated entity mapping; it is rewritten
        after every batch.
    selection_strategy : str, default "most_common"
        Passed through to ``PandasAnalysisBuilder.generate_analysis``.
    a : int, default 0
        Positional index of the first column to analyse. If non-zero, the mapping
        already stored at ``path`` is loaded first and extended.
    numsamp : int, default 100
        Number of columns analysed per batch.

    Returns
    -------
    None
        The result is only written to ``path``.
    """
    if a != 0:
        # Resuming: continue from the mapping saved by the earlier run.
        with open(path, "rb") as file:
            tabular_analysis = json.load(file)
    else:
        tabular_analysis = dict()
    for i in range(a, data.shape[1], numsamp):
        print(i)
        # The slice width must equal the loop step. A fixed width of 100 skipped
        # columns for numsamp > 100 and re-analysed columns for numsamp < 100.
        tabular_analysis_partly = PandasAnalysisBuilder().generate_analysis(
            data.iloc[:, i:i + numsamp], selection_strategy=selection_strategy
        )
        tabular_analysis = tabular_analysis | tabular_analysis_partly.entity_mapping
        # Persist after every batch so that an interrupted run can be resumed via `a`.
        with open(path, "w") as file:
            json.dump(tabular_analysis, file)

In [55]:
# Entity mapping for the test_languages data with the default "most_common" strategy, saved as JSON.
apply_presidio_structured(data_languages, 'binary_results/test_languages_res/most_common.json')

0


In [56]:
# Entity mapping for the Kaggle data with the default "most_common" strategy, saved as JSON.
apply_presidio_structured(data_kaggle, 'binary_results/kaggle/most_common.json')

0
100
200


In [57]:
# Entity mapping for the OpenML data with the default "most_common" strategy, saved as JSON.
apply_presidio_structured(data_openml, 'binary_results/openml/most_common.json')

0
100


In [58]:
# Entity mapping for the second OpenML data set with the default "most_common" strategy, saved as JSON.
apply_presidio_structured(data_openml_2, 'binary_results/openml_2/most_common.json')

0
100


In [59]:
# Entity mapping for the medical data with the default "most_common" strategy, saved as JSON.
apply_presidio_structured(data_medical, 'binary_results/medical/most_common.json')

0


In [60]:
# Entity mapping for the MIMIC data with the default "most_common" strategy, saved as JSON.
apply_presidio_structured(data_mimic, 'binary_results/mimicIII/most_common.json')

0
100


In [61]:
# Entity mapping for the second MIMIC export with the default "most_common" strategy, saved as JSON.
apply_presidio_structured(data_mimicNew, 'binary_results/MIMICNew/most_common.json')

0
100


In [63]:
# Entity mapping for the OpenMLAll data with the default "most_common" strategy, saved as JSON.
apply_presidio_structured(data_openmlall, 'binary_results/OpenMLall/most_common.json')

0
100
200


In [64]:
# Entity mapping for the test_languages data with the "highest_confidence" strategy, saved as JSON.
apply_presidio_structured(data_languages, 'binary_results/test_languages_res/highest_confidence.json', selection_strategy="highest_confidence")

0


In [65]:
# Entity mapping for the Kaggle data with the "highest_confidence" strategy, saved as JSON.
apply_presidio_structured(data_kaggle, 'binary_results/kaggle/highest_confidence.json', selection_strategy="highest_confidence")

0
100
200


In [66]:
# Entity mapping for the OpenML data with the "highest_confidence" strategy, saved as JSON.
apply_presidio_structured(data_openml, 'binary_results/openml/highest_confidence.json', selection_strategy="highest_confidence")

0
100


In [67]:
# Entity mapping for the second OpenML data set with the "highest_confidence" strategy, saved as JSON.
apply_presidio_structured(data_openml_2, 'binary_results/openml_2/highest_confidence.json', selection_strategy="highest_confidence")

0
100


In [68]:
# Entity mapping for the medical data with the "highest_confidence" strategy, saved as JSON.
apply_presidio_structured(data_medical, 'binary_results/medical/highest_confidence.json', selection_strategy="highest_confidence")

0


In [69]:
# Entity mapping for the MIMIC data with the "highest_confidence" strategy, saved as JSON.
apply_presidio_structured(data_mimic, 'binary_results/mimicIII/highest_confidence.json',selection_strategy="highest_confidence")

0
100


In [70]:
# Entity mapping for the second MIMIC export with the "highest_confidence" strategy, saved as JSON.
apply_presidio_structured(data_mimicNew, 'binary_results/MIMICNew/highest_confidence.json',selection_strategy="highest_confidence")

0
100


In [71]:
# Entity mapping for the OpenMLAll data with the "highest_confidence" strategy, saved as JSON.
apply_presidio_structured(data_openmlall, 'binary_results/OpenMLall/highest_confidence.json',selection_strategy="highest_confidence")

0
100
200
