In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the two semicolon-separated datasets, supplying the column names
# explicitly through `names`.
hospital_data = pd.read_csv(
    "hospital_data.csv",
    sep=';',
    names=['sex', 'race', 'age', 'disease'],
)
citizens_list = pd.read_csv(
    "citizens_list.csv",
    sep=';',
    names=['name', 'sex', 'race', 'age'],
)

In [3]:
hospital_data

Unnamed: 0,sex,race,age,disease
0,female,White: Irish,87,'Influenza & pneumonia'
1,female,White: Other,85,'Other diseases of the urinary system'
2,female,Black or Black British: African,77,'Cerebrovascular diseases'
3,female,White: British,65,'Ischaemic heart diseases'
4,male,White: British,50,'Other diseases of intestines'
...,...,...,...,...
9995,female,White: British,34,'Symptoms & signs inv. the digestive system & ...
9996,female,White: British,88,'Influenza & pneumonia'
9997,male,White: British,71,'In situ & benign neoplasms and others of unce...
9998,female,White: British,88,'General symptoms & signs'


In [4]:
citizens_list

Unnamed: 0,name,sex,race,age
0,Aaron Bilger,male,White: Irish,85
1,Aaron Erwin,male,Other: Arab,87
2,Aaron Ferrara,male,White: British,82
3,Aaron James,male,White: British,4
4,Aaron Jolin,male,Mixed: White and Asian,77
...,...,...,...,...
9995,Zane Koeppen,male,White: British,76
9996,Zane Pitman,male,White: British,28
9997,Zella Barnett,female,Black or Black British: Caribbean,70
9998,Zenaida Mccomb,female,White: Other,56


In [5]:
# Select the hospital records that match one quasi-identifier combination:
# male, aged exactly 70, race "White: British".
is_male = hospital_data['sex'] == "male"
is_seventy = hospital_data['age'] == 70
is_white_british = hospital_data['race'] == "White: British"
filtered_hd = hospital_data[is_male & is_seventy & is_white_british]

# Only the last expression of a cell is rendered, so the frame itself is the
# single displayed value (inspect `filtered_hd.shape` in its own cell).
filtered_hd

Unnamed: 0,sex,race,age,disease
188,male,White: British,70,"'Diseases of oesophagus, stomach & duodenum'"
645,male,White: British,70,'Other diseases of intestines'
758,male,White: British,70,'In situ & benign neoplasms and others of unce...
803,male,White: British,70,'Other forms of heart disease'
952,male,White: British,70,'Other forms of heart disease'
1408,male,White: British,70,'Arthropathies'
1449,male,White: British,70,'Malignant neoplsm. of thyroid and oth. endo. ...
1668,male,White: British,70,'Arthropathies'
3414,male,White: British,70,'Symptoms & signs inv. the digestive system & ...
3727,male,White: British,70,'Dorsopathies'


### Ερώτημα 3α

In [6]:
# Fraction of the filtered records that carries each disease, keyed by
# disease in order of first appearance.
all_diseases = filtered_hd['disease'].unique()
total_count = filtered_hd.shape[0]
results = {
    disease: len(filtered_hd.loc[filtered_hd['disease'] == disease]) / total_count
    for disease in all_diseases
}

# One row per disease, with the fraction in a single column.
pd.DataFrame.from_dict(results, orient='index')

Unnamed: 0,0
"'Diseases of oesophagus, stomach & duodenum'",0.0625
'Other diseases of intestines',0.03125
'In situ & benign neoplasms and others of uncertainty',0.03125
'Other forms of heart disease',0.15625
'Arthropathies',0.25
'Malignant neoplsm. of thyroid and oth. endo. Glands etc',0.03125
'Symptoms & signs inv. the digestive system & abdomen',0.03125
'Dorsopathies',0.03125
'Ischaemic heart diseases',0.125
'Malignant neoplasm of digestive organs',0.03125


### Ερώτημα 3β

In [7]:
# Number of non-null entries per remaining column for every
# (sex, age, race) combination found in the hospital data.
group_keys = ['sex', 'age', 'race']
counted = hospital_data.groupby(group_keys).count()

# Keep the combinations that have exactly one disease entry.
unique_disease = counted.loc[counted['disease'].eq(1)]

In [8]:
# For every (sex, age, race) combination in `unique_disease`, collect the
# citizens that share exactly those three attribute values.
results = [
    citizens_list[
        (citizens_list['sex'] == sex)
        & (citizens_list['age'] == age)
        & (citizens_list['race'] == race)
    ]
    for sex, age, race in unique_disease.index
]

# NOTE(review): len(results) is the number of combinations looked up; each
# entry is a frame that could hold zero or several citizens - confirm that
# this is the figure the question asks for.
print("Άτομα που επισκέφθηκαν το νοσοκομείο για συγκεκριμένο λόγο:", len(results))
print()
print("Παρακάτω εμφανίζονται τα πρώτα 10 άτομα")
for matches in results[:10]:
    print(matches)

Άτομα που επισκέφθηκαν το νοσοκομείο για συγκεκριμένο λόγο: 708

Παρακάτω εμφανίζονται τα πρώτα 10 άτομα
             name     sex                             race  age
6018  Lori Demark  female  Asian or Asian British: Chinese    0
               name     sex                              race  age
9406  Tina Williams  female  Mixed: White and Black Caribbean    0
               name     sex                           race  age
3702  Hilda Rubarts  female  Other: Any other ethnic group    0
                 name     sex         race  age
1697  Consuelo Dufour  female  Other: Arab    0
                 name     sex          race  age
9083  Sylvia Williams  female  White: Irish    0
                 name     sex                             race  age
3780  Iris Pleasanton  female  Asian or Asian British: Chinese    1
                 name     sex                             race  age
7843  Renee Broussard  female  Asian or Asian British: Chinese    2
                name     sex           

### Ερώτημα 3α Generalization

In [9]:
def compute_k_anonymity(df: pd.DataFrame, qid: list[str]) -> int:
    """Return the k-anonymity level of ``df`` for the quasi-identifiers ``qid``.

    k is the size of the smallest group of rows that share the same values
    in all ``qid`` columns.

    Parameters
    ----------
    df : pd.DataFrame
        Table to evaluate.
    qid : list[str]
        Names of the quasi-identifier columns to group by.

    Returns
    -------
    int
        Size of the smallest equivalence class, or 0 when there are no groups
        (for example an empty frame).
    """
    # size() counts rows per group: it needs no particular non-key column to
    # exist and, unlike count(), does not skip rows whose other values are NaN.
    group_sizes = df.groupby(qid).size()
    if group_sizes.empty:
        return 0
    return int(group_sizes.min())

In [11]:
def compute_l_diversity(df: pd.DataFrame, qid: list[str], sensitive_col: str) -> int:
    """Return the distinct l-diversity of ``df``.

    Rows are grouped by the quasi-identifier columns ``qid``; the result is
    the smallest number of distinct ``sensitive_col`` values found in any
    single group.
    """
    return min(
        members[sensitive_col].nunique()
        for _, members in df.groupby(qid)
    )

In [13]:
def compute_l_div_entropy(df: pd.DataFrame, qid: list[str], sensitive_col: str) -> float:
    """Return the entropy-based l-diversity of ``df``.

    For each group of rows sharing the same ``qid`` values, the base-10
    entropy of the ``sensitive_col`` distribution is computed and then raised
    as a power of 10. The smallest such value over all groups is returned.
    """

    def _exponentiated_entropy(values: pd.Series):
        # Occurrences of every distinct sensitive value inside one group.
        frequencies = values.value_counts()
        n_records = frequencies.sum()
        entropy = 0
        for occurrences in frequencies:
            share = occurrences / n_records
            entropy -= share * np.log10(share)
        # 10 ** entropy undoes the base-10 logarithm used above.
        return np.power(10, entropy)

    return min(
        _exponentiated_entropy(members[sensitive_col])
        for _, members in df.groupby(qid)
    )

In [27]:
# Quasi-identifier columns used by every privacy metric in this notebook.
qid = ['age', 'sex', 'race']

# Metrics of the raw (non-generalized) hospital data, computed and printed
# one at a time.
k_value = compute_k_anonymity(hospital_data, qid)
print('k-anonymity:', k_value)
l_value = compute_l_diversity(hospital_data, qid, 'disease')
print('l-diversity:', l_value)
entropy_value = compute_l_div_entropy(hospital_data, qid, 'disease')
print('Entropy:', entropy_value)

k-anonymity: 1
l-diversity: 1
Entropy: 1.0


### Ερώτημα 3β Generalization

In [15]:
# Work on an independent (deep) copy so the generalization steps below
# leave `hospital_data` untouched.
gen_data = hospital_data.copy(deep=True)

#### Έλεγχος τιμών που υπάρχουν στον πίνακα

In [16]:
# Print every distinct race label in sorted order, one per line.
races = gen_data['race'].unique()
for race_label in sorted(races):
    print(race_label)

Asian or Asian British: Bangladeshi
Asian or Asian British: Chinese
Asian or Asian British: Indian
Asian or Asian British: Other Asian
Asian or Asian British: Pakistani
Black or Black British: African
Black or Black British: Caribbean
Black or Black British: Other Black
Mixed: Other Mixed
Mixed: White and Asian
Mixed: White and Black African
Mixed: White and Black Caribbean
Other: Any other ethnic group
Other: Arab
White: British
White: Gypsy or Irish Traveller
White: Irish
White: Other


In [17]:
# Smallest and largest age present in the data.
ages = gen_data['age'].unique()
youngest, oldest = ages.min(), ages.max()
print(youngest, oldest)

0 89


### Γενίκευση ηλικίας στις εξής ομάδες `[0,15), [15,30), [30,45), [45,60), [60,75), [75,90)`

In [18]:
# Start again from a fresh copy of the raw data.
gen_data = hospital_data.copy()

# astype() returns a new frame rather than converting in place, so the
# result has to be assigned back for the cast to take effect.
gen_data = gen_data.astype({'age': 'int'})
gen_data.dtypes

sex        object
race       object
age         int32
disease    object
dtype: object

In [19]:
# Replace each age by the lower bound of its 15-year bin
# ([0,15), [15,30), ..., [75,90)). Anything below the first bin collapses to
# 0 and anything from 75 upwards collapses to 75.
bin_width = 15
bin_floor = gen_data['age'] // bin_width * bin_width
gen_data['age'] = bin_floor.clip(lower=0, upper=75)

# Distinct bin labels that are now present.
gen_data['age'].unique()

array([75, 60, 45,  0, 15, 30], dtype=int64)

In [20]:
# Privacy metrics after first-level age generalization, computed and printed
# one at a time.
k_value = compute_k_anonymity(gen_data, qid)
print('k-anonymity:', k_value)
l_value = compute_l_diversity(gen_data, qid, 'disease')
print('l-diversity:', l_value)
entropy_value = compute_l_div_entropy(gen_data, qid, 'disease')
print('Entropy:', entropy_value)

k-anonymity: 1
l-diversity: 1
Entropy: 1.0


### Γενίκευση φυλής με πρώτο επίπεδο ηλικίας

In [21]:
# Start again from a fresh copy of the raw data.
gen_data = hospital_data.copy()

# Collapse every detailed ethnicity to its top-level category. The
# substitution is applied to the 'race' column only: a frame-wide regex
# replace would also rewrite any other string column whose value happens to
# start with one of these words.
race_generalization = {
    r'^White.*$': 'White',
    r'^Asian.*$': 'Asian',
    r'^Black.*$': 'Black',
    r'^Mixed.*$': 'Mixed',
    r'^Other.*$': 'Other',
}
gen_data['race'] = gen_data['race'].replace(regex=race_generalization)

# First-level age generalization: label each age with the lower bound of its
# 15-year bin; everything from 75 upwards shares the last bin.
gen_data.loc[gen_data['age'] < 15, 'age'] = 0
gen_data.loc[(gen_data['age'] < 30) & (gen_data['age'] >= 15), 'age'] = 15
gen_data.loc[(gen_data['age'] < 45) & (gen_data['age'] >= 30), 'age'] = 30
gen_data.loc[(gen_data['age'] < 60) & (gen_data['age'] >= 45), 'age'] = 45
gen_data.loc[(gen_data['age'] < 75) & (gen_data['age'] >= 60), 'age'] = 60
gen_data.loc[gen_data['age'] >= 75, 'age'] = 75

# Show the race categories that remain after generalization.
races = gen_data['race'].unique()
for i in np.sort(races):
    print(i)

Asian
Black
Mixed
Other
White


In [22]:
# Privacy metrics after generalizing both race and age (first level),
# computed and printed one at a time.
k_value = compute_k_anonymity(gen_data, qid)
print('k-anonymity:', k_value)
l_value = compute_l_diversity(gen_data, qid, 'disease')
print('l-diversity:', l_value)
entropy_value = compute_l_div_entropy(gen_data, qid, 'disease')
print('Entropy:', entropy_value)

k-anonymity: 8
l-diversity: 2
Entropy: 1.4575692649810903


### Γενίκευση ηλικίας σε δεύτερο επίπεδο

In [23]:
# Start again from a fresh copy of the raw data.
gen_data = hospital_data.copy()

# Second-level age generalization: two bins, labelled 0 (below 45) and
# 45 (45 and above). Both masks are taken from the raw ages before any
# value is overwritten.
below_45 = gen_data['age'] < 45
from_45_up = gen_data['age'] >= 45
gen_data.loc[below_45, 'age'] = 0
gen_data.loc[from_45_up, 'age'] = 45

# Distinct bin labels that are now present.
gen_data['age'].unique()

array([45,  0], dtype=int64)

In [24]:
# Privacy metrics after second-level age generalization, computed and
# printed one at a time.
k_value = compute_k_anonymity(gen_data, qid)
print('k-anonymity:', k_value)
l_value = compute_l_diversity(gen_data, qid, 'disease')
print('l-diversity:', l_value)
entropy_value = compute_l_div_entropy(gen_data, qid, 'disease')
print('Entropy:', entropy_value)

k-anonymity: 1
l-diversity: 1
Entropy: 1.0


### Γενίκευση ηλικίας και φυλής σε δεύτερο επίπεδο

In [25]:
# Start again from a fresh copy of the raw data.
gen_data = hospital_data.copy()

# Second-level age generalization: two bins, labelled 0 (below 45) and
# 45 (45 and above).
gen_data.loc[gen_data['age'] < 45, 'age'] = 0
gen_data.loc[gen_data['age'] >= 45, 'age'] = 45

# Collapse every detailed ethnicity to its top-level category. The
# substitution is applied to the 'race' column only: a frame-wide regex
# replace would also rewrite any other string column whose value happens to
# start with one of these words.
race_generalization = {
    r'^White.*$': 'White',
    r'^Asian.*$': 'Asian',
    r'^Black.*$': 'Black',
    r'^Mixed.*$': 'Mixed',
    r'^Other.*$': 'Other',
}
gen_data['race'] = gen_data['race'].replace(regex=race_generalization)

In [26]:
# Privacy metrics after generalizing both age (second level) and race,
# computed and printed one at a time.
k_value = compute_k_anonymity(gen_data, qid)
print('k-anonymity:', k_value)
l_value = compute_l_diversity(gen_data, qid, 'disease')
print('l-diversity:', l_value)
entropy_value = compute_l_div_entropy(gen_data, qid, 'disease')
print('Entropy:', entropy_value)

k-anonymity: 34
l-diversity: 8
Entropy: 3.628988590913454
