In [559]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm


to_run = True

In [560]:
#Loading the data
path = "C:/Users/abels/ITU/Privacy/Final_Proj/data/"
results = pd.read_excel(path+"public_data_resultsG.xlsx")
public =  pd.read_excel(path+"public_data_registerG.xlsx")
private =  pd.read_excel(path+"private_dataG.xlsx")

In [561]:
public.head()

Unnamed: 0,name,sex,dob,zip,citizenship,marital_status,last_voted
0,"Zeng, Darren",Male,1998-04-28,2200,Denmark,Never married,2
1,"Mian, Lauren",Female,2003-11-01,2100,Denmark,Never married,2
2,"Shumpert, Chieloka",Female,1998-10-07,2300,Denmark,Never married,1
3,"Martinez, Joslyn",Female,2004-02-15,2200,Denmark,Never married,1
4,"Lewis, Sydney",Female,2000-04-20,2200,Bulgaria,Never married,1


In [562]:
private.head()

Unnamed: 0,name,sex,evote,dob,zip,education,citizenship,marital_status,party
0,"Lewis, Samantha",Female,1,1975-07-30,2300,Vocational Education and Training (VET),Denmark,Married/separated,Green
1,"Moon, Ethan",Male,0,1937-10-21,2200,Primary education,Denmark,Widowed,Green
2,"Hoskins, Dawit",Male,0,1973-11-05,2100,Vocational Education and Training (VET),Denmark,Married/separated,Red
3,"Lowe, Brandon",Male,0,1977-04-19,2200,Masters programmes,Denmark,Married/separated,Red
4,"Sakamoto, Bruce",Male,1,1991-03-23,2200,Masters programmes,Ukraine,Married/separated,Green


In [563]:
results.head()

Unnamed: 0.1,Unnamed: 0,Red,Green,Invalid ballots,Total
0,Polling station: ZIP 2100,19,85,3,107
1,Polling station: ZIP 2200,42,127,3,172
2,Polling station: ZIP 2300,87,84,6,177
3,Polling station: ZIP 2400,75,148,6,229
4,E-votes,136,243,10,389


In [564]:
#preparation of survey data
survey = private.groupby(["evote","party"]).size().reset_index(name="count")

data = {
    "evote" : survey["evote"].values,
    "party" : survey["party"].values,
    "count" : survey["count"].values
}
df = pd.DataFrame(data)
df_survey = df.pivot_table(index ="evote",columns="party",values="count",fill_value=0)
df_survey["Total"] = df_survey.sum(axis=1)
df_survey = df_survey.reset_index().rename_axis(None, axis=1)
df_survey = df_survey[['evote', 'Red', 'Green', 'Invalid vote', 'Total']]
totals = df_survey[['Red', 'Green', 'Invalid vote', 'Total']].sum()
# totals["evote"] = "Total"
# df_survey = pd.concat([df_survey,totals], ignore_index=True)
df_survey.loc["column_total"] = df_survey.sum(axis=0)
df_survey.head()



Unnamed: 0,evote,Red,Green,Invalid vote,Total
0,0.0,44.0,79.0,4.0,127.0
1,1.0,31.0,38.0,4.0,73.0
column_total,1.0,75.0,117.0,8.0,200.0


In [565]:
#preparation of results

if to_run:
    results.loc["poll_votes"] = results.iloc[0:4,1:].sum()

    results.iloc[6,0] ="poll_votes"
    results = results.iloc[[4,5,6]]
    results = results.iloc[[0,2,1]]
    results = results.reset_index()
    results = results[["Unnamed: 0",	"Red",	"Green",	"Invalid ballots",	"Total"]]
    to_run = False
results

Unnamed: 0.1,Unnamed: 0,Red,Green,Invalid ballots,Total
0,E-votes,136.0,243.0,10.0,389.0
1,poll_votes,223.0,444.0,18.0,685.0
2,Total,359.0,687.0,28.0,1074.0


In [566]:
df_survey.head()

Unnamed: 0,evote,Red,Green,Invalid vote,Total
0,0.0,44.0,79.0,4.0,127.0
1,1.0,31.0,38.0,4.0,73.0
column_total,1.0,75.0,117.0,8.0,200.0


(A) Is there a significant difference between the political preferences as expressed in the survey and the election results for both electronic and polling station votes?

In [567]:
# TODO
"""
    scale down results by 200/1074 DONE
    get expectations for results df
    do chi square of independence between the two groups
"""
scaler = float(results.iloc[-1,-1]/df_survey.iloc[-1,-1])
for i in range(results.shape[0]):
    for j in range(1,5):
        results.iloc[i,j] = results.iloc[i,j] / scaler      #Rescaling results to same size as survey
results

Unnamed: 0.1,Unnamed: 0,Red,Green,Invalid ballots,Total
0,E-votes,25.325885,45.251397,1.862197,72.439479
1,poll_votes,41.527002,82.681564,3.351955,127.560521
2,Total,66.852886,127.932961,5.214153,200.0


In [568]:
#get expectations for results df
expected_results = results
for i in range(1,4):
    for j in range(0,2):
        grand_total = expected_results.iloc[2,4]
        row_total = expected_results.iloc[j,4]
        column_total = expected_results.iloc[2,i]
        expectation = (row_total*column_total)/grand_total
        expected_results.iloc[j,i] = expectation
expected_results = expected_results.reindex([1,0,2])

        
# expected_results.iloc[j,4] row total
# expected_results.iloc[2,i] column total
expected_results

Unnamed: 0.1,Unnamed: 0,Red,Green,Invalid ballots,Total
1,poll_votes,42.638945,81.595976,3.3256,127.560521
0,E-votes,24.213941,46.336985,1.888553,72.439479
2,Total,66.852886,127.932961,5.214153,200.0


In [569]:
df_survey

Unnamed: 0,evote,Red,Green,Invalid vote,Total
0,0.0,44.0,79.0,4.0,127.0
1,1.0,31.0,38.0,4.0,73.0
column_total,1.0,75.0,117.0,8.0,200.0


In [570]:
df_chi = pd.DataFrame([[1,2,3],[4,5,6]], index = ["poll","evote"],columns=["Red","Green","Invalid"])
for i in range(1,4):
    for j in range(0,2):
        Eji = expected_results.iloc[j,i]
        Oji = df_survey.iloc[j,i]
        val = ((Oji-Eji)**2)/Eji
        df_chi.iloc[j,i-1] = val

df_chi

  df_chi.iloc[j,i-1] = val
  df_chi.iloc[j,i-1] = val
  df_chi.iloc[j,i-1] = val


Unnamed: 0,Red,Green,Invalid
poll,0.043445,0.082591,0.136762
evote,1.901822,1.499997,2.360649


In [571]:
#testing of results
df_survey.iloc[1,1]
# df_chi.iloc[0,3]
# expected_results.iloc[0,4]

((4-1.888553)**2)/1.888553

2.3606477730881794

In [572]:
df_chi = df_chi[["Red","Green"]] 
print(df_chi) 
row_sums = df_chi.sum(axis=1)
row_sums
degree_of_freedom = 1


            Red     Green
poll   0.043445  0.082591
evote  1.901822  1.499997


(A) Is there a significant difference between the political preferences as expressed in the survey and the election results for both electronic and polling station votes?

with significance level $\alpha$ = 0.05, the critical threshhold for a chi square test is 3.841.

For the poll votes we find no significant difference between the results and the survey data with a value of 0.12.
For the evotes we find a higher value 3.40, but still not high enough to reject the null hypothesis and state that the findings are significantly different between the survey and the results data.

 .

(B) Is there a significant difference between political preferences of the voters depending on their demographic attributes recorded in the survey (that is, age, gender, education level…)?

In [573]:
#Logistic Regression
# data transformation

df_survey = private[private["party"].isin(["Red","Green"])] # removing invalid votes
df_survey.head()
df_survey.shape

(192, 9)

In [574]:
#Making binary party column
df_survey["party_bin"] = df_survey["party"].apply(lambda x: 1 if x == "Green" else 0)
df_survey.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_survey["party_bin"] = df_survey["party"].apply(lambda x: 1 if x == "Green" else 0)


Unnamed: 0,name,sex,evote,dob,zip,education,citizenship,marital_status,party,party_bin
0,"Lewis, Samantha",Female,1,1975-07-30,2300,Vocational Education and Training (VET),Denmark,Married/separated,Green,1
1,"Moon, Ethan",Male,0,1937-10-21,2200,Primary education,Denmark,Widowed,Green,1
2,"Hoskins, Dawit",Male,0,1973-11-05,2100,Vocational Education and Training (VET),Denmark,Married/separated,Red,0
3,"Lowe, Brandon",Male,0,1977-04-19,2200,Masters programmes,Denmark,Married/separated,Red,0
4,"Sakamoto, Bruce",Male,1,1991-03-23,2200,Masters programmes,Ukraine,Married/separated,Green,1


In [575]:
#Making age groups column
now = datetime.date.today()
df_survey["age"] = df_survey["dob"].apply(lambda x: now.year - x.year)
df_survey["age_group"] = df_survey["age"].apply(lambda x: 0 if x <= 35 else(1 if x <= 45 else(2 if x <= 55 else(3 if x <= 65 else 4))))
df_survey.groupby(["age_group"]).size().head()
df_survey.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_survey["age"] = df_survey["dob"].apply(lambda x: now.year - x.year)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_survey["age_group"] = df_survey["age"].apply(lambda x: 0 if x <= 35 else(1 if x <= 45 else(2 if x <= 55 else(3 if x <= 65 else 4))))


Unnamed: 0,name,sex,evote,dob,zip,education,citizenship,marital_status,party,party_bin,age,age_group
0,"Lewis, Samantha",Female,1,1975-07-30,2300,Vocational Education and Training (VET),Denmark,Married/separated,Green,1,49,2
1,"Moon, Ethan",Male,0,1937-10-21,2200,Primary education,Denmark,Widowed,Green,1,87,4
2,"Hoskins, Dawit",Male,0,1973-11-05,2100,Vocational Education and Training (VET),Denmark,Married/separated,Red,0,51,2
3,"Lowe, Brandon",Male,0,1977-04-19,2200,Masters programmes,Denmark,Married/separated,Red,0,47,2
4,"Sakamoto, Bruce",Male,1,1991-03-23,2200,Masters programmes,Ukraine,Married/separated,Green,1,33,0


In [576]:
#Making binary gender column
df_survey["sex_bin"] = df_survey["sex"].apply(lambda x: 0 if x =="Male" else 1) 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_survey["sex_bin"] = df_survey["sex"].apply(lambda x: 0 if x =="Male" else 1)


In [577]:
#making numerical education column
education_mapping = {
    "Primary education" : 0,
    "Bachelors programmes": 2,
    "Masters programmes": 3,
    "Not stated": 0,
    "PhD programmes": 3,
    "Short cycle higher education": 0,
    "Upper secondary education": 0,
    "Vocational Education and Training (VET)": 1,
    "Vocational bachelors educations": 2 
}
df_survey["education_numeric"] = df_survey["education"].replace(education_mapping)
df_survey.groupby(["education_numeric"]).size()
# df_survey.groupby(["age_group"]).size()
# df_survey.groupby(["sex_bin"]).size()


  df_survey["education_numeric"] = df_survey["education"].replace(education_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_survey["education_numeric"] = df_survey["education"].replace(education_mapping)


education_numeric
0    65
1    64
2    39
3    24
dtype: int64

In [578]:
#making numerical education column
education_mapping = {
    "Primary education" : "not higher",
    "Bachelors programmes": "higher",
    "Masters programmes": "higher",
    "Not stated": "not higher",
    "PhD programmes": "higher",
    "Short cycle higher education": "higher",
    "Upper secondary education": "not higher",
    "Vocational Education and Training (VET)": "not higher",
    "Vocational bachelors educations": "higher" 
}
df_survey["education_transformed"] = df_survey["education"].replace(education_mapping)
df_survey.groupby(["education_numeric"]).size()
# df_survey.groupby(["age_group"]).size()
# df_survey.groupby(["sex_bin"]).size()
df_survey.groupby("citizenship").size()
# df_survey

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_survey["education_transformed"] = df_survey["education"].replace(education_mapping)


citizenship
Brazil              1
Bulgaria            1
Denmark           174
France              1
Iran                2
Ireland             1
Lithuania           1
Pakistan            1
Poland              1
Soviet Union        1
Syria               2
Turkey              2
USA                 1
Ukraine             1
United Kingdom      2
dtype: int64

Encodings:

-party:
0 Red
1 Green

-age:
0 19-33
1 33-44 
2 45-55
3 56-65
4 65+

-sex
0 Male
1 Female

-education
0 primary
1 vocational
2 bachelor
3 master+

In [579]:
df = df_survey[["party_bin","age_group","sex_bin","education_numeric"]]
df_vote = df_survey[["evote","age_group","sex_bin","education_numeric"]]
df_vote
df.groupby(["age_group"]).size()


age_group
0    43
1    34
2    45
3    30
4    40
dtype: int64

In [580]:
# Logistic Regression with model
y = df["party_bin"]
X = df[["age_group","sex_bin","education_numeric"]]
model = LogisticRegression()
model.fit(X,y)
coefficients = model.coef_[0]
feature_importance = dict(zip(X.columns,coefficients))
feature_importance

{'age_group': np.float64(-0.691000860870385),
 'sex_bin': np.float64(-0.9340428301072797),
 'education_numeric': np.float64(-0.6560230755389441)}

In [581]:

#getting p values with statsmodel

X_with_const = sm.add_constant(X)
logit_model = sm.Logit(y, X_with_const)
result = logit_model.fit()
print(result.summary())

#not the best way because the three attributes sex,age,education dont explain 
# well the distribution in the observed data


Optimization terminated successfully.
         Current function value: 0.532307
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:              party_bin   No. Observations:                  192
Model:                          Logit   Df Residuals:                      188
Method:                           MLE   Df Model:                            3
Date:                Wed, 13 Nov 2024   Pseudo R-squ.:                  0.2044
Time:                        17:39:54   Log-Likelihood:                -102.20
converged:                       True   LL-Null:                       -128.45
Covariance Type:            nonrobust   LLR p-value:                 2.343e-11
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 3.2791      0.530      6.185      0.000       2.240       4.318
age_grou

In [582]:
#Chi Square test of independence
#grouping for attribute and response
def grouping(dataframe,attribute,response):
    dataframe = dataframe.groupby([attribute,response]).size().reset_index(name="count")


    df_grouped = dataframe.pivot_table(index=attribute,columns=response,values="count",fill_value=0)
    df_grouped["total"] = df_grouped.sum(axis=1)
    total_loc = df_grouped.shape[0]
    df_grouped.loc[total_loc] = df_grouped.sum(axis=0)
    return df_grouped
df_group_age = grouping(df,"age_group","party_bin")
df_group_sex = grouping(df,"sex_bin","party_bin")
df_group_education = grouping(df,"education_numeric","party_bin")
df_group_education

party_bin,0,1,total
education_numeric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,17.0,48.0,65.0
1,21.0,43.0,64.0
2,24.0,15.0,39.0
3,13.0,11.0,24.0
4,75.0,117.0,192.0


In [583]:
#get expectation
def expectation(dataframe):

    shape = dataframe.shape
    row = shape[0]-1 #5
    col = shape[1]-1 #2

    expectations = pd.DataFrame(np.zeros((row,col)))

    grand_total = dataframe.iloc[row,col]

    for i in range(row):
        column_total = dataframe.iloc[i,col]
        for j in range(col):
            row_total = dataframe.iloc[row,j]
            
            expectations.iloc[i,j] = (column_total*row_total)/grand_total
    return expectations
age_expectations = expectation(df_group_age)
sex_expectations = expectation(df_group_sex)
education_expectations = expectation(df_group_education)
education_expectations

Unnamed: 0,0,1
0,25.390625,39.609375
1,25.0,39.0
2,15.234375,23.765625
3,9.375,14.625


In [584]:
#get the chi test scores
def chi_square(exp_df,group_df):
    row = exp_df.shape[0]
    col = exp_df.shape[1]  
    df_chi = pd.DataFrame(np.zeros((row,col)))
    for i in range(row):
        for j in range(col):
            obs = group_df.iloc[i,j]
            exp = exp_df.iloc[i,j]
            df_chi.iloc[i,j] = ((obs-exp)**2)/exp

    degree_of_freedom = (len(df_chi.columns)-1)*(len(df_chi[df_chi.columns[0]])-1)
    df_chi.loc["total"] = df_chi.sum(axis=0)
    return df_chi, degree_of_freedom

In [585]:
chi_age, dof = chi_square(age_expectations, df_group_age)
print("degrees of freedom: " ,dof)
chi_age

degrees of freedom:  4


Unnamed: 0,0,1
0,9.749433,6.249637
1,0.810662,0.519655
2,0.333681,0.213898
3,3.36675,2.158173
4,3.481,2.23141
total,17.741525,11.372773


In [586]:
chi_sex, dof = chi_square(sex_expectations, df_group_sex)
print("degrees of freedom: " ,dof)
chi_sex

degrees of freedom:  1


Unnamed: 0,0,1
0,1.431154,0.917406
1,1.691364,1.084207
total,3.122517,2.001614


In [587]:
chi_education, dof = chi_square(education_expectations, df_group_education)
print("degrees of freedom: " ,dof)
chi_education

degrees of freedom:  3


Unnamed: 0,0,1
0,2.772779,1.777422
1,0.64,0.410256
2,5.043606,3.233081
3,1.401667,0.898504
total,9.858051,6.319264


In [588]:
#chi square distribution table
chi_table = pd.DataFrame([3.841,5.991,7.815,9.488,11.070,12.592],columns=["0.05"])
chi_table.index = range(1,len(chi_table)+1)
chi_table.index.name = "dof"

chi_table

Unnamed: 0_level_0,0.05
dof,Unnamed: 1_level_1
1,3.841
2,5.991
3,7.815
4,9.488
5,11.07
6,12.592


(B) Is there a significant difference between political preferences of the voters depending on their demographic attributes recorded in the survey (that is, age, gender, education level…)?

Education level
There is a statistically significant association between level of education and political preference towards the Red party.

Age
Age shows a significant association with people's party preference. It shows an especially strong association with the Red party and a weaker but still significant one with the Green party.

gender
Sex does not seem to be a significant contributor to people's choice of political party.



(C) Is there a significant difference between voter’s choice of the voting channel (that is, if they decide to vote either online or in person) depending on their demographic attributes recorded in the survey?

In [589]:
df_vote

Unnamed: 0,evote,age_group,sex_bin,education_numeric
0,1,2,1,1
1,0,4,0,0
2,0,2,0,1
3,0,2,0,3
4,1,0,0,3
...,...,...,...,...
195,0,2,1,0
196,0,0,1,1
197,1,4,0,0
198,1,2,1,2


In [590]:
df_group_education_evote = grouping(df_vote,"education_numeric","evote")
education_expectations_evote = expectation(df_group_education_evote)
chi_education_evote, dof = chi_square(education_expectations_evote,df_group_education_evote)
print("degrees_of_freedom: ",dof)
chi_education_evote

degrees_of_freedom:  3


Unnamed: 0,0,1
0,0.009856,0.017569
1,1.97561,3.521739
2,2.551605,4.548512
3,0.009146,0.016304
total,4.546216,8.104125


In [591]:
df_group_age_evote = grouping(df_vote,"age_group","evote")
age_expectations_evote = expectation(df_group_age_evote)
chi_age_evote, dof = chi_square(age_expectations_evote,df_group_age_evote)
print("degrees_of_freedom: ",dof)
chi_age_evote


degrees_of_freedom:  4


Unnamed: 0,0,1
0,0.010857,0.019354
1,1.049543,1.870924
2,0.11593,0.206658
3,2.392734,4.265308
4,0.005488,0.009783
total,3.574551,6.372026


In [592]:
df_group_sex_evote = grouping(df_vote,"sex_bin","evote")
sex_expectations_evote = expectation(df_group_sex_evote)
chi_sex_evote, dof = chi_square(sex_expectations_evote,df_group_sex_evote)
print("degrees_of_freedom: ",dof)
chi_sex_evote

degrees_of_freedom:  1


Unnamed: 0,0,1
0,0.005863,0.010452
1,0.006929,0.012352
total,0.012792,0.022803


In [593]:
chi_table

Unnamed: 0_level_0,0.05
dof,Unnamed: 1_level_1
1,3.841
2,5.991
3,7.815
4,9.488
5,11.07
6,12.592


(C) Is there a significant difference between voter’s choice of the voting channel (that is, if they decide to vote either online or in person) depending on their demographic attributes recorded in the survey?

Education level
There is a statistically significant association between voting online and level of education of a person.

Age
Age shows no association with people's choice of voting method. 

gender
Sex does not seem to be a significant contributor to people's choice of voting method.



In [594]:
df_survey.groupby("education").size()

education
Bachelors programmes                        2
Masters programmes                         23
Not stated                                  2
PhD programmes                              1
Primary education                          38
Short cycle higher education                8
Upper secondary education                  17
Vocational Education and Training (VET)    64
Vocational bachelors educations            37
dtype: int64

STEP 2

Anonimization

Sensitive data:
party



In [595]:
# Load the dataset
survey_data = pd.read_excel("data/private_dataG.xlsx")  # Replace with your actual file path

# Step 1: Remove 'name' to ensure direct identifiers are not in the dataset
anonymized_survey_data = survey_data.drop(columns=['name'])

# Step 2: Generalize 'dob' to new age groups: "18-35", "36-65", "65+"
def broader_age_group(dob):
    age = 2024 - dob.year
    if age <= 35:
        return "18-35"
    elif age <= 65:
        return "36-65"
    else:
        return "65+"

anonymized_survey_data['age_group'] = pd.to_datetime(anonymized_survey_data['dob']).apply(broader_age_group)
anonymized_survey_data = anonymized_survey_data.drop(columns=['dob'])

# Step 3: Generalize 'zip' to regions
zip_to_region = {2100: 'Region A', 2200: 'Region A', 2300: 'Region B', 2400: 'Region B'}
anonymized_survey_data['region'] = anonymized_survey_data['zip'].map(zip_to_region)
anonymized_survey_data = anonymized_survey_data.drop(columns=['zip'])

# Step 4: Simplify 'marital_status' to broader categories
marital_mapping = {
    'Never married': 'Single',
    'Divorced': 'Single',
    'Married/separated': 'Married',
    'Widowed': 'Single'
}
anonymized_survey_data['marital_status'] = anonymized_survey_data['marital_status'].map(marital_mapping)

# Step 5: Generalize 'education' into fewer categories
education_mapping = {
    'Primary education': 'Basic Education',
    'Upper secondary education': 'Basic Education',
    'Vocational Education and Training (VET)': 'Higher Education',
    'Short cycle higher education': 'Higher Education',
    'Vocational bachelors educations': 'Higher Education',
    'Bachelors programmes': 'Higher Education',
    'Masters programmes': 'Higher Education',
    'PhD programmes': 'Higher Education',
    'Not stated': 'Basic Education'
}
anonymized_survey_data['education'] = anonymized_survey_data['education'].map(education_mapping)

# Step 6: Generalize 'citizenship' to 'Domestic' or 'Foreign'
anonymized_survey_data['citizenship'] = anonymized_survey_data['citizenship'].apply(
    lambda x: 'Domestic' if x == 'Denmark' else 'Foreign'
)

# Display the final anonymized survey data for verification
print("Anonymized Survey Data Sample:")
print(anonymized_survey_data.head())

# Define quasi-identifiers for k-anonymity and l-diversity analysis
quasi_identifiers = ['sex', 'age_group', 'region', 'marital_status', 'education', 'citizenship']

# 1. Calculate Disclosure Risk
grouped_counts = anonymized_survey_data.groupby(quasi_identifiers).size()
unique_groups = grouped_counts[grouped_counts == 1].count()
near_unique_groups = grouped_counts[grouped_counts <= 2].count()

print("\nDisclosure Risk Analysis:")
print(f"Unique groups (potential re-identification risk): {unique_groups}")
print(f"Near-unique groups (groups of size 2): {near_unique_groups}")

# 2. Calculate k-Anonymity
min_k_value = grouped_counts.min()
print(f"\nk-Anonymity Analysis:")
print(f"Minimum k value: {min_k_value}")

# 3. Calculate l-Diversity
# We need to check the diversity of sensitive values ('party') within each quasi-identifier group
l_diversity_counts = anonymized_survey_data.groupby(quasi_identifiers)['party'].nunique()
min_l_diversity = l_diversity_counts.min()

print(f"\nl-Diversity Analysis:")
print(f"Minimum l-diversity value for 'party': {min_l_diversity}")

Anonymized Survey Data Sample:
      sex  evote         education citizenship marital_status  party  \
0  Female      1  Higher Education    Domestic        Married  Green   
1    Male      0   Basic Education    Domestic         Single  Green   
2    Male      0  Higher Education    Domestic        Married    Red   
3    Male      0  Higher Education    Domestic        Married    Red   
4    Male      1  Higher Education     Foreign        Married  Green   

  age_group    region  
0     36-65  Region B  
1       65+  Region A  
2     36-65  Region A  
3     36-65  Region A  
4     18-35  Region A  

Disclosure Risk Analysis:
Unique groups (potential re-identification risk): 20
Near-unique groups (groups of size 2): 31

k-Anonymity Analysis:
Minimum k value: 1

l-Diversity Analysis:
Minimum l-diversity value for 'party': 1


In [596]:

anonymized_survey_data.groupby(["citizenship"]).size()

citizenship
Domestic    181
Foreign      19
dtype: int64

In [597]:
# Step 1: Identify high-risk groups (unique and near-unique records)
# Group by quasi-identifiers and find groups with size <= 2 (unique or near-unique)
grouped_counts = anonymized_survey_data.groupby(quasi_identifiers).size()
high_risk_groups = grouped_counts[grouped_counts <= 2].index

# Step 2: Apply local suppression to high-risk groups
# Replace values in specific columns for records in high-risk groups
anonymized_survey_data['suppressed'] = anonymized_survey_data[quasi_identifiers].apply(
    lambda row: tuple(row) in high_risk_groups, axis=1
)

# Locally suppress sensitive quasi-identifier values for high-risk records
anonymized_survey_data.loc[anonymized_survey_data['suppressed'], 'age_group'] = "Unknown"
anonymized_survey_data.loc[anonymized_survey_data['suppressed'], 'region'] = "Unknown"
anonymized_survey_data.loc[anonymized_survey_data['suppressed'], 'education'] = "Unknown"
anonymized_survey_data.loc[anonymized_survey_data['suppressed'], 'citizenship'] = "Unknown"

# Drop the helper column used for marking suppressed rows
anonymized_survey_data = anonymized_survey_data.drop(columns=['suppressed'])

# Recalculate Disclosure Risk, k-Anonymity, and l-Diversity after local suppression
grouped_counts = anonymized_survey_data.groupby(quasi_identifiers).size()
unique_groups = grouped_counts[grouped_counts == 1].count()
near_unique_groups = grouped_counts[grouped_counts <= 2].count()

print("\nRevised Disclosure Risk Analysis with Local Suppression:")
print(f"Unique groups (potential re-identification risk): {unique_groups}")
print(f"Near-unique groups (groups of size 2): {near_unique_groups}")

# Recalculate minimum k-anonymity and l-diversity
min_k_value = grouped_counts.min()
print(f"\nk-Anonymity Analysis:")
print(f"Minimum k value: {min_k_value}")

l_diversity_counts = anonymized_survey_data.groupby(quasi_identifiers)['party'].nunique()
min_l_diversity = l_diversity_counts.min()
print(f"\nl-Diversity Analysis:")
print(f"Minimum l-diversity value for 'party': {min_l_diversity}")



Revised Disclosure Risk Analysis with Local Suppression:
Unique groups (potential re-identification risk): 0
Near-unique groups (groups of size 2): 0

k-Anonymity Analysis:
Minimum k value: 3

l-Diversity Analysis:
Minimum l-diversity value for 'party': 1


In [598]:
#calculate k anonimity

quasi_identifiers = ["sex", "education", "citizenship", "marital_status", "age_group", "region"]
k_df = anonymized_survey_data.groupby(quasi_identifiers).size().reset_index(name="count")

k2plus_df = k_df[k_df["count"] >= 2]

k2plus_df.head(50)
len(k_df[k_df["count"] == 1])

0

In [599]:

df_survey.groupby("marital_status").size()
allgroup = anonymized_survey_data.groupby(['sex', 'evote', 'education', 'citizenship', 'marital_status', 'party',
       'age_group', 'region']).size().reset_index(name="count")
len(allgroup[allgroup["count"]==1])

37

In [600]:
#save to csv


In [601]:
#calculate disclosure risk

num_unique_rows = len(k_df[k_df["count"] == 1])
num_total_rows = len(anonymized_survey_data)
disclosure_risk = (num_unique_rows/num_total_rows)*100
print(f"reidentification risk: {disclosure_risk}%")

reidentification risk: 0.0%


Statistical analysis of anonymized data

(A) Is there a significant difference between the political preferences as expressed in the survey and the election results for both electronic and polling station votes?

(B) Is there a significant difference between political preferences of the voters depending on their demographic attributes recorded in the survey (that is, age, gender, education level…)?

(C) Is there a significant difference between voter’s choice of the voting channel (that is, if they decide to vote either online or in person) depending on their demographic attributes recorded in the survey?

In [602]:
anonymized_survey_data = anonymized_survey_data[anonymized_survey_data["party"].isin(["Green","Red"])]

In [603]:
anonymized_survey_data.groupby(["party"]).size()
# anonymized_survey_data.head(50)
# anonymized_survey_data.groupby(["citizenship"]).size()
anonymized_survey_data


Unnamed: 0,sex,evote,education,citizenship,marital_status,party,age_group,region
0,Female,1,Higher Education,Domestic,Married,Green,36-65,Region B
1,Male,0,Unknown,Unknown,Single,Green,Unknown,Unknown
2,Male,0,Higher Education,Domestic,Married,Red,36-65,Region A
3,Male,0,Higher Education,Domestic,Married,Red,36-65,Region A
4,Male,1,Unknown,Unknown,Married,Green,Unknown,Unknown
...,...,...,...,...,...,...,...,...
195,Female,0,Basic Education,Domestic,Married,Green,36-65,Region B
196,Female,0,Unknown,Unknown,Married,Green,Unknown,Unknown
197,Male,1,Unknown,Unknown,Married,Green,Unknown,Unknown
198,Female,1,Higher Education,Domestic,Single,Red,36-65,Region A


In [604]:
#A
#results df
expected_results =  expected_results[["Green","Red"]]
expected_results

Unnamed: 0,Green,Red
1,81.595976,42.638945
0,46.336985,24.213941
2,127.932961,66.852886


In [605]:

df_anonym_grouped = grouping(anonymized_survey_data,"evote","party")
df_anonym_grouped

party,Green,Red,total
evote,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,79.0,44.0,123.0
1,38.0,31.0,69.0
2,117.0,75.0,192.0


In [606]:
df_chi_anonym = pd.DataFrame([[1,2],[4,5]], index = ["poll","evote"],columns=["Red","Green"])
for i in range(0,2):
    for j in range(0,2):
        Eji = expected_results.iloc[j,i]
        Oji = df_anonym_grouped.iloc[j,i]
        val = ((Oji-Eji)**2)/Eji
        df_chi_anonym.iloc[j,i-1] = val

df_chi_anonym

  df_chi_anonym.iloc[j,i-1] = val
  df_chi_anonym.iloc[j,i-1] = val


Unnamed: 0,Red,Green
poll,0.043445,0.082591
evote,1.901822,1.499997


In [607]:
1.90182 + 1.499997

3.4018170000000003

In [608]:
#B
#sex
df_anonym_sex = grouping(anonymized_survey_data,"sex","party")
anonym_sex_expectations = expectation(df_anonym_sex)
anonym_chi_sex, dof = chi_square(anonym_sex_expectations,df_anonym_sex)
print("degrees_of_freedom: ",dof)
anonym_chi_sex

degrees_of_freedom:  1


Unnamed: 0,0,1
0,1.084207,1.691364
1,0.917406,1.431154
total,2.001614,3.122517


In [609]:
#age
df_anonym_age = grouping(anonymized_survey_data,"age_group","party")
anonym_age_expectations = expectation(df_anonym_age)
anonym_chi_age, dof = chi_square(anonym_age_expectations,df_anonym_age)
print("degrees_of_freedom: ",dof)
anonym_chi_age

degrees_of_freedom:  3


Unnamed: 0,0,1
0,5.339847,8.330161
1,0.486924,0.759601
2,0.471154,0.735
3,0.321838,0.502067
total,6.619762,10.326829


In [610]:
#education
df_anonym_education = grouping(anonymized_survey_data,"education","party")
anonym_education_expectations = expectation(df_anonym_education)
anonym_chi_education, dof = chi_square(anonym_education_expectations,df_anonym_education)
print("degrees_of_freedom: ",dof)
anonym_chi_education

degrees_of_freedom:  2


Unnamed: 0,0,1
0,2.385256,3.721
1,0.342924,0.534961
2,0.321838,0.502067
total,3.050018,4.758029


In [611]:
#marital
df_anonym_marital = grouping(anonymized_survey_data,"marital_status","party")
anonym_marital_expectations = expectation(df_anonym_marital)
anonym_chi_marital, dof = chi_square(anonym_marital_expectations,df_anonym_marital)
print("degrees_of_freedom: ",dof)
anonym_chi_marital

degrees_of_freedom:  1


Unnamed: 0,0,1
0,0.854006,1.33225
1,0.753535,1.175515
total,1.607541,2.507765


In [612]:
chi_table

Unnamed: 0_level_0,0.05
dof,Unnamed: 1_level_1
1,3.841
2,5.991
3,7.815
4,9.488
5,11.07
6,12.592


In [613]:
#C
#sex
df_anonym_sex = grouping(anonymized_survey_data,"sex","evote")
anonym_sex_expectations = expectation(df_anonym_sex)
anonym_chi_sex, dof = chi_square(anonym_sex_expectations,df_anonym_sex)
print("degrees_of_freedom: ",dof)
anonym_chi_sex

degrees_of_freedom:  1


Unnamed: 0,0,1
0,0.006929,0.012352
1,0.005863,0.010452
total,0.012792,0.022803


In [614]:
#age
df_anonym_age = grouping(anonymized_survey_data,"age_group","evote")
anonym_age_expectations = expectation(df_anonym_age)
anonym_chi_age, dof = chi_square(anonym_age_expectations,df_anonym_age)
print("degrees_of_freedom: ",dof)
anonym_chi_age

degrees_of_freedom:  3


Unnamed: 0,0,1
0,0.007938,0.01415
1,0.052689,0.093923
2,0.025407,0.04529
3,0.157608,0.280954
total,0.243641,0.434317


In [615]:
#education
df_anonym_education = grouping(anonymized_survey_data,"education","evote")
anonym_education_expectations = expectation(df_anonym_education)
anonym_chi_education, dof = chi_square(anonym_education_expectations,df_anonym_education)
print("degrees_of_freedom: ",dof)
anonym_chi_education

degrees_of_freedom:  2


Unnamed: 0,0,1
0,0.07378,0.131522
1,0.00513,0.009144
2,0.157608,0.280954
total,0.236518,0.42162


Conclusion



Non anonymized



(A) Is there a significant difference between the political preferences as expressed in the survey and the election results for both electronic and polling station votes?

with significance level $\alpha$ = 0.05, the critical threshhold for a chi square test is 3.841.

For the poll votes we find no significant difference between the results and the survey data with a value of 0.12.
For the evotes we find a higher value 3.40, but still not high enough to reject the null hypothesis and state that the findings are significantly different between the survey and the results data.

(B) Is there a significant difference between political preferences of the voters depending on their demographic attributes recorded in the survey (that is, age, gender, education level…)?

Education level
There is a statistically significant association between level of education and political preference towards the Red party.

Age
Age shows a significant association with people's party preference. It shows an especially strong association with the Red party and a weaker but still significant one with the Green party.

gender
Sex does not seem to be a significant contributor to people's choice of political party.

(C) Is there a significant difference between voter’s choice of the voting channel (that is, if they decide to vote either online or in person) depending on their demographic attributes recorded in the survey?

Education level
There is a statistically significant association between voting online and level of education of a person.

Age
Age shows no association with people's choice of voting method. 

gender
Sex does not seem to be a significant contributor to people's choice of voting method.




Anonymized



(A) 
There is no significant association between people's voting method and choice of party.

(B)
There is no significant association between sex and people's choice of party.
Age is statistically significant for the Red party and close to being significant for the Green party.
Education is not significant, but close, for both parties.

(C)
There is no significant association between sex and people's choice of voting method.
Age has no statistically significant implication for people's choice of voting method.
Education is not significant for voting method.

In [616]:

df_survey.groupby("marital_status").size()
allgroup = anonymized_survey_data.groupby(['sex', 'evote', 'education', 'citizenship', 'marital_status', 'party',
       'age_group', 'region']).size().reset_index(name="count")
len(allgroup[allgroup["count"]==1])

29

In [617]:
print(anonymized_survey_data.columns)
len(anonymized_survey_data[anonymized_survey_data["education"]=="Unknown"])
# len(anonymized_survey_data[anonymized_survey_data["sex"]=="Unknown"])
# len(anonymized_survey_data[anonymized_survey_data["evote"]=="Unknown"])
# len(anonymized_survey_data[anonymized_survey_data["citizenship"]=="Unknown"])
# len(anonymized_survey_data[anonymized_survey_data["marital_status"]=="Unknown"])
# len(anonymized_survey_data[anonymized_survey_data["party"]=="Unknown"])
# len(anonymized_survey_data[anonymized_survey_data["age_group"]=="Unknown"])
# len(anonymized_survey_data[anonymized_survey_data["region"]=="Unknown"])
# len(anonymized_survey_data[anonymized_survey_data["education"]=="Unknown"])

Index(['sex', 'evote', 'education', 'citizenship', 'marital_status', 'party',
       'age_group', 'region'],
      dtype='object')


39

In [618]:
anonymized_survey_data.to_csv('anonymised_dataG.csv', index=False)

In [619]:
anonymized_survey_data

Unnamed: 0,sex,evote,education,citizenship,marital_status,party,age_group,region
0,Female,1,Higher Education,Domestic,Married,Green,36-65,Region B
1,Male,0,Unknown,Unknown,Single,Green,Unknown,Unknown
2,Male,0,Higher Education,Domestic,Married,Red,36-65,Region A
3,Male,0,Higher Education,Domestic,Married,Red,36-65,Region A
4,Male,1,Unknown,Unknown,Married,Green,Unknown,Unknown
...,...,...,...,...,...,...,...,...
195,Female,0,Basic Education,Domestic,Married,Green,36-65,Region B
196,Female,0,Unknown,Unknown,Married,Green,Unknown,Unknown
197,Male,1,Unknown,Unknown,Married,Green,Unknown,Unknown
198,Female,1,Higher Education,Domestic,Single,Red,36-65,Region A


In [620]:
import os
os.getcwd()

'c:\\Users\\abels\\ITU\\Privacy\\Final_Proj'

In [623]:
os.chdir('C:/Users/abels/ITU/Privacy/Final_Project/priv_proj')
os.getcwd()

'C:\\Users\\abels\\ITU\\Privacy\\Final_Project\\priv_proj'