In [None]:
import pandas as pd
import json
import diffprivlib as dp
import nltk
from nltk.stem.porter import PorterStemmer
import numpy as np
import pickle
from datasets import load_dataset
from yellowbrick.text import TSNEVisualizer
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import unidecode
import re
import random


In [None]:
def CleaningPipeline(texts):
    """Normalise every string of ``texts`` in place and return the same list.

    Each text is lower-cased, passed through ``unidecode.unidecode``, stripped
    of the characters in ``string.punctuation`` and finally has every run of
    two or more whitespace characters replaced by a single space.

    Args:
        texts: mutable sequence of strings; its items are overwritten.

    Returns:
        The same ``texts`` object, now holding the cleaned strings.
    """
    # Build the deletion table once instead of once per text.
    punctuation_table = str.maketrans('', '', string.punctuation)
    for i, text in enumerate(texts):
        text = unidecode.unidecode(text.lower())
        text = text.translate(punctuation_table)
        # re.sub returns a new string; the result has to be stored, otherwise
        # the whitespace collapsing silently has no effect.
        texts[i] = re.sub(r"\s\s+", " ", text)
    return texts
def loadAndCleanDatasetBalanced(datasetName, _language):
    """Load a dataset and return a class-balanced training subsample plus the full validation/test splits.

    The training split is reduced to the same number of label-1 and label-0
    examples (at most 5000 per class), drawn at random with ``random.shuffle``.
    The positives come first in the returned training lists, followed by the
    negatives. Texts of all three splits are cleaned with ``CleaningPipeline``.

    Args:
        datasetName: name passed to ``load_dataset``.
        _language: value passed to ``load_dataset`` as ``language``.

    Returns:
        Tuple ``(x_train_bal, y_train_bal, x_val, y_val, x_test, y_test,
        canton_test, region_test, canton_val, region_val, canton_train_bal,
        region_train_bal)``.
    """
    dataset = load_dataset(datasetName, language=_language)

    train = dataset["train"]
    # list(...) guarantees a mutable sequence: CleaningPipeline assigns items in place.
    x_train = CleaningPipeline(list(train["text"]))
    y_train = train["label"]
    canton_train = train["canton"]
    region_train = train["region"]

    positiveIndices = [i for i, label in enumerate(y_train) if label == 1]
    random.shuffle(positiveIndices)
    negativeIndices = [i for i, label in enumerate(y_train) if label == 0]
    random.shuffle(negativeIndices)

    # Cap by BOTH class sizes; capping by the positives alone yields an
    # unbalanced set whenever there are fewer negatives than positives.
    maxValues = min(5000, len(positiveIndices), len(negativeIndices))
    selected = positiveIndices[:maxValues] + negativeIndices[:maxValues]

    x_train_bal = [x_train[i] for i in selected]
    y_train_bal = [y_train[i] for i in selected]
    canton_train_bal = [canton_train[i] for i in selected]
    region_train_bal = [region_train[i] for i in selected]

    validation = dataset["validation"]
    x_val = CleaningPipeline(list(validation["text"]))
    y_val = validation["label"]
    canton_val = validation["canton"]
    region_val = validation["region"]

    test = dataset["test"]
    x_test = CleaningPipeline(list(test["text"]))
    y_test = test["label"]
    canton_test = test["canton"]
    region_test = test["region"]

    print(len(y_train_bal))
    return x_train_bal, y_train_bal, x_val, y_val, x_test, y_test, canton_test, region_test, canton_val, region_val, canton_train_bal, region_train_bal
def loadAndCleanDatasetImbalanced(datasetName, _language):
    """Load a dataset without re-balancing, keeping the first 10000 training and validation rows.

    The test split is returned in full. Texts of all three splits are cleaned
    with ``CleaningPipeline``.

    Args:
        datasetName: name passed to ``load_dataset``.
        _language: value passed to ``load_dataset`` as ``language``.

    Returns:
        Tuple ``(x_train, y_train, x_val, y_val, x_test, y_test, canton_test,
        region_test, canton_val, region_val, canton_train, region_train)``.
    """
    limit = 10000
    dataset = load_dataset(datasetName, language=_language)

    train = dataset["train"]
    # Slicing before cleaning avoids processing rows that are thrown away.
    x_train = CleaningPipeline(list(train["text"][0:limit]))
    y_train = train["label"][0:limit]
    canton_train = train["canton"][0:limit]
    region_train = train["region"][0:limit]

    validation = dataset["validation"]
    x_val = CleaningPipeline(list(validation["text"][0:limit]))
    y_val = validation["label"][0:limit]
    # The group columns get the same cut as x_val/y_val so that they stay
    # row-aligned with the validation texts.
    canton_val = validation["canton"][0:limit]
    region_val = validation["region"][0:limit]

    test = dataset["test"]
    x_test = CleaningPipeline(list(test["text"]))
    y_test = test["label"]
    canton_test = test["canton"]
    region_test = test["region"]

    return x_train, y_train, x_val, y_val, x_test, y_test, canton_test, region_test, canton_val, region_val, canton_train, region_train
def Multiply(x, w):
    """Return the element-wise product of ``x`` and ``w`` as computed by ``np.multiply``."""
    product = np.multiply(x, w)
    return product
    

def VisualizeDatasetSklearn(x_val, canton_val, region_val, coef, name):
    """Show two t-SNE plots of ``x_val`` weighted by ``coef``: one coloured by canton, one by region.

    The shapes of both inputs and the weighted matrix are printed first.
    ``coef`` is cast to float16 before being multiplied element-wise with
    ``x_val``. ``name`` is appended to each plot title.
    """
    print(x_val.shape)
    print(coef.shape)

    weights = coef.astype(np.float16)
    dataToVisualize = Multiply(x_val, weights)
    print(dataToVisualize)

    plots = (
        ('TSNE with colored cantons, model: ', canton_val),
        ('TSNE with colored regions, model: ', region_val),
    )
    for title_prefix, group_labels in plots:
        tsne = TSNEVisualizer(colormap='tab20c', title=title_prefix + name)
        tsne.fit(dataToVisualize, group_labels)
        tsne.show()
    
def VisualizeDatasetSklearn2(x_val, canton_val, region_val, coef, name):
    """Sparse-input variant: densify ``x_val`` with ``toarray()``, weight it by ``coef`` and show two t-SNE plots.

    ``coef`` is cast to float16 first. The first plot is coloured by
    ``canton_val``, the second by ``region_val``; ``name`` is appended to each
    plot title.
    """
    half_precision_coef = coef.astype(np.float16)
    dense_features = x_val.toarray()
    dataToVisualize = Multiply(dense_features, half_precision_coef)

    for grouping, group_labels in (('cantons', canton_val), ('regions', region_val)):
        tsne = TSNEVisualizer(colormap='tab20c', title='TSNE with colored ' + grouping + ', model: ' + name)
        tsne.fit(dataToVisualize, group_labels)
        tsne.show()
    

In [None]:
def GetStatistics():
    """Count training and test examples per canton / region, language and label, and append the tables to text files.

    For every entry of the module-level ``languages`` list the dataset is
    loaded, and each training and test example increments the counter
    ``table[group][language][label]`` (the label value is used as a list index,
    so it has to be 0 or 1). The canton table is appended to ``canton.txt`` and
    the region table to ``region.txt``; every line is the group name followed
    by the two counters of each language, all terminated by ``;``.

    Returns:
        Tuple ``(cantonDict, regionDict)`` of the nested counter dictionaries.
    """
    def _count(table, groups, labels, language):
        # One counter pair [label 0, label 1] per language for every group.
        for group, label in zip(groups, labels):
            per_language = table.setdefault(group, {'de': [0, 0], 'fr': [0, 0], 'it': [0, 0]})
            per_language[language][label] += 1

    def _append_table(path, table):
        # `with` closes the file even if a write raises.
        with open(path, "a") as out:
            for group, languageDict in table.items():
                out.write(str(group) + ";")
                for value in languageDict.values():
                    out.write(str(value[0]) + ";" + str(value[1]) + ";")
                out.write("\n")

    cantonDict = {}
    regionDict = {}
    for language in languages:
        # NOTE(review): `loadAndCleanDataset` is not defined in the visible cells
        # (only the ...Balanced / ...Imbalanced variants are) - confirm which
        # loader is meant before running this.
        x_train, y_train, x_val, y_val, x_test, y_test, canton_test, region_test, canton_val, region_val, canton_train, region_train = loadAndCleanDataset('fscs', language)
        _count(cantonDict, canton_train, y_train, language)
        _count(cantonDict, canton_test, y_test, language)
        _count(regionDict, region_train, y_train, language)
        _count(regionDict, region_test, y_test, language)

    _append_table("canton.txt", cantonDict)
    _append_table("region.txt", regionDict)

    return cantonDict, regionDict


def getTotalDist(language, canton_train, canton_test, region_train, region_test):
    """Count how often each canton and each region occurs across the train and test columns.

    ``language`` is accepted but not used. Cantons are tallied test-first,
    regions train-first, which determines the key order of the returned dicts.

    Returns:
        Tuple ``(cantonDict, regionDict)`` mapping each value to its total count.
    """
    def tally(*columns):
        occurrences = {}
        for column in columns:
            for key in column:
                occurrences[key] = occurrences.get(key, 0) + 1
        return occurrences

    cantonDict = tally(canton_test, canton_train)
    regionDict = tally(region_train, region_test)
    return cantonDict, regionDict

In [None]:
from sklearn.metrics import f1_score
def CalculateOddsAndRiskFixed(predictions, labels, sensitive_col, sensitiveDict):
    """Compute per-group accuracy and positive-prediction rate.

    Examples are grouped by their value in ``sensitive_col``. Groups whose
    entry in ``sensitiveDict`` is below 5 are skipped.

    Args:
        predictions: predicted labels, indexable by position.
        labels: true labels, indexable by position.
        sensitive_col: group value of every example.
        sensitiveDict: mapping from group value to a count used for filtering.

    Returns:
        Tuple ``(accuracies, odds)`` of two parallel lists with one entry per
        retained group, in order of first appearance in ``sensitive_col``:
        the share of correct predictions and the share of predictions equal
        to 1.

    Raises:
        KeyError: if a value of ``sensitive_col`` is missing from ``sensitiveDict``.
    """
    # Single pass over the column instead of one full scan per distinct group.
    # group -> [count, positives, correct]; dicts keep first-seen order, which
    # also makes the output order deterministic (set iteration order is not).
    stats = {}
    for i, value in enumerate(sensitive_col):
        entry = stats.setdefault(value, [0, 0, 0])
        entry[0] += 1
        if predictions[i] == 1:
            entry[1] += 1
        if predictions[i] == labels[i]:
            entry[2] += 1

    accuracies = []
    odds = []
    for value, (count, positives, correct) in stats.items():
        if sensitiveDict[value] < 5:  # Filtering mechanism
            continue
        accuracies.append(correct / count)
        odds.append(positives / count)
    return accuracies, odds
def CalculatePositiveAccuracy(predictions, labels):
    """Return the share of label-1 examples that were also predicted as 1.

    Args:
        predictions: predicted labels.
        labels: true labels, aligned with ``predictions``.

    Returns:
        ``TP / P`` where ``P`` is the number of labels equal to 1 and ``TP``
        the number of those with a matching prediction; ``0.0`` when there is
        no positive label at all (instead of raising ZeroDivisionError).
    """
    P = 0
    TP = 0
    for prediction, label in zip(predictions, labels):
        if label == 1:
            P += 1
            if label == prediction:
                TP += 1
    if P == 0:
        return 0.0
    return TP / P
def EvaluateResults(predictions, labels, sensitive_attributes, canton_test, region_test, language,  cantonDict, regionDict):
    """Score predictions with F1 and with per-canton / per-region disparity measures.

    ``sensitive_attributes`` is read only for the first element of its first
    two entries, which name the canton and region results; ``language`` is
    not used.

    Returns:
        Tuple ``(micro, macro, equal_risk, equal_odds, positive_accuracy,
        equal_risk_var_canton, equal_odds_var_canton, equal_risk_var_region,
        equal_odds_var_region)``. ``equal_risk`` and ``equal_odds`` are lists of
        ``(attribute_name, max - min)`` pairs over the per-group accuracies and
        positive-prediction rates; the ``*_var_*`` values are the matching
        ``np.var`` results.
    """
    micro = f1_score(labels, predictions, average='micro')
    macro = f1_score(labels, predictions, average='macro')

    positive_accuracy = CalculatePositiveAccuracy(predictions, labels)
    canton_acc, canton_odds = CalculateOddsAndRiskFixed(predictions, labels, canton_test, cantonDict)
    region_acc, region_odds = CalculateOddsAndRiskFixed(predictions, labels, region_test, regionDict)

    equal_risk = []
    equal_odds = []
    variances = []  # risk/odds variance for canton, then risk/odds variance for region
    per_attribute = ((0, canton_acc, canton_odds), (1, region_acc, region_odds))
    for position, group_accuracies, group_rates in per_attribute:
        equal_risk.append((sensitive_attributes[position][0], max(group_accuracies) - min(group_accuracies)))
        variances.append(np.var(group_accuracies))
        equal_odds.append((sensitive_attributes[position][0], max(group_rates) - min(group_rates)))
        variances.append(np.var(group_rates))

    equal_risk_var_canton, equal_odds_var_canton, equal_risk_var_region, equal_odds_var_region = variances
    return micro, macro, equal_risk, equal_odds, positive_accuracy, equal_risk_var_canton, equal_odds_var_canton, equal_risk_var_region, equal_odds_var_region

In [None]:
def testSklearnModel(vectorizer, privacy_on, x_train, y_train, x_val, y_val, x_test, y_test, canton_test, region_test, canton_val, region_val, canton_train, region_train, epsilon, n_runs=5):
    """Train logistic regression on vectorised texts and append the averaged metrics to the results file.

    The vectorizer is fitted on ``x_train`` and applied to the validation and
    test texts. The model is trained and evaluated on the test split
    ``n_runs`` times; the metrics returned by ``EvaluateResults`` are averaged
    and written as one ``;``-separated line to
    ``resultsWithBalancedDatasetEpsilonnew<epsilon>.txt``. The columns are:
    language, model, vectorizer, f1_micro, f1_macro, positive_accuracy,
    equal_risk_canton, equal_risk_region, equal_risk_var_canton,
    equal_risk_var_region, equal_odds_canton, equal_odds_region,
    equal_odds_var_canton, equal_odds_var_region.
    On every third run (k = 0, 3, ...) the validation split is visualised with
    ``VisualizeDatasetSklearn2``.

    Note: ``language`` is not a parameter; it is read from module scope.

    Args:
        vectorizer: object with ``fit_transform`` / ``transform``.
        privacy_on: if true use ``dp.models.LogisticRegression(epsilon=epsilon)``,
            otherwise ``LogisticRegression()``.
        x_train, y_train, x_val, y_val, x_test, y_test: texts and labels of the splits.
        canton_*, region_*: group columns aligned with the respective split.
        epsilon: privacy budget; also part of the results file name.
        n_runs: number of train/evaluate repetitions that are averaged
            (1 gives non-averaged results).
    """
    x_train = vectorizer.fit_transform(x_train)
    x_val = vectorizer.transform(x_val)
    x_test = vectorizer.transform(x_test)
    cantonDict, regionDict = getTotalDist(language, canton_train, canton_test, region_train, region_test)

    # Running sums of the eleven metrics, kept in the column order of the result line.
    totals = [0] * 11
    for k in range(n_runs):
        if privacy_on:
            model = dp.models.LogisticRegression(epsilon = epsilon)
            # The private model is fitted on a dense matrix; toarray() is the
            # explicit spelling of the `.A` shorthand.
            model.fit(x_train.toarray(), y_train)
        else:
            print(k)
            model = LogisticRegression()
            model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        micro, macro, equal_risk, equal_odds, positive_accuracy, equal_risk_var_canton, equal_odds_var_canton, equal_risk_var_region, equal_odds_var_region = EvaluateResults(y_pred, y_test, [("canton", canton_test),("region", region_test)], canton_test, region_test, language, cantonDict, regionDict)
        run_values = [
            micro, macro, positive_accuracy,
            equal_risk[0][1], equal_risk[1][1],
            equal_risk_var_canton, equal_risk_var_region,
            equal_odds[0][1], equal_odds[1][1],
            equal_odds_var_canton, equal_odds_var_region,
        ]
        totals = [total + value for total, value in zip(totals, run_values)]
        if k % 3 == 0:
            plot_model = "logisticregression_dp_" if privacy_on else "logisticregression_"
            filename = language + ";" + plot_model + str(vectorizer) + " BalancedDataSet"
            VisualizeDatasetSklearn2(x_val, canton_val, region_val, model.coef_, filename)

    model_label = "logisticregression_dp" if privacy_on else str(model)
    fields = [language, model_label, str(vectorizer)] + [str(total / n_runs) for total in totals]
    # `with` guarantees the handle is closed; the file is only touched once the
    # results exist.
    with open("resultsWithBalancedDatasetEpsilonnew"+str(epsilon)+".txt","a") as file1:
        file1.write(";".join(fields) + "\n")

In [None]:
def shortenTokenization(input_ids, attention_mask, max_length=512):
    """Truncate a single tokenised sample to ``max_length`` tokens and move it to the GPU.

    If the second dimension of ``input_ids`` exceeds ``max_length``, the first
    row of both inputs is cut to its first ``max_length`` entries and reshaped
    to ``(1, max_length)``; otherwise the inputs are used unchanged.

    Args:
        input_ids: token id tensor; only row 0 is kept when truncating.
        attention_mask: mask tensor matching ``input_ids``.
        max_length: maximum number of tokens to keep (default 512).

    Returns:
        Dict with keys ``"input_ids"`` and ``"attention_mask"`` holding the
        results of calling ``.cuda()`` on the (possibly truncated) tensors.
    """
    if input_ids.shape[1] > max_length:
        input_ids = input_ids[0, :max_length].reshape((1, max_length))
        attention_mask = attention_mask[0, :max_length].reshape((1, max_length))
    # Built once: the dict previously created inside the branch above was
    # immediately overwritten, costing a second device transfer for nothing.
    return {"input_ids": input_ids.cuda(), "attention_mask": attention_mask.cuda()}
    


def RobertaTokenizeAndTransformData(data, tokenizer, transformModel, mt5):
    """Embed every text of ``data`` with a transformer and return one feature row per text.

    Each sample is tokenised on its own, shortened with ``shortenTokenization``
    and fed to ``transformModel``.

    Args:
        data: sized iterable of texts.
        tokenizer: callable returning ``input_ids`` / ``attention_mask`` for a text.
        transformModel: the encoder model.
        mt5: if true the model is called with the token ids as both
            ``input_ids`` and ``decoder_input_ids`` and the feature vector is
            the mean over the token axis of ``last_hidden_state``; otherwise
            the feature vector is row 0 of ``pooler_output``.

    Returns:
        NumPy array with ``len(data)`` rows. The width is taken from the first
        embedding; for empty ``data`` it is 768.
    """
    import torch  # local import: only needed for no_grad()

    features = None
    # Inference only: skip autograd bookkeeping for the forward passes.
    with torch.no_grad():
        for i, dataSample in enumerate(data):
            encoded = tokenizer(dataSample, return_tensors='pt')
            model_inputs = shortenTokenization(encoded['input_ids'], encoded['attention_mask'])
            if mt5:
                hidden_states = transformModel(input_ids=model_inputs["input_ids"], decoder_input_ids=model_inputs["input_ids"]).last_hidden_state.cpu().detach().numpy()
                vector = np.mean(hidden_states[0, :, :], axis=0)
            else:
                pooled = transformModel(**model_inputs).pooler_output.detach().cpu().numpy()
                vector = pooled[0, :]
            if features is None:
                # Size the matrix from the model output instead of assuming 768.
                features = np.zeros((len(data), vector.shape[0]))
            features[i, :] = vector
    if features is None:
        features = np.zeros((len(data), 768))
    return features

In [None]:
def testMt5Model(naming, privacy_on, tokenizer, transformModel, x_train, y_train, x_val, y_val, x_test, y_test, canton_test, region_test, canton_val, region_val, canton_train, region_train, epsilon, n_runs=5):
    """Train logistic regression on mT5-style embeddings and append the averaged metrics to the results file.

    All three splits are embedded with ``RobertaTokenizeAndTransformData`` in
    its ``mt5=True`` mode. The model is trained and evaluated on the test split
    ``n_runs`` times; the metrics returned by ``EvaluateResults`` are averaged
    and written as one ``;``-separated line to
    ``resultsWithBalancedDatasetEpsilonnew<epsilon>.txt``. The columns are:
    language, model, vectorizer, f1_micro, f1_macro, positive_accuracy,
    equal_risk_canton, equal_risk_region, equal_risk_var_canton,
    equal_risk_var_region, equal_odds_canton, equal_odds_region,
    equal_odds_var_canton, equal_odds_var_region.
    On every third run (k = 0, 3, ...) the validation embeddings are
    visualised with ``VisualizeDatasetSklearn``.

    Note: ``language`` is not a parameter; it is read from module scope.

    Args:
        naming: model name used in the result line and the plot titles.
        privacy_on: if true use ``dp.models.LogisticRegression(epsilon=epsilon)``,
            otherwise ``LogisticRegression()``.
        tokenizer, transformModel: passed to ``RobertaTokenizeAndTransformData``.
        x_train, y_train, x_val, y_val, x_test, y_test: texts and labels of the splits.
        canton_*, region_*: group columns aligned with the respective split.
        epsilon: privacy budget; also part of the results file name.
        n_runs: number of train/evaluate repetitions that are averaged
            (1 gives non-averaged results).
    """
    x_train = RobertaTokenizeAndTransformData(x_train, tokenizer, transformModel, True)
    x_test = RobertaTokenizeAndTransformData(x_test,tokenizer,transformModel, True)
    x_val = RobertaTokenizeAndTransformData(x_val, tokenizer, transformModel, True)
    cantonDict, regionDict = getTotalDist(language, canton_train, canton_test, region_train, region_test)

    if privacy_on:
        plot_name = language + ";" + "logisticregression_dp" + naming
        model_label = "logisticregression_dp_" + naming
    else:
        plot_name = language + ";" + "logisticregression" + naming +";"+naming+"Vectorizer"
        model_label = "logisticregression_" + naming

    # Running sums of the eleven metrics, kept in the column order of the result line.
    totals = [0] * 11
    for k in range(n_runs):
        if privacy_on:
            model = dp.models.LogisticRegression(epsilon = epsilon)
        else:
            model = LogisticRegression()
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        micro, macro, equal_risk, equal_odds, positive_accuracy, equal_risk_var_canton, equal_odds_var_canton, equal_risk_var_region, equal_odds_var_region = EvaluateResults(y_pred, y_test, [("canton", canton_test),("region", region_test)], canton_test, region_test, language, cantonDict, regionDict)
        run_values = [
            micro, macro, positive_accuracy,
            equal_risk[0][1], equal_risk[1][1],
            equal_risk_var_canton, equal_risk_var_region,
            equal_odds[0][1], equal_odds[1][1],
            equal_odds_var_canton, equal_odds_var_region,
        ]
        totals = [total + value for total, value in zip(totals, run_values)]
        if k % 3 == 0:
            VisualizeDatasetSklearn(x_val, canton_val, region_val, model.coef_, plot_name)

    fields = [language, model_label, naming + "Vectorizer"] + [str(total / n_runs) for total in totals]
    # `with` guarantees the handle is closed; the file is only touched once the
    # results exist.
    with open("resultsWithBalancedDatasetEpsilonnew"+str(epsilon)+".txt","a") as file1:
        file1.write(";".join(fields) + "\n")

In [None]:
def testRobertaModel(naming, privacy_on, tokenizer, transformModel, x_train, y_train, x_val, y_val, x_test, y_test, canton_test, region_test, canton_val, region_val, canton_train, region_train, epsilon, n_runs=5):
    """Train logistic regression on pooled encoder embeddings and append the averaged metrics to the results file.

    All three splits are embedded with ``RobertaTokenizeAndTransformData`` in
    its ``mt5=False`` mode. The model is trained and evaluated on the test split
    ``n_runs`` times; the metrics returned by ``EvaluateResults`` are averaged
    and written as one ``;``-separated line to
    ``resultsWithBalancedDatasetEpsilonnew<epsilon>.txt``. The columns are:
    language, model, vectorizer, f1_micro, f1_macro, positive_accuracy,
    equal_risk_canton, equal_risk_region, equal_risk_var_canton,
    equal_risk_var_region, equal_odds_canton, equal_odds_region,
    equal_odds_var_canton, equal_odds_var_region.
    On every third run (k = 0, 3, ...) the validation embeddings are
    visualised with ``VisualizeDatasetSklearn``.

    Note: ``language`` is not a parameter; it is read from module scope.

    Args:
        naming: model name used in the result line and the plot titles.
        privacy_on: if true use ``dp.models.LogisticRegression(epsilon=epsilon)``,
            otherwise ``LogisticRegression()``.
        tokenizer, transformModel: passed to ``RobertaTokenizeAndTransformData``.
        x_train, y_train, x_val, y_val, x_test, y_test: texts and labels of the splits.
        canton_*, region_*: group columns aligned with the respective split.
        epsilon: privacy budget; also part of the results file name.
        n_runs: number of train/evaluate repetitions that are averaged
            (1 gives non-averaged results).
    """
    x_train = RobertaTokenizeAndTransformData(x_train, tokenizer, transformModel, False)
    x_test = RobertaTokenizeAndTransformData(x_test,tokenizer,transformModel, False)
    x_val = RobertaTokenizeAndTransformData(x_val, tokenizer, transformModel, False)
    cantonDict, regionDict = getTotalDist(language, canton_train, canton_test, region_train, region_test)

    # The same label names the model in the result line and in the plot titles.
    model_label = ("logisticregression_dp_" if privacy_on else "logisticregression_") + naming

    # Running sums of the eleven metrics, kept in the column order of the result line.
    totals = [0] * 11
    for k in range(n_runs):
        if privacy_on:
            model = dp.models.LogisticRegression(epsilon = epsilon)
        else:
            model = LogisticRegression()
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        micro, macro, equal_risk, equal_odds, positive_accuracy, equal_risk_var_canton, equal_odds_var_canton, equal_risk_var_region, equal_odds_var_region = EvaluateResults(y_pred, y_test, [("canton", canton_test),("region", region_test)], canton_test, region_test, language, cantonDict, regionDict)
        run_values = [
            micro, macro, positive_accuracy,
            equal_risk[0][1], equal_risk[1][1],
            equal_risk_var_canton, equal_risk_var_region,
            equal_odds[0][1], equal_odds[1][1],
            equal_odds_var_canton, equal_odds_var_region,
        ]
        totals = [total + value for total, value in zip(totals, run_values)]
        if k % 3 == 0:
            filename = language + ";" + model_label
            VisualizeDatasetSklearn(x_val, canton_val, region_val, model.coef_, filename)

    fields = [language, model_label, naming + "Vectorizer"] + [str(total / n_runs) for total in totals]
    # `with` guarantees the handle is closed; the file is only touched once the
    # results exist.
    with open("resultsWithBalancedDatasetEpsilonnew"+str(epsilon)+".txt","a") as file1:
        file1.write(";".join(fields) + "\n")

In [None]:
from transformers import RobertaTokenizer, RobertaModel
from transformers import MT5Model, T5Tokenizer
from transformers import AutoTokenizer, AutoModel
from sklearn.linear_model import LogisticRegression
import diffprivlib as dp
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Experiment driver: for every language and every epsilon, run each feature
# extractor once without and once with differential privacy.
languages = ['de','fr','it']
mt5_tokenizer = T5Tokenizer.from_pretrained("google/mt5-base")
mt5_model = MT5Model.from_pretrained("google/mt5-base").cuda()
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
model = AutoModel.from_pretrained("xlm-roberta-base").cuda()
epsilons =[1,5]
for language in languages:
    # The dataset does not depend on epsilon, so it is loaded (and randomly
    # subsampled) once per language; every epsilon is then evaluated on the
    # same training sample.
    # For the imbalanced dataset use loadAndCleanDatasetImbalanced('fscs', language) instead.
    splits = loadAndCleanDatasetBalanced('fscs', language)
    # The individual names stay defined at module level because code further
    # down reads some of them from module scope.
    x_train, y_train, x_val, y_val, x_test, y_test, canton_test, region_test, canton_val, region_val, canton_train, region_train = splits
    for epsilon in epsilons:
        for vectorizer in (CountVectorizer(), TfidfVectorizer()):
            for privacy_on in (False, True):
                testSklearnModel(vectorizer, privacy_on, *splits, epsilon)

        for privacy_on in (False, True):
            testMt5Model("mt5-base", privacy_on, mt5_tokenizer, mt5_model, *splits, epsilon)

        for privacy_on in (False, True):
            testRobertaModel("xlm-roberta-base", privacy_on, tokenizer, model, *splits, epsilon)
   

In [None]:
import numpy as np
def testMajorityBaseLine():
    """Evaluate a constant predictor (always label 0) per language and append the scores to ``results.txt``.

    For each of 'de', 'fr' and 'it' the balanced dataset is loaded, every test
    example is predicted as 0, and the metrics from ``EvaluateResults`` are
    written as one comma-separated line: language, model, vectorizer,
    f1_micro, f1_macro, equal_risk_canton, equal_risk_region,
    equal_odds_canton, equal_odds_region.
    """
    languages = ['de','fr','it']
    for language in languages:
        x_train_bal, y_train_bal, x_val, y_val, x_test, y_test, canton_test, region_test, canton_val, region_val, canton_train_bal, region_train_bal = loadAndCleanDatasetBalanced('fscs',language)
        y_pred = [0] *len(y_test)
        # Use the group columns loaded for THIS language. The names
        # canton_train / region_train are not defined in this function and
        # would resolve to whatever a previous cell left at module level.
        cantonDict, regionDict = getTotalDist(language, canton_train_bal, canton_test, region_train_bal, region_test)
        micro, macro, equal_risk, equal_odds, positive_accuracy,equal_risk_var_canton,equal_odds_var_canton,equal_risk_var_region,equal_odds_var_region = EvaluateResults(y_pred, y_test, [("canton", canton_test),("region", region_test)], canton_test, region_test, language, cantonDict, regionDict)
        with open("results.txt","a") as file1:
            file1.write(language + "," + "majorityBaseLine"+","+"no_vectorizer"+","+ str(micro) +","+ str(macro) +"," + str(equal_risk[0][1]) + "," + str(equal_risk[1][1]) + "," + str(equal_odds[0][1]) + "," + str(equal_odds[1][1])+ "\n")

testMajorityBaseLine()