# Datengenerator für Szenario 2 und Bewertung
    Szenario des Socialscores mit 7 Attributen (Name, Alter, Politische Orientierung, Bildungsabschluss, Soziales, Wohnlage, CO2-Fußabdruck)
    Dabei können Alter, Politische Orientierung, Bildungsabschluss, Soziales, Wohnlage und CO2-Fußabdruck durch Bias beeinflusst werden.

In [1]:
import numpy as np
from faker import Faker,Factory
import pandas as pd
import os
from datetime import datetime
import random

#Shared Faker generator created for the German ("de_DE") locale; create_fake_data seeds it and draws the first names from it
fake = Factory.create("de_DE")

#### Generieren eines ziemlich zufälligen Seeds
    Generieren eines Seeds, beruhend auf den Angaben der aktuellen Zeit.
    So ist bei jeder Generierung der Daten ein anderer Seed verfügbar, und es entstehen "wirkliche" Zufallswerte.

In [2]:
def generate_seed(now=None):
    """Derive a seed from the components of a timestamp.

    The product of day, minute, second, month, year and hour is divided by
    the microsecond part. If that yields 0 (minute, second or hour is 0), a
    simpler replacement ``day * (minute + 1)`` is used instead.

    Args:
        now: Optional ``datetime`` to derive the seed from. Defaults to the
            current local time (``datetime.now()``).

    Returns:
        A non-negative number (int or float) smaller than 2**32, so that its
        integer part is always accepted by ``np.random.seed``.
    """
    if now is None:
        now = datetime.now()
    #Calculate seed number from a few time data
    product = now.day * now.minute * now.second * now.month * now.year * now.hour
    #microsecond can be exactly 0; dividing by it would raise ZeroDivisionError
    seed = product / now.microsecond if now.microsecond else product
    #If 0 is the result, a "simpler" replacement seed is generated.
    if seed <= 0:
        seed = now.day * (now.minute + 1)
    #np.random.seed only accepts 0 .. 2**32 - 1; wrap the rare larger values into that range
    return seed % 2**32

#### Eine Funktion zum erstellen der synthetischen Daten.
    
    Die Parameter num und seed geben dabei zum einen die Anzahl an Datensätzen an und zum anderen den Seed der "künstlichen" Wahrscheinlichkeit.
    Es wird pro Datensatz das Alter (20-79) und darauf basierend der Abschluss ("Ausbildung", "Fachschulabschluss", "Bachelor", "Master", "Diplom", "Promotion", "ohne") bestimmt, zusätzlich noch die politische Orientierung (Links, Mitte, Rechts).
    Basierend auf der politischen Orientierung wird das soziale Engagement (0,1,2,3) bestimmt.
    Die Wohnlage ("Großstadt", "Kleinstadt", "Vorort", "Ländlich") wird zufällig mit einer festen Verteilung bestimmt.
    Basierend auf der Wohnlage und dem sozialen Engagement wird der CO2-Fußabdruck (4-12) in 0,5er-Schritten bestimmt.
    Zum Schluss wird dann noch ein zufälliger Vorname gewählt.

In [3]:
#Inclusive age brackets, one per outcome (0, 1, 2) of the age-group draw
_AGE_BRACKETS = ((20, 39), (40, 59), (60, 79))
#Degrees in the order used by every probability row of _GRAD_PROBS_BY_MAX_AGE
_GRADS = ["Ausbildung", "Fachschulabschluss", "Bachelor", "Master", "Diplom", "Promotion", "ohne"]
#(inclusive upper age bound, probability per degree); the first bound >= age applies
_GRAD_PROBS_BY_MAX_AGE = (
    (29, [0.425, 0.081, 0.117, 0.071, 0.059, 0.003, 0.244]),
    (39, [0.452, 0.091, 0.059, 0.052, 0.157, 0.016, 0.173]),
    (49, [0.516, 0.097, 0.014, 0.011, 0.179, 0.017, 0.166]),
    (59, [0.559, 0.113, 0.005, 0.003, 0.159, 0.014, 0.147]),
    (float("inf"), [0.544, 0.096, 0.002, 0.001, 0.132, 0.012, 0.213]),
)
#Probability of the social engagement levels 0-3 per political orientation
_SOCIAL_PROBS = {
    "Mitte": [0.6, 0.2, 0.15, 0.05],
    "Links": [0.4, 0.25, 0.25, 0.1],
    "Rechts": [0.75, 0.15, 0.09, 0.01],
}
#Residential areas; the list position feeds the CO2 shift below
_LOCATIONS = ["Großstadt", "Kleinstadt", "Vorort", "Ländlich"]
#Possible CO2 footprints in steps of 0.5
_CO2_VALUES = [4, 4.5, 5, 5.5, 6, 6.5, 7, 7.5, 8, 8.5, 9, 9.5, 10, 10.5, 11, 11.5, 12]
#CO2 distribution per shift = (position in _LOCATIONS) - (social level).
#Shift 0 is the symmetric base curve; every step moves it by one CO2 value
#(positive = towards higher footprints) and lumps the cut-off tail into the outermost value.
_CO2_PROBS_BY_SHIFT = {
    -3: [0.035, 0.044, 0.078, 0.116, 0.147, 0.160, 0.147, 0.116, 0.078, 0.044, 0.022, 0.009, 0.003, 0.001, 0, 0, 0],
    -2: [0.013, 0.022, 0.044, 0.078, 0.116, 0.147, 0.160, 0.147, 0.116, 0.078, 0.044, 0.022, 0.009, 0.003, 0.001, 0, 0],
    -1: [0.004, 0.009, 0.022, 0.044, 0.078, 0.116, 0.147, 0.160, 0.147, 0.116, 0.078, 0.044, 0.022, 0.009, 0.003, 0.001, 0],
    0: [0.001, 0.003, 0.009, 0.022, 0.044, 0.078, 0.116, 0.147, 0.160, 0.147, 0.116, 0.078, 0.044, 0.022, 0.009, 0.003, 0.001],
    1: [0, 0.001, 0.003, 0.009, 0.022, 0.044, 0.078, 0.116, 0.147, 0.160, 0.147, 0.116, 0.078, 0.044, 0.022, 0.009, 0.004],
    2: [0, 0, 0.001, 0.003, 0.009, 0.022, 0.044, 0.078, 0.116, 0.147, 0.160, 0.147, 0.116, 0.078, 0.044, 0.022, 0.013],
    3: [0, 0, 0, 0.001, 0.003, 0.009, 0.022, 0.044, 0.078, 0.116, 0.147, 0.160, 0.147, 0.116, 0.078, 0.044, 0.035],
}


def create_fake_data(num = 10, seed = 123):
    """Generate `num` synthetic person records.

    Per record the following is drawn, in this order: age group and age,
    political orientation, degree (distribution depends on the age), social
    engagement (depends on the political orientation), residential area and
    CO2 footprint (depends on residential area and social engagement).
    Finally a first name is taken from the module-level Faker generator.

    Args:
        num: Number of records to create.
        seed: Seed for NumPy's global random state, the Faker generator and
            the private age generator, so equal seeds give equal records.

    Returns:
        A list with one dict per person, using the keys "Name", "Alter",
        "Politische Orientierung", "Bildungsabschluss", "Soziales",
        "Wohnlage" and "CO2-Fußabdruck".
    """
    #Setting the seed for the probability functions
    np.random.seed(seed)
    fake.seed_instance(seed)
    #The exact age is drawn with the stdlib generator, which the two seeds above do not reach.
    #A private seeded instance makes the ages reproducible without reseeding the global random module.
    age_rng = random.Random(seed)
    #Defining the Output Array
    output = []
    #Loop over the number of personal data to be created
    for _ in range(num):
        age_part = np.random.choice([0,1,2], p=[0.33,0.38,0.29])
        youngest, oldest = _AGE_BRACKETS[age_part]
        age = age_rng.randint(youngest, oldest)
        politics = np.random.choice(["Links", "Mitte", "Rechts"], p=[0.1545, 0.722, 0.1235])
        grad_probs = next(probs for bound, probs in _GRAD_PROBS_BY_MAX_AGE if age <= bound)
        grad = np.random.choice(_GRADS, p=grad_probs)
        social = np.random.choice([0,1,2,3], p=_SOCIAL_PROBS[politics])
        location = np.random.choice(_LOCATIONS, p=[0.55,0.25,0.05,0.15])
        #More rural areas push the footprint up, more social engagement pulls it down
        shift = _LOCATIONS.index(location) - int(social)
        co2 = np.random.choice(_CO2_VALUES, p=_CO2_PROBS_BY_SHIFT[shift])
        output.append(
            {
                #Name of the person
                "Name": fake.first_name(),
                "Alter": age,
                "Politische Orientierung": politics,
                "Bildungsabschluss": grad,
                "Soziales": social,
                "Wohnlage": location,
                "CO2-Fußabdruck": co2
            }
        )
    #Return of the output array with one person per entry
    return output

##### generate_data(data_count):
    Parameter: data_count gibt die Anzahl der zu generierenden Daten an.
    Zuerst wird ein Seed für den Random Faktor erzeugt, danach die synthetischen Daten erstellt und als Dataframe abgespeichert.

In [4]:
def generate_data(data_count):
    """Create `data_count` synthetic person records and return them as a DataFrame.

    A new seed is obtained from generate_seed() for every call and passed,
    truncated to an integer, to create_fake_data().
    """
    int_seed = int(generate_seed())
    records = create_fake_data(data_count, int_seed)
    return pd.DataFrame(records)

#### Funktion zum Erstellen der Regeln nach welchen ein Bewerter bewerten soll.
    Dabei wird jedem Attribut und dessen Ausprägung ein Wert zugewiesen, mit welchem dieses später gewichtet wird. So wird zum Beispiel eine Ausbildung mit +20 und kein Abschluss mit -120 bewertet.

In [5]:
#Profile attributes that are not part of the rating rules: a random name and the age range
additional = {"Name": "Random", "Alter": "20-79"}
#Profile attributes that enter the rating, each with its possible values.
#The position of a value in its list is the position of its weight in the rules.
influential = {
    "Politische Orientierung": ["Links", "Mitte", "Rechts"],
    "Bildungsabschluss": [
        "Ausbildung",
        "Fachschulabschluss",
        "Bachelor",
        "Master",
        "Diplom",
        "Promotion",
        "ohne",
    ],
    "Soziales": [0, 1, 2, 3],
    "Wohnlage": ["Großstadt", "Kleinstadt", "Vorort", "Ländlich"],
    "CO2-Fußabdruck": [4, 5, 6, 7, 8, 9, 10, 11, 12],
}
#Attributes a bias can be defined on: the age range plus an independent copy of every influential attribute
bias = {"Alter": additional["Alter"]}
bias.update((attribute, list(values)) for attribute, values in influential.items())
def create_Rules():
    """Return the rating rules: one list of weights per influential attribute.

    The weight at position i belongs to the attribute value at position i of
    the corresponding `influential` list (e.g. "Ausbildung" -> 20,
    "ohne" -> -120). Every call builds a new dict with new lists.
    """
    weights_per_attribute = (
        ("Politische Orientierung", [-50, 50, -50]),
        ("Bildungsabschluss", [20, 40, 60, 80, 100, 120, -120]),
        ("Soziales", [-70, 0, 40, 100]),
        ("Wohnlage", [-30, 20, 20, -30]),
        ("CO2-Fußabdruck", [110, 70, 40, 10, 0, -20, -40, -70, -110]),
    )
    return dict(weights_per_attribute)

#### Klasse für das erstellen eines Bewerters 
    Beinhaltet Methoden zum einen zum Erstellen des Bewerter-Objekts und zum anderen die Methode rate, um einen Antrag nach den Regeln zu bewerten. Zusätzlich hat jeder Bewerter einen Bias oder nicht. In der Methode rate wird beginnend bei 1000 der endgültige Socialscore ermittelt. Dabei wird für jedes Attribut und dessen Ausprägung der in den Regeln definierte Wert dazugerechnet. Um diesen Wert noch etwas zu verschleiern und keine harten Werte zu haben, wird noch ein kleiner zufälliger Anteil (bis zu 15 % des Regelwerts) dazugerechnet. Ein Bias beeinflusst den Socialscore künstlich zusätzlich um einen zuvor festgelegten harten Wert. Zum Schluss bleibt dann ein Socialscore zwischen 600 und 1400.

In [6]:
class Evaluator:
    """Rates person profiles by a rule set, optionally applying a bias penalty.

    Attributes:
        rules: Mapping attribute name -> list of weights (see create_Rules).
        bias: True if this evaluator discriminates, False otherwise.
        bias_neg: Number of points subtracted for every matching bias.
    """

    #Creating a evaluator with its own rules and a bias or not
    def __init__(self, rules, bias, bias_neg=200):
        self.rules = rules
        self.bias = bias
        self.bias_neg = bias_neg

    @staticmethod
    def _jitter(weight):
        """Return a random offset between 0 and 15 % of `weight`, carrying the sign of `weight`."""
        #round(x, 0) returns a float, but random.randint needs real integers
        #(a float argument raises TypeError on Python 3.12+), hence the int()
        limit = int(round(weight * 0.15, 0))
        if limit < 0:
            return -random.randint(0, -limit)
        return random.randint(0, limit)

    def _penalize(self, score):
        """Subtract bias_neg from `score`; results below 600 become a random value in 600-630."""
        if score - self.bias_neg < 600:
            rate_part = np.random.choice([0,1,2], p=[0.4,0.35,0.25])
            if rate_part == 0:
                return random.randint(600,610)
            if rate_part == 1:
                return random.randint(611,620)
            return random.randint(621,630)
        return score - self.bias_neg

    #Function to evaluate a submitted person with or without bias
    def rate(self, influential, person, bias):
        """Compute the social score of `person` and store it under "Bewertung".

        Args:
            influential: Mapping attribute name -> list of possible values;
                the position of the person's value selects the weight in
                ``self.rules``.
            person: Mapping-like profile (dict or pandas Series). It is
                modified in place.
            bias: Mapping attribute name -> discriminated values. "Alter" is
                an inclusive range string such as "20-30"; every other entry
                is a container of values. Ignored unless ``self.bias`` is true.

        Returns:
            `person`, with the integer score (600-1400) in "Bewertung".

        Raises:
            ValueError: If a rated value is not listed in `influential`.
        """
        score = 1000
        for key, weights in self.rules.items():
            value_of_key = person[key]
            if key == "CO2-Fußabdruck":
                #Footprints come in 0.5 steps, the rules only know whole numbers
                value_of_key = int(value_of_key)
            weight = weights[influential[key].index(value_of_key)]
            score += weight + self._jitter(weight)
        if self.bias:
            for attribute, target in bias.items():
                if attribute == "Alter":
                    bounds = target.split('-')
                    under = int(bounds[0])
                    upper = int(bounds[1])
                    matches = under <= person[attribute] <= upper
                else:
                    #NOTE(review): the CO2 value is compared untruncated here, so 4.5 does not
                    #match a bias entry 4 although it is rated as 4 above - confirm this is intended
                    matches = person[attribute] in target
                if matches:
                    #Every matching bias attribute costs the penalty once more
                    score = self._penalize(score)
        #Keep the score inside 600-1400; too high scores are capped at 1400
        #instead of being reset to the worst value 600
        score = max(600, min(1400, score))
        person["Bewertung"] = int(score)
        return person

#### Methoden für den Gesamtablauf
##### work(df, bias, evaluator_count, bias_evaluator, bias_neg):
    Parameter: df ist der Datensatz, welcher bewertet werden soll, bias ist der Bias, welcher ausgeführt werden soll, evaluator_count ist die Anzahl der Bewerter, welche die Anträge bewerten, bias_evaluator ist die Anzahl an Bewertern, welche diskriminieren. bias_neg ist der Wert, um welchen ein Antrag durch einen Bias verschlechtert wird.
    Zuerst werden die bestehenden Werte für die Anträge definiert und die Regeln erstellt. Danach wird die gegebene Anzahl an Bewertern erstellt. Im Anschluss wird die in bias_evaluator angegebene Anzahl der Bewerter zu Bias-Bewertern verwandelt.
    Zu guter Letzt werden die gegebenen Anträge bewertet und mit der Socialscore-Bewertung zurückgegeben.

In [7]:
def work(df, bias, evaluator_count, bias_evaluator, bias_neg):
    """Rate every person in `df` with a randomly chosen evaluator.

    Args:
        df: DataFrame with one person per row; it must contain the rated
            columns and every column named in `bias`.
        bias: Mapping attribute name -> discriminated values, handed to
            Evaluator.rate for every person.
        evaluator_count: Number of evaluators to create.
        bias_evaluator: Number of those evaluators that apply the bias.
        bias_neg: Penalty used by the biased evaluators.

    Returns:
        A new DataFrame with the rated copies of the rows, including the
        column "Bewertung". `df` itself is not modified.

    Raises:
        ValueError: If `bias_evaluator` is larger than `evaluator_count`.
    """
    #Fail early with a clear message instead of an IndexError while flagging the evaluators
    if bias_evaluator > evaluator_count:
        raise ValueError(
            "bias_evaluator (%d) must not exceed evaluator_count (%d)"
            % (bias_evaluator, evaluator_count)
        )
    #Values in the personprofile which influence the evaluation
    influential = {
        "Politische Orientierung": ["Links", "Mitte", "Rechts"],
        "Bildungsabschluss": ["Ausbildung", "Fachschulabschluss", "Bachelor", "Master", "Diplom", "Promotion","ohne"],
        "Soziales": [0,1,2,3],
        "Wohnlage": ["Großstadt","Kleinstadt","Vorort","Ländlich"],
        "CO2-Fußabdruck": [4,5,6,7,8,9,10,11,12]
    }
    #Create the rules and save them
    rules = create_Rules()
    #Create the number of evaluators
    evaluators = [Evaluator(rules=rules, bias=False) for _ in range(evaluator_count)]
    #Convert the number of evaluators specified as parameters to evaluators with a bias
    for position in range(bias_evaluator):
        evaluators[position].bias = True
        evaluators[position].bias_neg = bias_neg
    #The final evaluated persons
    finished_persons = []
    #For each person in the dataframe
    for _, row in df.iterrows():
        #Determine a random evaluator from all the evaluators
        chosen = evaluators[random.randint(0, evaluator_count-1)]
        #Have a copy of the personprofile evaluated, so that df stays untouched
        finished_persons.append(chosen.rate(influential, row.copy(), bias))
    #Save the finished personprofiles as a data frame and return them
    return pd.DataFrame(finished_persons)

#### Finale Zelle zum ausführen des Ablaufs von Szenario 2
    Zuerst wird der Bias definiert, welcher in den Daten negativ zu finden sein soll. Als Beispiel: Politische Orientierung: Links bedeutet, dass es Bewerter gibt, welche Personen mit der politischen Orientierung Links diskriminieren und den Antrag negativ durch den bias_neg-Wert beeinflussen.
    Für das Alter ist ein Sonderbeispiel gegeben, da dieses als Intervall angegeben werden muss.
    Danach wird die angegebene Anzahl an Daten generiert.
    Mit den Daten wird dann der fertig bewertete Datensatz durch die Methode work erstellt.
    Zum Schluss werden die Ursprungsdaten und die finalen Daten als CSV-Dateien abgespeichert.

In [17]:
#--- Parameters that may be adjusted ---
#Template for an age bias: the range is written as a "from-to" string, both ends included
age_sample = {"Alter": "20-30"}
#Attribute values which the biased evaluators rate down
bias = {
    "Alter": "20-30",
    "Bildungsabschluss": ["Ausbildung", "ohne"],
}
#How many person records are generated
datasets = 10000
#How many evaluators rate the records
evaluatorcount = 10
#How many of these evaluators apply the bias
bias_evaluator = 4
#Strength of the bias: points subtracted per matching bias attribute
negativ_bias_impact = 200

#--- Pipeline, leave unchanged ---
#Generate the raw records, rate them, then store both tables as CSV files
data = generate_data(datasets)
finished = work(
    df=data,
    bias=bias,
    evaluator_count=evaluatorcount,
    bias_evaluator=bias_evaluator,
    bias_neg=negativ_bias_impact,
)
data.to_csv(
    "Daten_Szenario2.csv",
    sep=';',
    encoding='utf-8',
    index=False,
)
finished.to_csv(
    "Daten_Bewertet_Szenario2.csv",
    sep=';',
    encoding='utf-8',
    index=False,
)