# Data Anonymizer

This notebook anonymizes the log files. It also processes the grades so that they can be associated with the anonymized log files.

### Imports

In [1]:
import os
import pandas as pd
import random

### Load raw data

We load the raw data from `.csv` files. 

In [2]:
# Dataset identifier; used as the prefix of every input and output file name
subject = 'RTWIBNet_W22'
# Directory with the raw (non-anonymized) files; the lookup tables are stored here as well
input_filepath = '/GradePredictionData/data'
# Directory that receives the anonymized files (in a sub-directory named after `subject`)
output_filepath = '/GradePredictionData/data-registry'

# other Params
# IP addresses starting with this prefix are collapsed to '<prefix>.0.0'
# instead of being replaced by an individual code
HSRT_IP_start = '134.103'

In [3]:
# Raw input files are named '<subject>_log.csv' and '<subject>_grades.csv'
log_filepath = os.path.join(input_filepath, f'{subject}_log.csv')
grade_filepath = os.path.join(input_filepath, f'{subject}_grades.csv')

In [4]:
# The two raw files use different delimiters: the log file is comma-separated,
# the grades file is semicolon-separated.
logfile = pd.read_csv(log_filepath, sep=',')
grades = pd.read_csv(grade_filepath, sep=';')

### Some Filtering

In [5]:
# Drop the notebook author's own log entries ...
own_rows = logfile.index[logfile['Vollständiger Name'] == 'Christian Decker']
logfile.drop(index=own_rows, inplace=True)
# ... and afterwards every row that has a missing value in any column
logfile.dropna(axis=0, how='any', inplace=True)

In [6]:
# Assemble '<vorname> <nachname>' as the full name of each grade entry
first_names = grades['vorname']
last_names = grades['nachname']
grades['Vollständiger Name'] = first_names + ' ' + last_names

### Create Name Codes

In [7]:
# Every distinct name found in either of the two name columns of the log
logfile_names = set(logfile['Vollständiger Name']) | set(logfile['Betroffene/r Nutzer/in'])
# Every distinct name found in the grades
grades_names = set(grades['Vollständiger Name'])
# Merge both sources and shuffle them, so that the row position (which becomes
# the code later on) is assigned in random order
name_list = list(logfile_names | grades_names)
random.shuffle(name_list)
all_names = pd.DataFrame(name_list, columns=['Vollständiger Name'])

In [8]:
# The code of a name is its row index, written as zero-padded upper-case hex
# with at least four digits
all_names['name_code'] = [f'{position:0>4X}' for position in all_names.index]

### Create IP Address Codes

In [9]:
# Distinct IP addresses of the log, in random order
ip_list = list(set(logfile['IP-Adresse']))
random.shuffle(ip_list)
all_ips = pd.DataFrame(ip_list, columns=['IP-Adresse'])

In [10]:
def ip_anonymizer(ip, idx, prefix=None):
    """Return the anonymized replacement for a single IP address.

    Addresses inside the network given by ``prefix`` are collapsed to
    ``prefix + '.0.0'``: they stay recognizable as belonging to that network
    but no longer identify a host. Every other address is replaced by ``idx``
    written as zero-padded, upper-case hexadecimal with at least four digits.

    Parameters
    ----------
    ip : str
        The IP address to anonymize.
    idx : int
        Number that becomes the code of an address outside the network.
    prefix : str, optional
        Leading octets of the network to keep, without a trailing dot.
        Defaults to the notebook-level ``HSRT_IP_start``.

    Returns
    -------
    str
        ``prefix + '.0.0'`` or the hexadecimal code.
    """
    if prefix is None:
        prefix = HSRT_IP_start
    # Compare on an octet boundary: a plain startswith(prefix) would let a
    # prefix such as '134.10' also match '134.103.x.x'.
    if ip == prefix or ip.startswith(prefix + '.'):
        return prefix + '.0.0'
    return f'{idx:0>4X}'

In [11]:
# Derive the code of every IP address from the address itself and its row index
all_ips['ip_code'] = [
    ip_anonymizer(str(address), position)
    for position, address in all_ips['IP-Adresse'].items()
]

### Anonymize Names and IPs

In [12]:
# Lookup functions: provide name or IP and retrieve anonymized code
def anon_name(name):
    """Return the ``name_code`` stored in ``all_names`` for ``name``.

    The first matching row is used. Raises ``IndexError`` if ``name`` does not
    occur in the lookup table.
    """
    codes = all_names.loc[all_names['Vollständiger Name'] == name, 'name_code']
    return codes.iloc[0]

def anon_ip(ip):
    """Return the ``ip_code`` stored in ``all_ips`` for ``ip``.

    The first matching row is used. Raises ``IndexError`` if ``ip`` does not
    occur in the lookup table.
    """
    return all_ips.loc[all_ips['IP-Adresse'] == ip, 'ip_code'].iloc[0]

In [13]:
# anonymize logfile
# Build each lookup once as a dictionary. Scanning the lookup frame again for
# every log row (row-wise apply + boolean filter) costs rows x lookup-size
# comparisons per column, which gets slow for large logs.
name_to_code = dict(zip(all_names['Vollständiger Name'], all_names['name_code']))
ip_to_code = dict(zip(all_ips['IP-Adresse'], all_ips['ip_code']))

# Both name columns share the same lookup. A value that is missing from the
# lookup raises KeyError, so no entry is ever written out un-anonymized.
for name_column in ('Vollständiger Name', 'Betroffene/r Nutzer/in'):
    logfile[name_column] = logfile[name_column].map(lambda name: name_to_code[name])

# IP addresses are looked up by their string form
logfile['IP-Adresse'] = logfile['IP-Adresse'].map(lambda ip: ip_to_code[str(ip)])

In [14]:
# Replace each full name by its code, then keep only the code and the grade
# column, so that no other column of the raw grades file is carried over
grades['Vollständiger Name'] = grades['Vollständiger Name'].map(anon_name)
grades = grades.loc[:, ['Vollständiger Name', 'bewertung']]

### Store Anonymized Data as `.csv` File

In [15]:
# Anonymized files are written to '<output_filepath>/<subject>/'
logfile_output = os.path.join(output_filepath, subject, f'{subject}_log.csv')
grades_output = os.path.join(output_filepath, subject, f'{subject}_grades.csv')

In [16]:
# Create the output directory (including missing parents) if it does not exist.
# os.makedirs is used instead of a shell `mkdir -p`, so this also works on
# systems without that command and for paths containing spaces.
data_reg_dir = os.path.dirname(logfile_output)
os.makedirs(data_reg_dir, exist_ok=True)

In [17]:
# Write both anonymized tables as comma-separated files without the index column
for frame, target in ((logfile, logfile_output), (grades, grades_output)):
    frame.to_csv(target, sep=',', index=False)

### Store Lookup Tables for Names and IPs

Storing them is useful later, when we want to keep the same lookup tables while processing updated data.

In [18]:
# The lookup tables are stored next to the raw data in `input_filepath`
lookup_names_output = os.path.join(input_filepath, f'{subject}_names.csv')
lookup_ips_output = os.path.join(input_filepath, f'{subject}_ips.csv')

In [19]:
# NOTE: these tables map original names / IP addresses to their codes and
# therefore still contain the original values.
for table, target in ((all_names, lookup_names_output), (all_ips, lookup_ips_output)):
    table.to_csv(target, sep=',', index=False)