In [None]:
import numpy as np
import pandas as pd
import sys

from arrow import Arrow
from bcpp_community import communities
from bcpp_status.models import StatusHistory
from bcpp_subject.models import SubjectVisit, SubjectConsent
from edc_registration.models import RegisteredSubject
from datetime import datetime
from django.db import connection
from edc_constants.constants import YES, NO, NEG, UNK, MALE, FEMALE
from edc_pdutils.model_to_dataframe import ModelToDataframe
from pprint import pprint

# Label constant. NOTE(review): not referenced in the cells shown here — confirm it is still needed.
VIRAL_LOAD = 'Viral Load'
# NOTE(review): not referenced in the cells shown here; presumably a reporting cutoff — confirm.
start_date = datetime(2017, 1, 1)
# strftime/strptime-style patterns: a full timestamp with microseconds, and a date-only form.
date_format = '%Y-%m-%d %H:%M:%S.%f'
export_date_format = '%Y-%m-%d'
# IANA time zone identifier.
local_tz = 'Africa/Gaborone'

# CSV-related settings.
# NOTE(review): in the cells shown here consent_csv_filename appears only in
# commented-out code, and the other three settings are never read — confirm
# whether they are still needed. The path is also user-specific ('~/Documents/...').
load_consent_from_csv = True
consent_csv_filename = '~/Documents/bcpp/referral/consent20170927142648.csv'
split_csv_by_community = True
delimiter = '|'

In [None]:
class SubjectModelToDataframe(ModelToDataframe):
    """ModelToDataframe subclass whose column selection can be overridden.

    ``columns`` is an identity mapping: every column name maps to itself.
    The class-level default covers ``subject_identifier``, ``gender`` and
    ``dob``. How the parent class consumes ``columns`` is not visible in
    this notebook.
    """

    columns = dict(
        subject_identifier='subject_identifier',
        gender='gender',
        dob='dob',
    )

    def __init__(self, columns=None, **kwargs):
        """Optionally replace the default column mapping, then defer to the parent.

        Args:
            columns: iterable of column names. When falsy (``None`` or
                empty) the class-level default mapping stays in effect.
            **kwargs: forwarded unchanged to ``ModelToDataframe.__init__``.
        """
        if columns:
            # Instance attribute shadows the class-level default.
            self.columns = {name: name for name in columns}
        super().__init__(**kwargs)

In [None]:
# Load the data set from the RegisteredSubject model and derive the
# quasi-identifier columns used by the anonymity checks below.
#
# Alternative source (disabled): read a previously exported consent CSV.
# df = pd.read_csv(consent_csv_filename)
# df = df.where((pd.notnull(df)), None)
# df['dob'] = df['dob'].astype('datetime64[ns]')

df = SubjectModelToDataframe(
    model='edc_registration.registeredsubject',
    columns=['subject_identifier', 'gender', 'dob', 'study_site'], decrypt=False).dataframe
df = df.where(pd.notnull(df), None)

# Keep only rows where both dob and study_site are present. The explicit
# .copy() makes the result an independent frame, so the column assignments
# below do not raise SettingWithCopyWarning.
df = df[pd.notnull(df['dob']) & pd.notnull(df['study_site'])].copy()

df['dob'] = df['dob'].astype('datetime64[ns]')
# Vectorised year extraction (no per-row Python lambda).
df['birth_year'] = df['dob'].dt.year
# Floor the year to its decade, e.g. 1987 -> 1980.
df['birth_decade'] = df['birth_year'] // 10 * 10
# Encode gender numerically; values other than MALE/FEMALE become NaN.
df['gender'] = df['gender'].map({MALE: 1, FEMALE: 0})


df.head()


In [None]:
# Define function to evaluate uniqueness of the provided dataset.
def uniqueness(dataframe, pseudo):
    """Return the fraction of rows that are unique on the given key columns.

    Args:
        dataframe: pandas DataFrame to evaluate.
        pseudo: column label or list of labels forming the quasi-identifier.

    Returns:
        float: number of groups containing exactly one row, divided by the
        total number of rows. Returns 0.0 for an empty frame instead of
        raising ZeroDivisionError.

    Note:
        With pandas' default ``dropna=True`` for ``groupby``, rows with a
        missing value in a key column belong to no group. They are never
        counted as unique but still count toward the denominator.
    """
    total_rows = len(dataframe)
    if total_rows == 0:
        return 0.0
    # size() yields one row count per group without building each group's index.
    group_sizes = dataframe.groupby(pseudo).size()
    return float((group_sizes == 1).sum()) / total_rows

In [None]:
# Print the uniqueness score for several candidate key sets, one per line.
for key_columns in (
    ['subject_identifier'],
    ['gender', 'dob'],
    ['gender', 'dob', 'study_site'],
    ['gender', 'birth_year', 'study_site'],
    ['gender', 'birth_decade', 'study_site'],
):
    print(uniqueness(df, key_columns))

In [None]:
# Define function to evaluate k-anonymity of the provided data set.
def k_anonymity(dataframe, pseudo):
    """Return k, the size of the smallest group sharing the same key values.

    Args:
        dataframe: pandas DataFrame to evaluate.
        pseudo: column label or list of labels forming the quasi-identifier.

    Returns:
        int: minimum number of rows in any group, or 0 when there are no
        groups (for example, an empty frame).

    Group sizes come from ``size()`` rather than ``count()``. ``count()``
    tallies non-null values per column, so a null in an unrelated column
    would understate k, and it yields no columns at all when every column
    is part of the key.
    """
    group_sizes = dataframe.groupby(pseudo).size()
    if group_sizes.empty:
        return 0
    return int(group_sizes.min())

In [None]:
# Print k for the same site/gender key with dob at three granularities.
for key_columns in (
    ['gender', 'dob', 'study_site'],
    ['gender', 'birth_year', 'study_site'],
    ['gender', 'birth_decade', 'study_site'],
):
    print(k_anonymity(df, key_columns))

In [None]:
# Per-column mean of the non-null counts within each (gender, dob, study_site) group.
(
    df
    .groupby(by=['gender', 'dob', 'study_site'])
    .count()
    .mean()
)

In [None]:
# Per-column mean of the non-null counts within each (gender, birth_year, study_site) group.
(
    df
    .groupby(by=['gender', 'birth_year', 'study_site'])
    .count()
    .mean()
)

In [None]:
# Per-column mean of the non-null counts within each (gender, birth_decade, study_site) group.
(
    df
    .groupby(by=['gender', 'birth_decade', 'study_site'])
    .count()
    .mean()
)

In [None]:
# Suppress small groups: drop every row whose (gender, birth_year, study_site)
# group has 5 or fewer members, then report k before and after.
quasi_identifiers = ['gender', 'birth_year', 'study_site']
print(k_anonymity(df, quasi_identifiers))
grouped = df.groupby(quasi_identifiers)
# Strictly more than 5, so each surviving group has at least 6 rows.
df_filtered = grouped.filter(lambda x: len(x) > 5)
# Pass the parts as separate arguments; wrapping them in an extra pair of
# parentheses would print a tuple repr instead of a readable message.
print('Reducing size:', len(df), '> ', len(df_filtered))
print('K-anonymity after suppression:', k_anonymity(df_filtered, quasi_identifiers))

In [None]:
# Summary statistics of birth_year before suppression.
df.loc[:, 'birth_year'].describe()

In [None]:
# Summary statistics of birth_year after small-group suppression.
df_filtered.loc[:, 'birth_year'].describe()

In [None]:
# Summary statistics of the numerically encoded gender column before suppression.
df.loc[:, 'gender'].describe()

In [None]:
# Summary statistics of the numerically encoded gender column after suppression.
df_filtered.loc[:, 'gender'].describe()