In [7]:
import pandas as pd
import numpy as np

In [8]:
#original_data = pd.read_csv("original_file.csv")

In [9]:
#anon_data = pd.read_csv(r"anonimized_file.csv")

In [10]:
def hour_utility(df_anon, df_original):
    """Score how well the hour of day survives anonymisation.

    Every row is worth one point and loses 1/24 of a point for each hour of
    absolute difference between the anonymised and the original hour of
    ``DateTime``. The points are summed and divided by the number of rows
    of ``df_original``.

    Parameters
    ----------
    df_anon, df_original : pandas.DataFrame
        Frames exposing a ``DateTime`` column usable with the ``.dt``
        accessor. Rows are paired through their index labels.

    Returns
    -------
    The summed per-row points divided by ``len(df_original)``.
    """
    hours = pd.DataFrame({
        'df_hour': df_anon['DateTime'].dt.hour,
        'df_origin_hour': df_original['DateTime'].dt.hour,
    })

    # One point per row, minus 1/24 of a point per hour of difference.
    hour_gap = (hours['df_hour'] - hours['df_origin_hour']).abs()
    points = 1 - hour_gap / 24

    # Final score: total points averaged over the original rows.
    return points.sum() / len(df_original)

In [11]:
def date_utility(df_anon, df_original):
    """Score how well the calendar date survives anonymisation.

    Each row is worth one point. Inside the same ISO week a row loses 1/7 of
    a point per day of difference between the original and the anonymised
    day of the week. A row moved to a different ISO week is worth 0.

    The week comparison uses the ISO year together with the ISO week number,
    so that week 10 of one year is never mistaken for week 10 of another.

    Parameters
    ----------
    df_anon, df_original : pandas.DataFrame
        Frames exposing a ``DateTime`` column usable with the ``.dt``
        accessor. Rows are paired through their index labels.

    Returns
    -------
    Mean per-row utility, rounded to 3 decimals.
    """
    iso_orig = df_original['DateTime'].dt.isocalendar()
    iso_anon = df_anon['DateTime'].dt.isocalendar()

    df_date_utility = pd.DataFrame({
        'DayOfTheWeek_orig': df_original['DateTime'].dt.dayofweek,
        'DayOfTheWeek_anon': df_anon['DateTime'].dt.dayofweek,
        'Year_orig': iso_orig['year'],
        'Year_anon': iso_anon['year'],
        'Week_orig': iso_orig['week'],
        'Week_anon': iso_anon['week'],
    })
    df_date_utility['DiffDate'] = (
        df_date_utility['DayOfTheWeek_orig'] - df_date_utility['DayOfTheWeek_anon']
    ).abs()

    # Any change of week must yield a utility of 0; a week is identified by
    # (ISO year, ISO week), not by the week number alone.
    same_week = (
        (df_date_utility['Year_orig'] == df_date_utility['Year_anon'])
        & (df_date_utility['Week_orig'] == df_date_utility['Week_anon'])
    )
    df_date_utility.loc[~same_week, 'DiffDate'] = 7
    df_date_utility['date_util'] = 1 - df_date_utility['DiffDate'] / 7

    score = df_date_utility['date_util'].mean().round(3)
    return score

In [12]:
def distance_utility(df_anon, df_original):
    """Score how close the anonymised positions stay to the original ones.

    Rows are paired by position (both indexes are reset before pairing) and
    the haversine distance in kilometres is computed for every pair from the
    ``lat`` / ``lon`` columns, which are converted from degrees to radians.
    The score is the inverse of the mean distance, capped at 1 so that it
    stays comparable with the exact-match score of 1.

    Parameters
    ----------
    df_anon, df_original : pandas.DataFrame
        Frames with ``lat`` and ``lon`` columns.

    Returns
    -------
    ``1`` when the mean distance rounds to 0 km at 3 decimals, otherwise
    ``min(1, 1 / mean_km)`` rounded to 3 decimals. NaN when the mean
    distance itself is NaN (for instance when there is no usable row).
    """
    anon = df_anon.rename(
        columns={'ID': 'ID_ano', 'DateTime': 'DateTime_ano', 'lat': 'lat_ano', 'lon': 'lon_ano'}
    ).reset_index(drop=True)
    df = pd.concat([df_original.reset_index(drop=True), anon], axis=1)

    # Haversine distance
    R = 6371.009  # in km
    lat, lon = np.radians(df['lat']), np.radians(df['lon'])
    lat_ano, lon_ano = np.radians(df['lat_ano']), np.radians(df['lon_ano'])
    a = (
        np.sin((lat - lat_ano) / 2) ** 2
        + np.sin((lon - lon_ano) / 2) ** 2 * np.cos(lat) * np.cos(lat_ano)
    )
    # Floating-point rounding can push `a` marginally outside [0, 1] (near
    # antipodal points), which would make sqrt/arcsin return NaN.
    a = a.clip(lower=0, upper=1)
    distance_km = 2 * R * np.arcsin(np.sqrt(a))

    mean_km = distance_km.mean()
    if np.isnan(mean_km):
        return float('nan')
    if round(mean_km, 3) == 0:
        return 1
    # Without the cap a sub-kilometre mean error would score above 1, i.e.
    # better than a perfect match.
    return round(min(1.0, 1.0 / mean_km), 3)


In [13]:
def _prepare_poi_frame(df, size, values):
    """Tag each row of ``df`` (modified in place) with its visit key and the time elapsed since that visit began."""
    df['lat'] = df['lat'].round(size)
    df['lon'] = df['lon'].round(size)
    df['Hour'] = df['DateTime'].dt.hour
    df['Day'] = df['DateTime'].dt.day
    df['Week'] = df['DateTime'].dt.isocalendar().week
    df['DayOfTheWeek'] = df['DateTime'].dt.dayofweek
    df.sort_values(by=['ID', 'DateTime'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    # Rows matching none of the masks fall back to 'RegularTime'.
    df['DatetimeIndex'] = np.select(conditions(df), values, 'RegularTime')

    # A visit is a run of consecutive rows sharing the same key.
    df['Index_of_POI'] = df['ID'] + '-' + df['Day'].astype('string') + '-' + df['Week'].astype('string') + '-' + df['lat'].astype('string') + '-' + df['lon'].astype('string') + '-' + df['DatetimeIndex'].astype('string')
    df['Index_of_POI_shifted_backward'] = df['Index_of_POI'].shift(-1)
    df['Index_of_POI_shifted_forward'] = df['Index_of_POI'].shift(+1)
    # The shifts leave a hole at each end; '0' never equals a real key, so
    # the first row always opens a visit and the last row always closes one.
    df.loc[0, 'Index_of_POI_shifted_forward'] = '0'
    df.loc[len(df) - 1, 'Index_of_POI_shifted_backward'] = '0'

    # start_time is only set on the first row of a visit, then propagated
    # forward to the remaining rows of that visit.
    df['start_time'] = df.loc[~(df['Index_of_POI'] == df['Index_of_POI_shifted_forward']), 'DateTime']
    # DataFrame.ffill() replaces the deprecated fillna(method="ffill").
    df.ffill(inplace=True)
    df['time_spent'] = df['DateTime'] - df['start_time']
    return df


def _time_per_poi(df):
    """Sum ``time_spent`` of the last row of every visit per (ID, lat, lon, Week, DatetimeIndex)."""
    keys = ['ID', 'lat', 'lon', 'Week', 'DatetimeIndex']
    visit_ends = df.loc[~(df['Index_of_POI_shifted_backward'] == df['Index_of_POI']), keys + ['time_spent']]
    return visit_ends.groupby(by=keys).sum().reset_index()


def POI_utility(df_anon, df_original):
    """Score how well the time spent at points of interest (POI) is preserved.

    Coordinates are rounded to 2 decimals and every row is labelled NIGHT,
    WORK, WEEKEND or RegularTime using the masks from ``conditions``.
    Consecutive rows sharing the same ID, day, week, rounded position and
    label form one visit. For every (ID, Week, label) of the original data
    the position with the largest total visit time is kept as the POI, and
    RegularTime entries are discarded. The time spent by the anonymised data
    at the same (ID, position, Week, label) is then compared with it.

    Parameters
    ----------
    df_anon, df_original : pandas.DataFrame
        Frames with ``ID``, ``DateTime``, ``lat`` and ``lon`` columns.

    Returns
    -------
    ``1 - sum(|anonymised time - original time|) / sum(original time)``,
    both times in seconds. Presumably NaN when the original data yields no
    POI time at all (0 / 0) - TODO confirm.
    """
    # Global variables
    size = 2
    values = ['NIGHT', 'NIGHT', 'WORK', 'WEEKEND']
    df_orig = df_original.copy()
    df_anon = df_anon.copy()

    for frame in (df_orig, df_anon):
        frame['DateTime'] = frame['DateTime'].astype('datetime64[ns]')
        frame['ID'] = frame['ID'].astype('string')

    # Pre-treatment of the original dataframe and extraction of its POI
    df_orig = _prepare_poi_frame(df_orig, size, values)
    df_orig2 = _time_per_poi(df_orig)
    df_orig2 = df_orig2.sort_values(by=['ID', 'Week', 'time_spent', 'DatetimeIndex'], ascending=[True, True, False, False]).reset_index(drop=True)
    df_orig2 = df_orig2.groupby(by=['ID', 'Week', 'DatetimeIndex']).head(1).reset_index(drop=True)

    # Pre-treatment of the anonymised dataframe
    # NOTE(review): df_orig has already been sorted and re-indexed here, so
    # the IDs are copied by index label from the *sorted* original frame;
    # this only pairs rows correctly if the caller guarantees it - confirm.
    df_anon['ID'] = df_orig['ID']
    df_anon = _prepare_poi_frame(df_anon, size, values)
    df_anon2 = _time_per_poi(df_anon)

    # Comparing the time spent in POI between original and anonymized dataset
    df_orig2 = df_orig2.loc[~(df_orig2.DatetimeIndex == 'RegularTime')]
    left_join_df = pd.merge(df_orig2, df_anon2, on=['ID', 'lat', 'lon', 'Week', 'DatetimeIndex'], how='left')

    # A POI absent from the anonymised data counts as zero time spent there.
    left_join_df['time_spent_y'] = left_join_df['time_spent_y'].fillna(pd.Timedelta(seconds=0))
    left_join_df['diff_time_spent'] = abs(left_join_df['time_spent_y'].dt.total_seconds() - left_join_df['time_spent_x'].dt.total_seconds())
    left_join_df['time_spent_x'] = left_join_df['time_spent_x'].dt.total_seconds()

    # Calculating the score
    score = 1 - (left_join_df['diff_time_spent'].sum() / left_join_df['time_spent_x'].sum())
    return score

def conditions(df):
    """Build the four time-slot masks evaluated on ``df``.

    Reads the ``DayOfTheWeek`` and ``Hour`` columns and returns, in order:

    1. days 0-3, hours 22 to 23 inclusive
    2. days 0-3, hours 0 to 6 inclusive
    3. days 0-3, hours 9 to 17 inclusive
    4. days 4 and above, hours 10 to 18 inclusive
    """
    day = df['DayOfTheWeek']
    hour = df['Hour']

    def hour_in(first, last):
        # Inclusive on both ends.
        return (hour >= first) & (hour <= last)

    first_days = day < 4
    last_days = day >= 4
    return [
        first_days & hour_in(22, 23),
        first_days & hour_in(0, 6),
        first_days & hour_in(9, 17),
        last_days & hour_in(10, 18),
    ]


In [14]:
def meet_utility(df_anon, df_original):
    """Score how many of the most visited cells are preserved.

    Coordinates are rounded to 2 decimals to form cells and the cells are
    ranked by number of rows, most visited first. The top 10 % of the
    original cells (at least one cell) are compared with the same number of
    top cells from the anonymised data.

    Parameters
    ----------
    df_anon, df_original : pandas.DataFrame
        Frames with ``latitude`` and ``longitude`` columns convertible to
        float.

    Returns
    -------
    Fraction of the top original cells that also appear among the top
    anonymised cells. ``0.0`` when either frame has no row or the original
    data yields no cell.
    """
    # Define global variable
    size = 2
    pt = 0.1

    if len(df_original) == 0 or len(df_anon) == 0:
        return 0.0

    def _cells_by_visits(frame):
        # Convert to float, round to the cell size, count rows per cell.
        typed = frame.astype({'longitude': 'float64', 'latitude': 'float64'})
        typed['latitude'] = typed['latitude'].round(size)
        typed['longitude'] = typed['longitude'].round(size)
        cells = typed.groupby(['latitude', 'longitude']).size().reset_index(name='count')
        # Most visited first; coordinates break ties so that the selection
        # is deterministic and identical for both datasets.
        return cells.sort_values(
            by=['count', 'latitude', 'longitude'], ascending=[False, True, True]
        )

    cells_orig = _cells_by_visits(df_original)
    cells_anon = _cells_by_visits(df_anon)
    if cells_orig.empty:
        return 0.0

    # Only keep the top % of cells, never fewer than one, so that datasets
    # with fewer than 10 cells do not end up dividing by zero.
    nb_cellules = max(1, int(len(cells_orig) * pt))
    top_orig = cells_orig.head(nb_cellules)
    top_anon = cells_anon.head(nb_cellules)

    # Left join and count the original cells that found a match.
    merged = pd.merge(top_orig, top_anon, on=['latitude', 'longitude'], how='left')
    score = merged['count_y'].notnull().sum()

    return score / nb_cellules


In [15]:
def tuile_utility(df_anon, df_original):
    """Score how well the number of distinct tiles per id is preserved.

    Coordinates are rounded to 2 decimals to form tiles and the distinct
    tiles are counted per ``id`` in both datasets. Each original id scores
    the ratio of the smaller count to the larger one, or 0 when the id is
    missing from the anonymised data.

    Parameters
    ----------
    df_anon, df_original : pandas.DataFrame
        Frames with ``id``, ``latitude`` and ``longitude`` columns.

    Returns
    -------
    Mean per-id score over the ids of the original data. ``0.0`` when
    either frame has no row or the original data yields no id.
    """
    size = 2

    if len(df_original) == 0 or len(df_anon) == 0:
        return 0.0

    def _tiles_per_id(frame):
        # Convert types, round to the tile size, count distinct tiles per id.
        typed = frame.astype({'longitude': 'float64', 'latitude': 'float64', 'id': 'string'})
        typed['latitude'] = typed['latitude'].round(size)
        typed['longitude'] = typed['longitude'].round(size)
        tiles = typed.groupby(['id', 'latitude', 'longitude']).size().reset_index(name='count')
        return tiles.groupby(['id']).size().reset_index(name='count')

    merged = pd.merge(_tiles_per_id(df_original), _tiles_per_id(df_anon), on=['id'], how='left')
    if merged.empty:
        return 0.0

    # Vectorised smaller/larger ratio; NaN (id absent from the anonymised
    # data) propagates through the ratio and is then scored 0.
    smaller = np.minimum(merged['count_x'], merged['count_y'])
    larger = np.maximum(merged['count_x'], merged['count_y'])
    ratio = (smaller / larger).fillna(0)

    return ratio.sum() / len(merged)

def createScore(row):
    """Return the ratio of the smaller to the larger of two counts.

    ``row`` is indexed with ``'count_x'`` and ``'count_y'``. The result is
    0 as soon as one of them is null; otherwise it is the smaller count
    divided by the larger one (1.0 when they are equal).
    """
    # Keys are checked lazily, in order: a null count_x short-circuits.
    if any(pd.isnull(row[key]) for key in ('count_x', 'count_y')):
        return 0

    count_x, count_y = row['count_x'], row['count_y']
    if count_x > count_y:
        return count_y / count_x
    return count_x / count_y
