In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the original (non-anonymized) records from CSV and display the frame.
# NOTE(review): in the displayed rows `lat` is ~4.87 and `lon` is ~45.77, so the
# two column labels look swapped in this file - confirm before using them as
# geographic latitude/longitude.
original_data = pd.read_csv("original_file.csv")
original_data

Unnamed: 0,ID,DateTime,lat,lon
0,1,2015-03-04 00:35:16,4.870147,45.772140
1,1,2015-03-04 00:35:48,4.870218,45.772095
2,1,2015-03-04 00:35:49,4.870210,45.772072
3,1,2015-03-04 00:35:50,4.870210,45.772072
4,1,2015-03-04 00:35:52,4.870210,45.772072
...,...,...,...,...
34551844,110,2015-03-12 16:23:21,2.343094,48.891650
34551845,110,2015-03-12 16:23:22,2.343094,48.891650
34551846,110,2015-03-12 16:23:24,2.343094,48.891649
34551847,110,2015-03-12 16:23:25,2.343094,48.891649


In [3]:
# Load the anonymized records, keep the four working columns and exchange the
# `lat` / `lon` labels, then display the frame.
# NOTE(review): the file has a .pkl extension but is parsed as CSV here -
# confirm it really contains CSV text.
# NOTE(review): the label swap presumably aligns this frame with the column
# labelling used by original_data (lat ~4.87, lon ~45.77 in the displayed
# rows) - confirm.
anon_data = (
    pd.read_csv(r"anonimized_file.pkl")[['ID', 'DateTime', 'lon', 'lat']]
    .rename(columns={"lat": "lon", "lon": "lat"})
)
anon_data

Unnamed: 0,ID,DateTime,lat,lon
0,958,2015-03-18 18:00:00,4.888,45.740
1,DEL,2015-03-04 18:00:00,-0.571,44.849
2,DEL,2015-03-11 02:00:00,5.279,45.585
3,404,2015-04-08 14:00:00,4.867,45.750
4,488,2015-03-25 14:00:00,4.879,45.786
...,...,...,...,...
34551844,229,2015-03-25 10:00:00,4.872,45.783
34551845,424,2015-04-08 10:00:00,4.882,45.784
34551846,719,2015-04-08 18:00:00,4.873,45.785
34551847,335,2015-03-25 14:00:00,4.874,45.784


In [4]:
# Give both frames the same dtypes: timestamps as datetime64[ns] and user
# identifiers as pandas strings.
COLUMN_DTYPES = {'DateTime': 'datetime64[ns]', 'ID': 'string'}
original_data = original_data.astype(COLUMN_DTYPES)
anon_data = anon_data.astype(COLUMN_DTYPES)

In [5]:
def hour_util(df_anon, df_original):
    """Hour-of-day utility of an anonymized dataset.

    Every row starts with 1 point and loses 1/24 of a point per hour of
    absolute difference between the hour of day of the anonymized timestamp
    and the hour of day of the original timestamp. Rows are paired by index
    label.

    Parameters
    ----------
    df_anon : pandas.DataFrame
        Anonymized records; needs a datetime ``DateTime`` column.
    df_original : pandas.DataFrame
        Original records; needs a datetime ``DateTime`` column.

    Returns
    -------
    float
        Sum of the per-row points divided by the number of rows of
        ``df_original``.
    """
    hours = pd.DataFrame({
        'df_hour': df_anon['DateTime'].dt.hour,
        'df_origin_hour': df_original['DateTime'].dt.hour,
    })
    hour_gap = (hours['df_hour'] - hours['df_origin_hour']).abs()
    row_points = 1 - hour_gap / 24
    return row_points.sum() / len(df_original)

In [11]:
def date_util(df_anon, df_original):
    """Date utility of an anonymized dataset.

    Each row scores ``1 - d / 7`` where ``d`` is the absolute difference
    between the day of the week of the original and of the anonymized
    timestamp. A row whose timestamp moved to another ISO week scores 0.
    Two timestamps are in the same week only when both the ISO year and the
    ISO week number match, so week 10 of one year is not confused with week 10
    of another year. Rows are paired by index label.

    Parameters
    ----------
    df_anon : pandas.DataFrame
        Anonymized records; needs a datetime ``DateTime`` column.
    df_original : pandas.DataFrame
        Original records; needs a datetime ``DateTime`` column.

    Returns
    -------
    float
        Mean of the per-row scores, rounded to 3 decimals.
    """
    iso_orig = df_original['DateTime'].dt.isocalendar()
    iso_anon = df_anon['DateTime'].dt.isocalendar()

    df_date_utility = pd.DataFrame({
        'DayOfTheWeek_orig': df_original['DateTime'].dt.dayofweek,
        'DayOfTheWeek_anon': df_anon['DateTime'].dt.dayofweek,
        'Year_orig': iso_orig['year'],
        'Year_anon': iso_anon['year'],
        'Week_orig': iso_orig['week'],
        'Week_anon': iso_anon['week'],
    })
    df_date_utility['DiffDate'] = (
        df_date_utility['DayOfTheWeek_orig'] - df_date_utility['DayOfTheWeek_anon']
    ).abs()

    # Any change of week must give a utility of 0: force the maximal gap.
    same_week = (
        (df_date_utility['Week_orig'] == df_date_utility['Week_anon'])
        & (df_date_utility['Year_orig'] == df_date_utility['Year_anon'])
    )
    df_date_utility.loc[~same_week, 'DiffDate'] = 7
    df_date_utility['date_util'] = 1 - df_date_utility['DiffDate'] / 7

    score = df_date_utility["date_util"].mean().round(3)
    return score

In [9]:
def distance_util(df_anon, df_original):
    """Distance utility of an anonymized dataset.

    Rows of the two frames are paired by position. For every pair the
    haversine (great-circle) distance between the original and the anonymized
    coordinates is computed, with coordinates converted from degrees to
    radians and an Earth radius of 6371.009 km.

    Parameters
    ----------
    df_anon : pandas.DataFrame
        Anonymized records with ``lat`` and ``lon`` columns.
    df_original : pandas.DataFrame
        Original records with ``lat`` and ``lon`` columns.

    Returns
    -------
    int or float
        1 when the mean distance rounds to 0 at 3 decimals, otherwise the
        inverse of the mean distance rounded to 3 decimals.
    """
    anon_names = {'ID': 'ID_ano', 'DateTime': 'DateTime_ano', 'lat': 'lat_ano', 'lon': 'lon_ano'}
    paired = pd.concat(
        [
            df_original.reset_index(drop=True),
            df_anon.rename(columns=anon_names).reset_index(drop=True),
        ],
        axis=1,
    )

    # Haversine distance
    deg_to_rad = np.pi / 180
    earth_radius_km = 6371.009
    half_dlat = (paired.lat * deg_to_rad - paired.lat_ano * deg_to_rad) / 2
    half_dlon = (paired.lon * deg_to_rad - paired.lon_ano * deg_to_rad) / 2
    lat_term = np.sin(half_dlat) ** 2
    lon_term = (
        (np.sin(half_dlon) ** 2)
        * np.cos(paired.lat * deg_to_rad)
        * np.cos(paired.lat_ano * deg_to_rad)
    )
    distance_km = 2 * earth_radius_km * np.arcsin(np.sqrt(lat_term + lon_term))

    mean_distance = distance_km.mean()
    if mean_distance.round(3) == 0:
        return 1
    return (1 / mean_distance).round(3)


In [98]:
def POI_util(df_anon, df_original, params):
    """Score how well the time spent at points of interest (POI) is preserved.

    Both datasets are cut into "visits": consecutive rows of one user that
    share the same day, ISO week, rounded coordinates and time slot (see
    ``conditions``). Time spent is summed per (user, place, week, slot). For
    the original data only the place with the most time is kept per
    (user, week, slot); those are the POIs. The anonymized time spent in the
    same cell is compared with the original time.

    Row ``i`` of ``df_anon`` must be the anonymized version of row ``i`` of
    ``df_original``: the original user ID is attached to the anonymized rows
    by position and is the key used for the comparison, because the ``ID``
    column of the anonymized data may hold pseudonyms.

    Parameters
    ----------
    df_anon : pandas.DataFrame
        Anonymized records with columns ``ID``, ``DateTime``, ``lat``, ``lon``.
        Rows whose ``ID`` is ``'DEL'`` are ignored.
        NOTE(review): 'DEL' is assumed to mark deleted rows - confirm.
    df_original : pandas.DataFrame
        Original records with the same columns.
    params : dict
        ``params['size']`` is the number of decimals used to round the
        coordinates, i.e. the spatial size of a POI cell.

    Returns
    -------
    float
        ``1 - sum(|anonymized seconds - original seconds|) / sum(original
        seconds)`` over the POIs outside the ``'RegularTime'`` slot. An
        unmodified dataset scores 1. The denominator is zero when the
        original data contains no POI time.
    """
    size = params['size']

    # Work on re-indexed copies: the callers' frames are never modified and
    # row i of one frame faces row i of the other.
    df_orig = df_original.copy().reset_index(drop=True)
    df_anon = df_anon.copy().reset_index(drop=True)

    df_orig['DateTime'] = df_orig['DateTime'].astype('datetime64[ns]')
    df_orig['ID'] = df_orig['ID'].astype('string')
    df_anon['DateTime'] = df_anon['DateTime'].astype('datetime64[ns]')
    df_anon['ID'] = df_anon['ID'].astype('string')

    # Attach the original ID while both frames are still in input order;
    # doing it after sorting the original frame would pair unrelated rows.
    df_anon['ID_original'] = df_orig['ID']
    is_deleted = (df_anon['ID'] == 'DEL').fillna(False).astype(bool)
    df_anon = df_anon.loc[~is_deleted]

    # Original data: keep, per (ID, week, slot), the place with the most time.
    orig_time = _poi_time_spent(df_orig, 'ID', size)
    orig_poi = orig_time.sort_values(
        by=['ID', 'Week', 'time_spent', 'DatetimeIndex'],
        ascending=[True, True, False, False],
    ).reset_index(drop=True)
    orig_poi = orig_poi.groupby(by=['ID', 'Week', 'DatetimeIndex']).head(1).reset_index(drop=True)

    # Anonymized data: its own coordinates and timestamps, keyed by original ID.
    anon_time = _poi_time_spent(df_anon, 'ID_original', size)

    # Compare the time spent in each POI between the two datasets.
    merged = pd.merge(
        orig_poi, anon_time,
        on=['ID', 'lat', 'lon', 'Week', 'DatetimeIndex'],
        how='left',
    )
    merged = merged.loc[~(merged['DatetimeIndex'] == 'RegularTime')]

    orig_seconds = merged['time_spent_x'].dt.total_seconds()
    # A POI absent from the anonymized data counts as zero seconds.
    anon_seconds = (
        pd.to_timedelta(merged['time_spent_y'])
        .fillna(pd.Timedelta(seconds=0))
        .dt.total_seconds()
    )
    # Absolute gap for every POI, whether the anonymized time is longer or
    # shorter than the original one.
    diff_time_spent = (anon_seconds - orig_seconds).abs()

    # Final score
    score = 1 - (diff_time_spent.sum() / orig_seconds.sum())
    return score


def _poi_time_spent(df, id_col, size):
    """Sum the time spent per (ID, lat, lon, Week, DatetimeIndex) cell.

    ``df`` needs ``DateTime``, ``lat``, ``lon`` and ``id_col`` (string dtype).
    Coordinates are rounded to ``size`` decimals. In the returned frame the
    user key is always named ``ID`` and ``time_spent`` is a timedelta.
    """
    labels = ['NIGHT', 'NIGHT', 'WORK', 'WEEKEND']

    df = df.copy()
    df['lat'] = df['lat'].round(size)
    df['lon'] = df['lon'].round(size)
    df['Hour'] = df['DateTime'].dt.hour
    df['Day'] = df['DateTime'].dt.day
    df['Week'] = df['DateTime'].dt.isocalendar().week
    df['DayOfTheWeek'] = df['DateTime'].dt.dayofweek
    df = df.sort_values(by=[id_col, 'DateTime']).reset_index(drop=True)
    # Rows outside every slot are labelled 'RegularTime' for both datasets so
    # that they are segmented the same way.
    df['DatetimeIndex'] = np.select(conditions(df), labels, 'RegularTime')

    # Rows sharing this key and following each other form one visit.
    df['Index_of_POI'] = (
        df[id_col]
        + '-' + df['Day'].astype('string')
        + '-' + df['Week'].astype('string')
        + '-' + df['lat'].astype('string')
        + '-' + df['lon'].astype('string')
        + '-' + df['DatetimeIndex'].astype('string')
    )
    df['Index_of_POI_shifted_backward'] = df['Index_of_POI'].shift(-1, fill_value='0')
    df['Index_of_POI_shifted_forward'] = df['Index_of_POI'].shift(+1, fill_value='0')

    # A visit starts where the key differs from the previous row's key; the
    # forward fill then gives every later row of the visit that start time.
    df['start_time'] = df.loc[
        ~(df['Index_of_POI'] == df['Index_of_POI_shifted_forward']), 'DateTime'
    ]
    df = df.ffill()
    df['time_spent'] = df['DateTime'] - df['start_time']

    # The last row of a visit carries the visit's full duration.
    visit_end = ~(df['Index_of_POI_shifted_backward'] == df['Index_of_POI'])
    per_visit = df.loc[
        visit_end, [id_col, 'lat', 'lon', 'Week', 'DatetimeIndex', 'time_spent']
    ].rename(columns={id_col: 'ID'})
    return (
        per_visit
        .groupby(by=['ID', 'lat', 'lon', 'Week', 'DatetimeIndex'])
        .sum()
        .reset_index()
    )


def conditions(df):
    """Build the boolean masks of the four POI time slots.

    ``df`` needs the integer columns ``DayOfTheWeek`` and ``Hour``. The masks
    are returned in this order:

    1. ``DayOfTheWeek < 4`` and hour in 22..23
    2. ``DayOfTheWeek < 4`` and hour in 0..6
    3. ``DayOfTheWeek < 4`` and hour in 9..17
    4. ``DayOfTheWeek >= 4`` and hour in 10..18

    NOTE(review): the callers in this file fill ``DayOfTheWeek`` from
    ``dt.dayofweek`` (Monday = 0), so value 4 (Friday) lands in the last
    mask - confirm this is intended.
    """
    day = df['DayOfTheWeek']
    hour = df['Hour']
    first_days = day < 4
    last_days = day >= 4
    return [
        first_days & hour.between(22, 23),
        first_days & hour.between(0, 6),
        first_days & hour.between(9, 17),
        last_days & hour.between(10, 18),
    ]
