In [1]:
import pandas as pd
import datetime
from numpy.random import laplace
import math

In [2]:
# load in the data
air_bnb = pd.read_csv("AB_NYC_2019.csv")
air_bnb.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

In [3]:
air_bnb.head(5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


<h2>Mask out EIs and redundant QIs</h2>

In [4]:
# mask out the id, listing name, and the host name
air_bnb["id"] = "*****"
air_bnb["name"] = "*****"
air_bnb["host_name"] = "*****"

# mask the latitude and longitude since that information can already be estimated by the neighborhood
air_bnb["latitude"] = "*****"
air_bnb["longitude"] = "*****"

<h2>Hash out the Host ID</h2>

In [5]:
# hash the host id so it can still be used to group by but it won't reveal any info about a user or listing
# in the case where someone can tie a host id to an owner
air_bnb["host_id"] = air_bnb["host_id"].apply(lambda x: hash(str(x)))

<h2>Generalize the Date</h2>

In [6]:
# round the last review date to the nearest month
air_bnb["last_review"] = air_bnb.last_review.apply(lambda x: str(x)[:7])

<h2>Use K-Anonymity for the neighborhoods</h2>

In [7]:
# get the neighbourhoods with less than 15 entries
temp = air_bnb.groupby(["neighbourhood"]).count().sort_values("id")
other_list = temp[temp["id"]<15].index

# anonymize the neighborhood to k entries each
def k_anonymize_neighborhood(x, k):
    
    # if an entry has one of these eighborhoods then set their neighborhood as other 
    if x in other_list:
        return "other"
    else:
        return x
        
air_bnb["neighbourhood"] =  air_bnb["neighbourhood"].apply(lambda x: k_anonymize_neighborhood(x,15))

<h2>Use differential privacy for price, minimum_nights, and number_of_reviews</h2>

In [8]:
# use the log of the std as the laplace 
price_std = math.log(air_bnb["price"].std())
air_bnb["price"] = air_bnb["price"].apply(lambda x: x + laplace(0, price_std))

# use the log of the std as the laplace 
price_std = math.log(air_bnb["minimum_nights"].std())
air_bnb["minimum_nights"] = air_bnb["minimum_nights"].apply(lambda x: x + laplace(0, price_std))

# use the log of the std as the laplace 
price_std = math.log(air_bnb["number_of_reviews"].std())
air_bnb["number_of_reviews"] = air_bnb["number_of_reviews"].apply(lambda x: x + laplace(0, price_std))

<h2>Output the newly anonymized file into a csv</h2>

In [9]:
air_bnb.head(5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,*****,*****,-1540419183100330906,*****,Brooklyn,Kensington,*****,*****,Private room,157.324234,3.548874,6.920243,2018-10,0.21,6,365
1,*****,*****,-6303625318003254822,*****,Manhattan,Midtown,*****,*****,Entire home/apt,222.991412,1.552826,46.989945,2019-05,0.38,2,355
2,*****,*****,2655665583548708572,*****,Manhattan,Harlem,*****,*****,Private room,154.848218,3.455304,0.698164,,,1,365
3,*****,*****,-2939139632100285860,*****,Brooklyn,Clinton Hill,*****,*****,Entire home/apt,87.730973,-9.733857,270.073743,2019-07,4.64,1,194
4,*****,*****,1918944794498209498,*****,Manhattan,East Harlem,*****,*****,Entire home/apt,75.474851,8.16424,4.983751,2018-11,0.1,1,0


In [10]:
air_bnb.to_csv("anonymized_airbnb.csv", index=False)