In [2]:
%matplotlib inline
%config InlineBackend.figure.format = 'retina'

In [3]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("poster", font_scale=1.3)
import folium

import pandas as pd
import numpy as np
import json
import ijson
from datetime import datetime, timedelta

The input to this preprocess will be the pickle file generated from P2_EDA_airbnb_02_preprocessing.ipynb notebook

Read in the RER csv file to be used for computing distances between Airbnb properties and metro/RER stations

In [4]:
rer = pd.read_csv('paris_rer/accessibilite-des-gares-et-stations-metro-et-rer-ratp.csv',sep=";")

In [5]:
rer.head()

Unnamed: 0,idptar,nomptar,STIF,CodeINSEE,X,Y,coord,UFR,AnnonceSonoreProchainPassage,AnnonceVisuelleProchainPassage,AnnonceSonoreSituationsPerturbees,AnnonceVisuelleSituationsPerturbees,PAQT,AccessibiliteQuaiTrain
0,2067,Bérault,1001100010001,94080,606737,2427450,"48.8453687094, 2.42824450629",0,1,1,1,1,0,0
1,1751,Porte de Vincennes,1001100010001,75120,605457,2427632,"48.8470164776, 2.410816879",0,1,1,1,1,0,0
2,1955,Gare de Lyon,1001100010001,75112,602713,2427468,"48.8455597463, 2.37344920163",0,1,1,1,1,0,0
3,2036,Châtelet,1001100010001,75101,600839,2428915,"48.8585696317, 2.3479332425",0,1,1,1,1,0,0
4,1856,Les Sablons (Jardin d'acclimatation),1001100010001,92051,595260,2431446,"48.8812991447, 2.27191517268",0,1,1,1,1,0,0


In [6]:
rer = rer[['nomptar','CodeINSEE','coord']]
rer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 955 entries, 0 to 954
Data columns (total 3 columns):
nomptar      955 non-null object
CodeINSEE    955 non-null int64
coord        955 non-null object
dtypes: int64(1), object(2)
memory usage: 22.5+ KB


In [7]:
rer = rer.drop_duplicates(subset=['nomptar', 'CodeINSEE'])
rer.sort_values(by='nomptar').reset_index().to_csv('rer_filtered.csv')

In [9]:
#read the cleaned pickle file from previous notebook

dfParis = pd.read_pickle('airbnb_paris/airbnb_Paris_updt_0615.p')

In [10]:

dfParis.columns

Index(['accommodates', 'amenities', 'availability_30', 'availability_60',
       'availability_90', 'bathrooms', 'bedrooms', 'beds',
       'cancellation_policy', 'guests_included', 'host_since',
       'host_total_listings_count', 'id', 'last_review', 'latitude',
       'longitude', 'minimum_nights', 'neighbourhood_cleansed',
       'number_of_reviews', 'price', 'property_type', 'review_scores_accuracy',
       'review_scores_checkin', 'review_scores_cleanliness',
       'review_scores_communication', 'review_scores_location',
       'review_scores_rating', 'review_scores_value', 'reviews_per_month',
       'room_type', 'summary', 'transit', 'zipcode', 'arrondissement',
       'arrond_name', 'rating_ind', 'Eiffel Tower', 'The Louvre',
       'Jardin du Luxembourg', 'Le Marais', 'Musee d'Orsay', 'Sainte-Chapelle',
       'Palais Garnier - Opera', 'Notre Dame Cathedral',
       'Musee de l'Orangerie', 'Pont Alexandre III', 'closest_attraction',
       'attraction_dist', 'close_to_attrac

In [11]:
dfParis.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52295 entries, 0 to 52338
Data columns (total 49 columns):
accommodates                   52295 non-null int64
amenities                      51907 non-null object
availability_30                52295 non-null int64
availability_60                52295 non-null int64
availability_90                52295 non-null int64
bathrooms                      52114 non-null object
bedrooms                       52165 non-null float64
beds                           52229 non-null float64
cancellation_policy            52295 non-null object
guests_included                52295 non-null int64
host_since                     52285 non-null object
host_total_listings_count      52285 non-null float64
id                             52295 non-null object
last_review                    39554 non-null object
latitude                       52295 non-null object
longitude                      52295 non-null object
minimum_nights                 52295 non-null

In [12]:
#Define search criteria for recommendations:

search = {'beds': 1, 'bedrooms' : 1, 'accomodates': 2, 'available_30': 5, 
                   'price_max': 85, 'rating_ind': 1, 'rating_score': 100.0}

Extract and Filter out rows based on Recommendations Search Criteria

In [11]:
rows_for_recommend = dfParis[(dfParis.beds >= search['beds']) & (dfParis.bedrooms >= search['bedrooms']) & 
        (dfParis.accommodates >= search['accomodates']) &
        (dfParis.availability_30 >= search['available_30'])& 
        (dfParis.price <= search['price_max']) &
        (dfParis.rating_ind == search['rating_ind']) & 
        (dfParis.review_scores_rating == search['rating_score']) ]

#### This section is for getting distance between Airbnb property to RER station

In [12]:
rer.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 394 entries, 0 to 939
Data columns (total 3 columns):
nomptar      394 non-null object
CodeINSEE    394 non-null int64
coord        394 non-null object
dtypes: int64(1), object(2)
memory usage: 12.3+ KB


In [13]:
from math import radians, cos, sin, asin, sqrt

def haversine(lat1, lon1, lat2, lon2):
    
    '''Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)'''
    
    lat1 = float(lat1)
    lon1 = float(lon1)
    lat2 = float(lat2)
    lon2 = float(lon2)
    #miles_constant = 3959
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    #haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    r = 3959
    mi = r * c
    return mi

Process the filtered file, and get the top 5 stations closest to the Airbnb property.  Set near_station indicator to 1 if at least of the station is within 1 mile to the property.

In [15]:
for i, bnb in rows_for_recommend.iterrows():
    dist_to_station = []  
    #print('bnb id: {}'.format(bnb['id']))
    for j, rer20 in rer.iterrows():
        
        item = {}
        #set the station name
        item['station'] = rer20['nomptar']
        
        #get coord of station
        try:
            lat2, lon2 = rer20['coord'].split(',')
        except:
            print('coord :',rer20['coord'], rer20['nomptar'])
            print('error : lat: {} lon {} j {}'.format(lat2, lon2, j))
            
        #compute distance between airbnb property and metro stations
        item['distance']= haversine(rows_for_recommend.ix[i,'latitude'], 
                                    rows_for_recommend.ix[i,'longitude'],
                                    lat2, lon2)
        
        dist_to_station.append(item)
        
    #sort by 
    dist_sorted = sorted(dist_to_station, key = lambda i: (i['distance'], i['station']))

   
    rows_for_recommend.ix[i,'station1_name'] = dist_sorted[0]['station']
    rows_for_recommend.ix[i,'station1_dist'] = dist_sorted[0]['distance']
    rows_for_recommend.ix[i,'station2_name'] = dist_sorted[1]['station']
    rows_for_recommend.ix[i,'station2_dist'] = dist_sorted[1]['distance']
    rows_for_recommend.ix[i,'station3_name'] = dist_sorted[2]['station']
    rows_for_recommend.ix[i,'station3_dist'] = dist_sorted[2]['distance']
    rows_for_recommend.ix[i, 'station4_name'] = dist_sorted[3]['station']
    rows_for_recommend.ix[i, 'station4_dist'] = dist_sorted[3]['distance']
    rows_for_recommend.ix[i, 'station5_name'] = dist_sorted[5]['station']
    rows_for_recommend.ix[i, 'station5_dist'] = dist_sorted[5]['distance']
    
    #set indicator to 1 if a station within 1.0 miles exist
    #a separate column will allow us to easily visualize properties with a station that
    #is walking distance from the airbnb property
    
    if dist_sorted[0]['distance'] <= 1.0:
        rows_for_recommend.ix[i, 'near_station'] = 1
    else:
        rows_for_recommend.ix[i, 'near_station'] = 0
        
    if i % 100 == 0:
        print('finished processing {}...'.format(i))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


finished processing 100...
finished processing 300...
finished processing 3900...
finished processing 12500...
finished processing 13900...
finished processing 22400...
finished processing 23400...
finished processing 24800...
finished processing 34100...
finished processing 35800...
finished processing 49200...


In [16]:
rows_for_recommend[(rows_for_recommend.close_to_attraction == 1) & 
                   (rows_for_recommend.near_station == 1)].id.count()

1009

In [17]:
rows_for_recommend.head()

Unnamed: 0,accommodates,amenities,availability_30,availability_60,availability_90,bathrooms,bedrooms,beds,cancellation_policy,guests_included,...,station1_dist,station2_name,station2_dist,station3_name,station3_dist,station4_name,station4_dist,station5_name,station5_dist,near_station
60,4,"Wireless Internet,Kitchen,Heating,Family/kid f...",6,36,66,1,2.0,2.0,strict,1,...,0.098153,Jaurès,0.193794,Riquet,0.320585,L1s Blanc,0.361636,La Chapelle,0.445057,1.0
70,4,"Internet,Wireless Internet,Kitchen,Heating,Fam...",17,37,66,1,1.0,2.0,moderate,2,...,0.178415,Laumière,0.314023,Danube,0.356309,Botzaris,0.361044,Buttes-Chaumont,0.499211,1.0
88,2,"Wireless Internet,Kitchen,Indoor fireplace,Hea...",9,20,20,1,1.0,1.0,strict,1,...,0.181508,Buttes-Chaumont,0.217686,Jourdain,0.222881,Pyrénées,0.268299,Danube,0.440279,1.0
100,4,"TV,Internet,Wireless Internet,Kitchen,Indoor f...",14,40,70,1,1.0,2.0,moderate,2,...,0.093209,Bel-Air,0.201585,Nation,0.348344,Porte de Vincennes,0.36953,Daumesnil (Félix Eboué),0.506145,1.0
133,2,"TV,Wireless Internet,Kitchen,Pets allowed,Heat...",29,59,89,1,1.0,1.0,flexible,1,...,0.157927,Brochant,0.295975,Place de Clichy,0.311161,Rome,0.327582,Blanche,0.494865,1.0


In [20]:
rows_for_recommend.columns

Index(['accommodates', 'amenities', 'availability_30', 'availability_60',
       'availability_90', 'bathrooms', 'bedrooms', 'beds',
       'cancellation_policy', 'guests_included', 'host_since',
       'host_total_listings_count', 'id', 'last_review', 'latitude',
       'longitude', 'minimum_nights', 'neighbourhood_cleansed',
       'number_of_reviews', 'price', 'property_type', 'review_scores_accuracy',
       'review_scores_checkin', 'review_scores_cleanliness',
       'review_scores_communication', 'review_scores_location',
       'review_scores_rating', 'review_scores_value', 'reviews_per_month',
       'room_type', 'summary', 'transit', 'zipcode', 'arrondissement',
       'arrond_name', 'rating_ind', 'Eiffel Tower', 'The Louvre',
       'Jardin du Luxembourg', 'Le Marais', 'Musee d'Orsay', 'Sainte-Chapelle',
       'Palais Garnier - Opera', 'Notre Dame Cathedral',
       'Musee de l'Orangerie', 'Pont Alexandre III', 'closest_attraction',
       'attraction_dist', 'close_to_attrac

In [21]:
df_attractions = pd.read_csv('airbnb_paris/Paris_attractions.csv')


In [22]:
sitelist = list(df_attractions['name'].values)
sitelist

['Eiffel Tower',
 'The Louvre',
 'Jardin du Luxembourg',
 'Le Marais',
 "Musee d'Orsay",
 'Sainte-Chapelle',
 'Palais Garnier - Opera',
 'Notre Dame Cathedral',
 "Musee de l'Orangerie",
 'Pont Alexandre III']

Get the number of Attractions within 1 mile of the airbnb property, and populate new column site_count

In [24]:
for i, bnb in rows_for_recommend.iterrows():
    sites = []
    station = []  
    
    
    station_item = {}
    
    #count number of stations within 0.2 mile
    stn_count = 0
    for x in range(1,6):
        #col_name = 'station{}_name'.format(x)
        col_dist = 'station{}_dist'.format(x)
        if rows_for_recommend.loc[i, col_dist] < 0.2:
            stn_count += 1
            
    rows_for_recommend.loc[i,'station_count'] = stn_count
    
    if bnb['price'] < 50.0:
        rows_for_recommend.loc[i,'price_range'] = '< $50'
    elif bnb['price'] <= 70:
        rows_for_recommend.loc[i,'price_range'] = '$50-$70'
    elif bnb['price'] <= 90:
        rows_for_recommend.loc[i,'price_range'] = '$71-$90'
    elif bnb['price'] <= 100:
        rows_for_recommend.loc[i,'price_range'] = '$91-$100'
    
    #count number of sites within 1 mile
    site_count = 0
    
    for name in sitelist:
        if rows_for_recommend.loc[i, name] < 1.0:
            site_count += 1

            
    
    rows_for_recommend.loc[i,'site_count'] = site_count
        



In [25]:
#saving file to avoid having to reprocess after every run
rows_for_recommend.to_pickle('airbnb_paris/airbnb_paris_recommend_0615.p')

In [14]:
#Restart recommendation process from here
import pandas as pd
rows_for_recommend = pd.read_pickle('airbnb_paris/airbnb_paris_recommend_0615.p')

In [15]:
#look at distribution of station_count for the properties
rows_for_recommend.groupby('station_count').id.count()

station_count
0.0    237
1.0    557
2.0    257
3.0     72
4.0     16
Name: id, dtype: int64

In [16]:
pd.set_option('display.max_columns',500)

Recommendation filter 1:

* properties which are within 1 mile at least one of the top attractions, AND

* Room type == Entire home/apt, AND

* property is within 0.2 miles at least 2 stations

In [30]:
filtered = rows_for_recommend[(rows_for_recommend.site_count > 1) & (rows_for_recommend.room_type == 'Entire home/apt')
                   & (rows_for_recommend.station_count > 2)].sort_values(by=['site_count','station_count'], ascending=False)

In [31]:
filtered['price_range'].value_counts()

$50-$70    8
$71-$90    5
< $50      1
Name: price_range, dtype: int64

In [32]:
output = filtered[
     ['id','arrondissement','arrond_name','room_type','price','review_scores_rating','cancellation_policy','closest_attraction',
       'attraction_dist', 'close_to_attraction','station_count','Eiffel Tower',
       'The Louvre', 'Jardin du Luxembourg','Le Marais',"Musee d'Orsay", 'Sainte-Chapelle',
       'Palais Garnier - Opera','Notre Dame Cathedral',"Musee de l'Orangerie", 'Pont Alexandre III','station1_name',
       'station1_dist', 'station2_name', 'station2_dist', 'station3_name',
       'station3_dist', 'station4_name', 'station4_dist', 'station5_name',
       'station5_dist', 'price_range','site_count','latitude', 'longitude']]

In [33]:
output.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14 entries, 37098 to 27707
Data columns (total 35 columns):
id                        14 non-null object
arrondissement            14 non-null int64
arrond_name               14 non-null object
room_type                 14 non-null object
price                     14 non-null float64
review_scores_rating      14 non-null float64
cancellation_policy       14 non-null object
closest_attraction        14 non-null object
attraction_dist           14 non-null float64
close_to_attraction       14 non-null float64
station_count             14 non-null float64
Eiffel Tower              14 non-null float64
The Louvre                14 non-null float64
Jardin du Luxembourg      14 non-null float64
Le Marais                 14 non-null float64
Musee d'Orsay             14 non-null float64
Sainte-Chapelle           14 non-null float64
Palais Garnier - Opera    14 non-null float64
Notre Dame Cathedral      14 non-null float64
Musee de l'Orangerie   

In [34]:
###Filter 2 for properties which are close to at least one attraction, and is close (within .2 miles) to 2 or more stations

In [43]:
filtered2 = rows_for_recommend[(rows_for_recommend.site_count >= 1) & (rows_for_recommend.room_type == 'Entire home/apt')
                   & (rows_for_recommend.station_count == 2)].sort_values(by=['site_count','station1_dist'], ascending=False)

In [44]:
output2 = filtered2[
     ['id','arrondissement','arrond_name','room_type','price','review_scores_rating','cancellation_policy','closest_attraction',
       'attraction_dist', 'close_to_attraction','station_count','Eiffel Tower',
       'The Louvre', 'Jardin du Luxembourg','Le Marais',"Musee d'Orsay", 'Sainte-Chapelle',
       'Palais Garnier - Opera','Notre Dame Cathedral',"Musee de l'Orangerie", 'Pont Alexandre III','station1_name',
       'station1_dist', 'station2_name', 'station2_dist', 'station3_name',
       'station3_dist', 'station4_name', 'station4_dist', 'station5_name',
       'station5_dist', 'price_range','site_count','latitude', 'longitude']]

In [52]:
#combine all recommendations, and get top 20
set1_count = output.id.count()
set2_count = 0
if set1_count < 20:
    set1 = output.head(set1_count)
    set2_count = 20 - set1_count
else:
    set1 = output.copy()
    
    
if set2_count > 0:
    set2 = output2.head(set2_count)
    
result = pd.concat([set1, set2], axis=0)

#map recommendations




Results for:  
* Beds: >= 1  
* Accomodates >= 2  
* Available_30 >=  5 days  
* Price Max: $85  
* Rating Ind = 1 (rated properties only)
* Room type = Entire home/Apt
* Review_Scores_Rating = 100.0
   

In [54]:
result.drop(['room_type','review_scores_rating', 'close_to_attraction'], axis=1)

Unnamed: 0,id,arrondissement,arrond_name,price,cancellation_policy,closest_attraction,attraction_dist,station_count,Eiffel Tower,The Louvre,Jardin du Luxembourg,Le Marais,Musee d'Orsay,Sainte-Chapelle,Palais Garnier - Opera,Notre Dame Cathedral,Musee de l'Orangerie,Pont Alexandre III,station1_name,station1_dist,station2_name,station2_dist,station3_name,station3_dist,station4_name,station4_dist,station5_name,station5_dist,price_range,site_count,latitude,longitude
37098,12691441,1,Louvre,55.0,flexible,The Louvre,0.324823,3.0,2.279255,0.324823,1.154168,0.703087,0.819248,0.47351,0.887801,0.687199,0.987702,1.400157,Les Halles,0.100234,Châtelet-Les Halles,0.127479,Louvre-Rivoli,0.177881,Etienne Marcel,0.23604,Châtelet,0.302536,$50-$70,7.0,48.86223720346151,2.344298191899912
4748,1846308,11,Popincourt,80.0,moderate,The Louvre,0.347938,3.0,2.313145,0.347938,1.084469,0.636549,0.849699,0.385578,0.981818,0.590699,1.042652,1.452022,Châtelet-Les Halles,0.080586,Les Halles,0.091112,Louvre-Rivoli,0.19373,Châtelet,0.206811,Etienne Marcel,0.254388,$71-$90,6.0,48.86097809420282,2.3452323445275267
39996,3247039,6,Luxembourg,70.0,moderate,Sainte-Chapelle,0.376824,3.0,1.943428,0.424355,0.572341,1.040491,0.601516,0.376824,1.23375,0.602936,0.909621,1.240556,Mabillon,0.136513,Odéon,0.180855,Saint-Germain des Prés,0.186248,Saint-Michel,0.33141,Pont Neuf,0.369129,$50-$70,6.0,48.85447951841604,2.336830506364772
44566,10489498,1,Louvre,70.0,strict,Palais Garnier - Opera,0.435389,3.0,1.774609,0.436807,1.368737,1.313896,0.462505,0.927324,0.435389,1.199138,0.439555,0.842335,Pyramides,0.112749,Pyramides,0.129119,Tuileries,0.143665,Palais-Royal (Musée du Louvre),0.227875,Opéra,0.340652,$50-$70,6.0,48.86570261463177,2.3319274559507868
9022,16902638,2,Bourse,70.0,flexible,Le Marais,0.60404,3.0,2.541144,0.619101,1.411175,0.60404,1.096025,0.69089,0.948613,0.82583,1.221135,1.633803,Etienne Marcel,0.088837,Réaumur-Sébastopol,0.163637,Sentier,0.18792,Les Halles,0.24529,Arts-et-Métiers,0.302903,$50-$70,5.0,48.86494885382183,2.3495076340203824
29060,17680319,1,Louvre,75.0,flexible,The Louvre,0.338715,3.0,2.035623,0.338715,1.33377,1.05725,0.641084,0.767485,0.534395,1.01984,0.703575,1.112864,Pyramides,0.179079,Palais-Royal (Musée du Louvre),0.186416,Pyramides,0.188924,Bourse,0.257117,Louvre-Rivoli,0.347106,$71-$90,5.0,48.865496179677024,2.3379621719063404
36347,16052249,2,Bourse,55.0,strict,Le Marais,0.600863,3.0,2.556606,0.635784,1.425064,0.600863,1.112219,0.703758,0.956175,0.834661,1.235793,1.648285,Etienne Marcel,0.101911,Réaumur-Sébastopol,0.146928,Sentier,0.187308,Les Halles,0.26074,Arts-et-Métiers,0.287805,$50-$70,5.0,48.86507933032343,2.3498173130555724
3192,15563015,2,Bourse,69.0,strict,Le Marais,0.785244,3.0,2.702447,0.867506,1.711637,0.785244,1.301437,0.991827,0.93589,1.112745,1.367386,1.768143,Strasbourg-Saint-Denis,0.133311,Bonne Nouvelle,0.179496,Réaumur-Sébastopol,0.195368,Sentier,0.221946,Arts-et-Métiers,0.314485,$50-$70,4.0,48.86906032099319,2.3517008654862743
14617,10347142,3,Temple,55.0,moderate,Le Marais,0.428041,3.0,2.716932,0.765681,1.434226,0.428041,1.260136,0.707518,1.149141,0.772453,1.408247,1.821877,Arts-et-Métiers,0.158819,Réaumur-Sébastopol,0.173283,Rambuteau,0.187806,Etienne Marcel,0.213903,Châtelet-Les Halles,0.353382,$50-$70,4.0,48.8638989955195,2.3536801577761324
27196,4280862,5,Pantheon,79.0,flexible,Notre Dame Cathedral,0.530646,3.0,2.704867,1.208722,0.611314,0.996372,1.488503,0.740392,2.0346,0.530646,1.798536,2.114798,Cardinal-Lemoine,0.100877,Place Monge (Jardin des Plantes),0.165242,Jussieu,0.197219,Maubert-Mutualité,0.312084,Cluny-La Sorbonne,0.453983,$71-$90,4.0,48.84533334631346,2.350578985809922


In [56]:
result = result[['id', 'arrondissement', 'arrond_name', 'price',
       'cancellation_policy', 'closest_attraction','attraction_dist','site_count',
       'station_count',
       'Eiffel Tower', 'The Louvre', 'Jardin du Luxembourg', 'Le Marais',
       "Musee d'Orsay", 'Sainte-Chapelle', 'Palais Garnier - Opera',
       'Notre Dame Cathedral', "Musee de l'Orangerie", 'Pont Alexandre III',
       'station1_name', 'station1_dist', 'station2_name', 'station2_dist',
       'station3_name', 'station3_dist', 'station4_name', 'station4_dist',
       'station5_name', 'station5_dist', 
       'latitude', 'longitude']]

In [57]:
result

Unnamed: 0,id,arrondissement,arrond_name,price,cancellation_policy,closest_attraction,attraction_dist,site_count,station_count,Eiffel Tower,The Louvre,Jardin du Luxembourg,Le Marais,Musee d'Orsay,Sainte-Chapelle,Palais Garnier - Opera,Notre Dame Cathedral,Musee de l'Orangerie,Pont Alexandre III,station1_name,station1_dist,station2_name,station2_dist,station3_name,station3_dist,station4_name,station4_dist,station5_name,station5_dist,latitude,longitude
37098,12691441,1,Louvre,55.0,flexible,The Louvre,0.324823,7.0,3.0,2.279255,0.324823,1.154168,0.703087,0.819248,0.47351,0.887801,0.687199,0.987702,1.400157,Les Halles,0.100234,Châtelet-Les Halles,0.127479,Louvre-Rivoli,0.177881,Etienne Marcel,0.23604,Châtelet,0.302536,48.86223720346151,2.344298191899912
4748,1846308,11,Popincourt,80.0,moderate,The Louvre,0.347938,6.0,3.0,2.313145,0.347938,1.084469,0.636549,0.849699,0.385578,0.981818,0.590699,1.042652,1.452022,Châtelet-Les Halles,0.080586,Les Halles,0.091112,Louvre-Rivoli,0.19373,Châtelet,0.206811,Etienne Marcel,0.254388,48.86097809420282,2.3452323445275267
39996,3247039,6,Luxembourg,70.0,moderate,Sainte-Chapelle,0.376824,6.0,3.0,1.943428,0.424355,0.572341,1.040491,0.601516,0.376824,1.23375,0.602936,0.909621,1.240556,Mabillon,0.136513,Odéon,0.180855,Saint-Germain des Prés,0.186248,Saint-Michel,0.33141,Pont Neuf,0.369129,48.85447951841604,2.336830506364772
44566,10489498,1,Louvre,70.0,strict,Palais Garnier - Opera,0.435389,6.0,3.0,1.774609,0.436807,1.368737,1.313896,0.462505,0.927324,0.435389,1.199138,0.439555,0.842335,Pyramides,0.112749,Pyramides,0.129119,Tuileries,0.143665,Palais-Royal (Musée du Louvre),0.227875,Opéra,0.340652,48.86570261463177,2.3319274559507868
9022,16902638,2,Bourse,70.0,flexible,Le Marais,0.60404,5.0,3.0,2.541144,0.619101,1.411175,0.60404,1.096025,0.69089,0.948613,0.82583,1.221135,1.633803,Etienne Marcel,0.088837,Réaumur-Sébastopol,0.163637,Sentier,0.18792,Les Halles,0.24529,Arts-et-Métiers,0.302903,48.86494885382183,2.3495076340203824
29060,17680319,1,Louvre,75.0,flexible,The Louvre,0.338715,5.0,3.0,2.035623,0.338715,1.33377,1.05725,0.641084,0.767485,0.534395,1.01984,0.703575,1.112864,Pyramides,0.179079,Palais-Royal (Musée du Louvre),0.186416,Pyramides,0.188924,Bourse,0.257117,Louvre-Rivoli,0.347106,48.865496179677024,2.3379621719063404
36347,16052249,2,Bourse,55.0,strict,Le Marais,0.600863,5.0,3.0,2.556606,0.635784,1.425064,0.600863,1.112219,0.703758,0.956175,0.834661,1.235793,1.648285,Etienne Marcel,0.101911,Réaumur-Sébastopol,0.146928,Sentier,0.187308,Les Halles,0.26074,Arts-et-Métiers,0.287805,48.86507933032343,2.3498173130555724
3192,15563015,2,Bourse,69.0,strict,Le Marais,0.785244,4.0,3.0,2.702447,0.867506,1.711637,0.785244,1.301437,0.991827,0.93589,1.112745,1.367386,1.768143,Strasbourg-Saint-Denis,0.133311,Bonne Nouvelle,0.179496,Réaumur-Sébastopol,0.195368,Sentier,0.221946,Arts-et-Métiers,0.314485,48.86906032099319,2.3517008654862743
14617,10347142,3,Temple,55.0,moderate,Le Marais,0.428041,4.0,3.0,2.716932,0.765681,1.434226,0.428041,1.260136,0.707518,1.149141,0.772453,1.408247,1.821877,Arts-et-Métiers,0.158819,Réaumur-Sébastopol,0.173283,Rambuteau,0.187806,Etienne Marcel,0.213903,Châtelet-Les Halles,0.353382,48.8638989955195,2.3536801577761324
27196,4280862,5,Pantheon,79.0,flexible,Notre Dame Cathedral,0.530646,4.0,3.0,2.704867,1.208722,0.611314,0.996372,1.488503,0.740392,2.0346,0.530646,1.798536,2.114798,Cardinal-Lemoine,0.100877,Place Monge (Jardin des Plantes),0.165242,Jussieu,0.197219,Maubert-Mutualité,0.312084,Cluny-La Sorbonne,0.453983,48.84533334631346,2.350578985809922


In [65]:
def set_property_id(row):
    return 'Property{}'.format(row['id'])

result['id'] = result.apply(lambda x: set_property_id(x), 1)

result.head(3)

Unnamed: 0,id,arrondissement,arrond_name,price,cancellation_policy,closest_attraction,attraction_dist,site_count,station_count,Eiffel Tower,The Louvre,Jardin du Luxembourg,Le Marais,Musee d'Orsay,Sainte-Chapelle,Palais Garnier - Opera,Notre Dame Cathedral,Musee de l'Orangerie,Pont Alexandre III,station1_name,station1_dist,station2_name,station2_dist,station3_name,station3_dist,station4_name,station4_dist,station5_name,station5_dist,latitude,longitude
37098,Property12691441,1,Louvre,55.0,flexible,The Louvre,0.324823,7.0,3.0,2.279255,0.324823,1.154168,0.703087,0.819248,0.47351,0.887801,0.687199,0.987702,1.400157,Les Halles,0.100234,Châtelet-Les Halles,0.127479,Louvre-Rivoli,0.177881,Etienne Marcel,0.23604,Châtelet,0.302536,48.86223720346151,2.344298191899912
4748,Property1846308,11,Popincourt,80.0,moderate,The Louvre,0.347938,6.0,3.0,2.313145,0.347938,1.084469,0.636549,0.849699,0.385578,0.981818,0.590699,1.042652,1.452022,Châtelet-Les Halles,0.080586,Les Halles,0.091112,Louvre-Rivoli,0.19373,Châtelet,0.206811,Etienne Marcel,0.254388,48.86097809420282,2.3452323445275267
39996,Property3247039,6,Luxembourg,70.0,moderate,Sainte-Chapelle,0.376824,6.0,3.0,1.943428,0.424355,0.572341,1.040491,0.601516,0.376824,1.23375,0.602936,0.909621,1.240556,Mabillon,0.136513,Odéon,0.180855,Saint-Germain des Prés,0.186248,Saint-Michel,0.33141,Pont Neuf,0.369129,48.85447951841604,2.336830506364772


In [67]:
#get all distinct stations close to airbnb properties and map

station_dict = {'name':[],'coord':[]}
stnlist = []
for i, bnb in result.iterrows():

    #get all station information in row and populate station_dict
   
    
    for x in range(1,5):
        col_name = 'station{}_name'.format(x)
        #print(bnb[col_name])
        #print(rer.loc[rer['nomptar'] == bnb[col_name]].coord.values[0])
        if bnb[col_name] not in station_dict.keys():
            station_dict['name'].append(bnb[col_name].replace("'",''))
            station_dict['coord'].append(rer[rer['nomptar'] == bnb[col_name]]['coord'].values[0])

df = pd.DataFrame(station_dict).drop_duplicates()

In [68]:
result.to_pickle('airbnb_paris/recommendation_results_0615.p')

In [69]:
for i, row in df.iterrows():
    df.loc[i,'latitude'] = float(row['coord'].split(',')[0])
    df.loc[i,'longitude'] = float(row['coord'].split(',')[1])

In [70]:
df.head()

Unnamed: 0,coord,name,latitude,longitude
0,"48.86201193, 2.34647644857",Les Halles,48.862012,2.346476
1,"48.8614636698, 2.34684412827",Châtelet-Les Halles,48.861464,2.346844
2,"48.8608798168, 2.34097327248",Louvre-Rivoli,48.86088,2.340973
3,"48.8637103257, 2.34898319279",Etienne Marcel,48.86371,2.348983
7,"48.8585696317, 2.3479332425",Châtelet,48.85857,2.347933


In [71]:
df.to_pickle('airbnb_paris/stations_for_map.p')