## Importing

In [71]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler

In [112]:
df = pd.read_csv('../raw_data/raw_data_combined_v1.03.csv', encoding='utf-8-sig')
df.columns

Index(['Unnamed: 0', 'original_index', 'place_id', 'tripadvisor_id', 'fsq_id',
       'name', 'business_status', 'types', 'price_level', 'ta_price_level',
       'rating', 'user_ratings_total', 'ta_rating', 'ta_user_ratings_total',
       'combined_rating', 'geometry.location.lat', 'geometry.location.lng',
       'formatted_address', 'zip_code', 'city', 'international_phone_number',
       'website', 'combined_cuisine_type', 'ta_4sq_cuisine1',
       'ta_4sq_cuisine2', 'ta_4sq_cuisine3', 'ta_4sq_cuisine4',
       'ta_4sq_cuisine5', 'norm_price_level'],
      dtype='object')

# Training K-means

In [46]:
df = df[['geometry.location.lat', 'geometry.location.lng', 'combined_rating', 'norm_price_level']]

In [47]:
df

Unnamed: 0,geometry.location.lat,geometry.location.lng,combined_rating,norm_price_level
0,46.205232,6.114945,4.1,0.583333
1,46.204931,6.114021,3.7,
2,46.205752,6.114763,3.9,0.666667
3,46.196924,6.115081,4.5,0.500000
4,46.205248,6.115503,,
...,...,...,...,...
1721,46.192811,6.169873,4.0,0.583333
1722,46.199892,6.174117,4.1,0.583333
1723,46.199773,6.174561,4.3,0.500000
1724,46.199695,6.175007,4.7,0.666667


In [108]:
X = df.loc[df['combined_rating']>0] #Droping all rows without price level or rating

In [49]:
X = X.loc[X['norm_price_level']>0] #Droping all rows without price level

In [50]:
X

Unnamed: 0,geometry.location.lat,geometry.location.lng,combined_rating,norm_price_level
0,46.205232,6.114945,4.1,0.583333
2,46.205752,6.114763,3.9,0.666667
3,46.196924,6.115081,4.5,0.500000
6,46.210396,6.114510,3.9,0.500000
7,46.210587,6.113854,4.3,0.500000
...,...,...,...,...
1721,46.192811,6.169873,4.0,0.583333
1722,46.199892,6.174117,4.1,0.583333
1723,46.199773,6.174561,4.3,0.500000
1724,46.199695,6.175007,4.7,0.666667


In [51]:
X.describe()

Unnamed: 0,geometry.location.lat,geometry.location.lng,combined_rating,norm_price_level
count,898.0,898.0,898.0,898.0
mean,46.204196,6.144178,4.238085,0.589226
std,0.007575,0.009731,0.332481,0.156231
min,46.182515,6.113736,2.3,0.25
25%,46.200052,6.138564,4.1,0.583333
50%,46.203348,6.144387,4.2,0.583333
75%,46.209786,6.148949,4.5,0.666667
max,46.229425,6.175007,5.0,1.0


In [115]:
df.isnull().sum()#/df.shape[0]

Unnamed: 0                       0
original_index                   0
place_id                         0
tripadvisor_id                 761
fsq_id                        1038
name                             0
business_status                  0
types                            0
price_level                   1148
ta_price_level                 939
rating                         312
user_ratings_total             312
ta_rating                      824
ta_user_ratings_total          824
combined_rating                287
geometry.location.lat            0
geometry.location.lng            0
formatted_address                0
zip_code                         0
city                             0
international_phone_number       0
website                        979
combined_cuisine_type          647
ta_4sq_cuisine1                  6
ta_4sq_cuisine2                 68
ta_4sq_cuisine3                160
ta_4sq_cuisine4                195
ta_4sq_cuisine5                125
norm_price_level    

In [111]:
X.isnull().sum()

Unnamed: 0                      0
original_index                  0
place_id                        0
tripadvisor_id                480
fsq_id                        782
name                            0
business_status                 0
types                           0
price_level                   862
ta_price_level                653
rating                         25
user_ratings_total             25
ta_rating                     537
ta_user_ratings_total         537
combined_rating                 0
geometry.location.lat           0
geometry.location.lng           0
formatted_address               0
zip_code                        0
city                            0
international_phone_number      0
website                       927
combined_cuisine_type         406
ta_4sq_cuisine1                 6
ta_4sq_cuisine2                64
ta_4sq_cuisine3               159
ta_4sq_cuisine4               195
ta_4sq_cuisine5               122
norm_price_level                0
dtype: int64

## Function to merge and normalize price levels

In [120]:
def merge_price(row):
    if pd.isna(row['price_level']):
        
        if pd.isna(row['ta_price_level']):
            if pd.isna(row['combined_rating']):
                #return NaN if both values are empty
                return float('nan')
            else:
                #return the prediction if theres is a Rating
                return knn.predict([row[['geometry.location.lat', 'geometry.location.lng', 'combined_rating']]])[0]
            
        else:
            #if 'ta_price_level' is not null
            return row['ta_price_level']/3
        
    elif pd.isna(row['ta_price_level']):
        return row['price_level']/4
    
    else:
        #if both values exists
        return (row['ta_price_level']/3 + row['price_level']/4)/2

In [121]:
df['norm_price_level'] = df.apply(merge_price, axis=1)

## Fiting model

In [55]:
y = X['norm_price_level']

In [72]:
X = X[['geometry.location.lat', 'geometry.location.lng', ' combined_rating ']].copy()
scaler = MinMaxScaler()
X_rescaled = scaler.fit_transform(X)

In [77]:
knn = KNeighborsRegressor(n_neighbors=2)

In [78]:
knn.fit(X_rescaled,y)

In [79]:
knn.score(X_rescaled,y)

0.5520987595888744

In [80]:
X_rescaled

array([[0.48426149, 0.01972548, 0.66666667],
       [0.49534005, 0.01676976, 0.59259259],
       [0.30716216, 0.0219435 , 0.81481481],
       ...,
       [0.36788218, 0.99272086, 0.74074074],
       [0.36623435, 1.        , 0.88888889],
       [0.40876224, 0.99358261, 0.74074074]])

In [81]:
knn.predict(X[0:1])[0],X[0:1]

(1.8,
        lat     lon  rating
 0  46.2052  6.1149     4.2)

In [82]:
X_test = df.loc[df['price_level'].isna()]
X_test

Unnamed: 0,lat,lon,rating,price_level
1,46.2049,6.1140,3.7,
2,46.2058,6.1148,3.9,
3,46.2058,6.1148,,
5,46.2052,6.1155,,
6,46.2054,6.1153,5.0,
...,...,...,...,...
1965,46.2039,6.1690,,
1967,46.2085,6.1693,3.0,
1970,46.1996,6.1730,,
1972,46.1997,6.1750,4.8,


In [83]:
X_test = X_test.drop(['price_level'], axis=1).copy()
X_test

Unnamed: 0,lat,lon,rating
1,46.2049,6.1140,3.7
2,46.2058,6.1148,3.9
3,46.2058,6.1148,
5,46.2052,6.1155,
6,46.2054,6.1153,5.0
...,...,...,...
1965,46.2039,6.1690,
1967,46.2085,6.1693,3.0
1970,46.1996,6.1730,
1972,46.1997,6.1750,4.8


In [98]:
knn.predict(X_test[1:2])[0]

2.0

# Saving the database

In [126]:
df.to_csv('../raw_data/raw_data_combined_v1.04.csv', encoding='utf-8-sig')

In [125]:
df.describe()

Unnamed: 0.1,Unnamed: 0,original_index,tripadvisor_id,price_level,ta_price_level,rating,user_ratings_total,ta_rating,ta_user_ratings_total,combined_rating,geometry.location.lat,geometry.location.lng,zip_code,norm_price_level
count,1726.0,1726.0,965.0,578.0,787.0,1414.0,1414.0,902.0,902.0,1439.0,1726.0,1726.0,1726.0,1441.0
mean,862.5,994.238702,9952802.0,1.956747,1.969504,4.295396,192.40099,4.036585,102.652993,4.262891,46.204096,6.144163,1205.06489,0.587295
std,498.397599,578.391678,7565951.0,0.523153,0.450146,0.513222,383.099036,0.627516,203.518677,0.49781,0.007482,0.009767,6.869159,0.123418
min,0.0,0.0,697832.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,46.182515,6.113736,1121.0,0.25
25%,431.25,484.5,3399588.0,2.0,2.0,4.1,21.25,3.5,11.0,4.0,46.200116,6.138342,1201.0,0.583333
50%,862.5,985.5,8308648.0,2.0,2.0,4.3,89.5,4.0,41.0,4.3,46.203384,6.144387,1204.0,0.583333
75%,1293.75,1487.75,15291020.0,2.0,2.0,4.6,216.0,4.5,114.0,4.6,46.209778,6.149149,1205.0,0.583333
max,1725.0,2019.0,24861940.0,4.0,3.0,5.0,7885.0,5.0,2304.0,5.0,46.229425,6.175007,1292.0,1.0
