## Importing

In [78]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler

In [136]:
# Load the combined raw dataset; 'utf-8-sig' also strips a leading BOM if the file has one.
df = pd.read_csv(
    '../raw_data/raw_data_combined_v1.07.csv',
    encoding='utf-8-sig',
)
# List the column labels (note that some carry leading/trailing spaces).
df.columns

Index(['Unnamed: 0', 'original_index', 'place_id', 'tripadvisor_id', 'fsq_id',
       'name', 'business_status', 'types', 'price_level', 'ta_price_level',
       'price_level_combined', 'rating', 'user_ratings_total', 'ta_rating',
       'ta_user_ratings_total', ' combined_rating ', 'geometry.location.lat',
       'geometry.location.lng', 'formatted_address', 'zip_code', 'city',
       'international_phone_number', 'website', 'google_category',
       'ta_4sq_cuisine1', 'ta_4sq_cuisine2', 'ta_4sq_cuisine3',
       'ta_4sq_cuisine4', 'ta_4sq_cuisine5', 'main_category_1',
       'main_category_2', 'main_category_3', 'main_category_4',
       'combined_main_category', 'sub_category', ' neighboorhood_area_km2 ',
       '  resident_population_total ', ' population_age_0_14 ',
       ' population_15_19 ', ' population_20_24 ', ' population_25_34 ',
       ' population_35_44 ', ' population_45_54 ', ' population_54_64 ',
       ' population_65+ ', ' resident_per_km2 ',
       ' brut_income_an

In [131]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0.1,Unnamed: 0,original_index,tripadvisor_id,price_level,ta_price_level,price_level_combined,rating,user_ratings_total,ta_rating,ta_user_ratings_total,...,population_age_0_14,population_15_19,population_20_24,population_25_34,population_35_44,population_45_54,population_54_64,population_65+,resident_per_km2,brut_income_annual_per_person
count,1613.0,1613.0,959.0,592.0,791.0,910.0,1393.0,1393.0,896.0,896.0,...,1613.0,1613.0,1613.0,1613.0,1613.0,1613.0,1613.0,1613.0,1613.0,1613.0
mean,806.0,985.544327,9955535.0,1.959459,1.969659,0.588645,4.298126,194.832735,4.032924,103.012277,...,2941.856169,963.586485,1431.595784,4564.722877,4140.012399,3589.794172,3015.025418,3501.278983,15656.592684,124549.434594
std,465.777307,575.34485,7577631.0,0.534791,0.44901,0.157276,0.494695,385.398577,0.626936,204.095078,...,1203.196211,371.513422,611.197059,2046.078977,1663.05891,1407.847004,1150.857196,1232.8496,6100.611208,33048.914179
min,0.0,0.0,697832.0,1.0,1.0,0.25,1.0,1.0,1.0,0.0,...,522.0,174.0,220.0,313.0,477.0,619.0,534.0,700.0,1174.0,94723.0
25%,403.0,481.0,3393660.0,2.0,2.0,0.583333,4.1,23.0,3.5,11.0,...,2666.0,780.0,996.0,3501.0,3555.0,3028.0,2638.0,3155.0,8608.0,103373.0
50%,806.0,983.0,8308648.0,2.0,2.0,0.583333,4.3,92.0,4.0,41.0,...,3423.0,1156.0,1611.0,5163.0,5389.0,4675.0,3853.0,4059.0,16282.0,103373.0
75%,1209.0,1468.0,15310680.0,2.0,2.0,0.666667,4.6,221.0,4.5,114.25,...,3886.0,1264.0,1937.0,5621.0,5549.0,4812.0,3966.0,4362.0,19143.0,132730.0
max,1612.0,2019.0,24861940.0,4.0,3.0,1.0,5.0,7885.0,5.0,2304.0,...,6055.0,2026.0,2299.0,7285.0,5620.0,5138.0,4618.0,5625.0,23371.0,198684.0


## Merging price levels and ratings

In [147]:
def chooser(row):
    '''
    Combine the Google and TripAdvisor price levels into one normalized value.

    Google's ``price_level`` is divided by 4 and TripAdvisor's
    ``ta_price_level`` by 3 so both are on a comparable scale.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'price_level' and 'ta_price_level'.

    Returns
    -------
    float
        The mean of the two normalized levels when both are present, the single
        normalized level when only one is present, and NaN when both are missing.
    '''
    google_level = row['price_level']
    ta_level = row['ta_price_level']
    has_google = not pd.isna(google_level)
    has_ta = not pd.isna(ta_level)

    if has_google and has_ta:
        return (ta_level / 3 + google_level / 4) / 2
    if has_ta:
        return ta_level / 3
    if has_google:
        return google_level / 4
    # Explicit NaN (instead of an implicit None) keeps the result numeric.
    return np.nan

In [147]:
def combinator(row):
    '''
    Combine the Google and TripAdvisor ratings, weighting each by its review count.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'rating', 'user_ratings_total', 'ta_rating' and
        'ta_user_ratings_total'.

    Returns
    -------
    tuple
        ``(combined_rating, total_ratings)``:

        * both ratings present: the review-count-weighted mean and the summed
          review count (plain mean if the summed count is 0);
        * only one rating present: that rating and its own review count;
        * both missing: ``(nan, nan)``.
    '''
    google_rating = row['rating']
    ta_rating = row['ta_rating']
    has_google = not pd.isna(google_rating)
    has_ta = not pd.isna(ta_rating)

    if not has_google and not has_ta:
        # Same 2-tuple shape as every other branch so callers can always unpack.
        return np.nan, np.nan
    if not has_google:
        return ta_rating, row['ta_user_ratings_total']
    if not has_ta:
        return google_rating, row['user_ratings_total']

    google_votes = row['user_ratings_total']
    ta_votes = row['ta_user_ratings_total']
    total_votes = ta_votes + google_votes
    if total_votes == 0:
        # No reviews on either side means there are no weights: fall back to the plain mean.
        return (ta_rating + google_rating) / 2, total_votes
    # Divide by the SUM of the counts (the parentheses matter).
    weighted = (ta_rating * ta_votes + google_rating * google_votes) / total_votes
    return weighted, total_votes

In [133]:
# chooser takes exactly one argument (the row), so pass it straight to apply;
# the previous lambda also passed df and raised a TypeError.
df['price_level_combined'] = df.apply(chooser, axis=1)

In [16]:
# Keep only the columns listed here, in this order; anything else is discarded.
# Several labels carry leading/trailing spaces and must be matched exactly.
df = df.loc[:, [
    # identifiers and listing info
    'original_index', 'place_id', 'tripadvisor_id', 'fsq_id', 'name',
    'business_status', 'types',
    # price levels
    'price_level', 'ta_price_level', 'price_level_combined',
    # ratings and review counts
    'rating', 'user_ratings_total', 'ta_rating', 'ta_user_ratings_total',
    ' combined_rating ',
    # location and contact
    'geometry.location.lat', 'geometry.location.lng',
    'formatted_address', 'zip_code', 'city',
    'international_phone_number', 'website',
    # categories
    'google_category',
    'ta_4sq_cuisine1', 'ta_4sq_cuisine2', 'ta_4sq_cuisine3',
    'ta_4sq_cuisine4', 'ta_4sq_cuisine5',
    'main_category_1', 'main_category_2', 'main_category_3',
    'main_category_4',
    'combined_main_category', 'sub_category',
    # area, population and income columns
    ' neighboorhood_area_km2 ', '  resident_population_total ',
    ' population_age_0_14 ', ' population_15_19 ', ' population_20_24 ',
    ' population_25_34 ', ' population_35_44 ', ' population_45_54 ',
    ' population_54_64 ', ' population_65+ ', ' resident_per_km2 ',
    ' brut_income_annual_per_person ',
]]

In [18]:
# index=False: do not write the row index. Otherwise every save/load round trip
# adds another 'Unnamed: 0' column to the file.
# NOTE(review): this overwrites the same file the notebook loads from — confirm that is intended.
df.to_csv('../raw_data/raw_data_combined_v1.07.csv', encoding='utf-8-sig', index=False)

# Training K-nearest neighbours (KNN)

In [143]:
# k-nearest-neighbours classifier with scikit-learn's default hyper-parameters.
knn = KNeighborsClassifier()

In [144]:
# Narrow the frame to the modelling columns: coordinates, both rating sources
# with their review counts, and the combined price level.
df = df.loc[:, [
    'geometry.location.lat',
    'geometry.location.lng',
    'rating',
    'user_ratings_total',
    'ta_rating',
    'ta_user_ratings_total',
    'price_level_combined',
]]

In [145]:
# Preview the reduced frame (pandas truncates the display to the first and last rows).
df

Unnamed: 0,geometry.location.lat,geometry.location.lng,rating,user_ratings_total,ta_rating,ta_user_ratings_total,price_level_combined
0,46.205232,6.114945,4.2,158.0,3.5,34.0,0.583333
1,46.204931,6.114021,3.7,141.0,2.0,3.0,
2,46.205752,6.114763,3.9,56.0,4.0,5.0,0.666667
3,46.196924,6.115081,4.5,113.0,,,0.500000
4,46.205248,6.115503,,,,,
...,...,...,...,...,...,...,...
1608,46.192811,6.169873,4.2,100.0,3.5,31.0,0.583333
1609,46.199892,6.174117,4.3,376.0,3.5,123.0,0.583333
1610,46.199773,6.174561,4.3,250.0,,,0.500000
1611,46.199695,6.175007,4.8,15.0,4.0,3.0,0.666667


In [146]:
# Dtypes and non-null counts: shows how many rows lack a rating or a price level.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1613 entries, 0 to 1612
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   geometry.location.lat  1613 non-null   float64
 1   geometry.location.lng  1613 non-null   float64
 2   rating                 1393 non-null   float64
 3   user_ratings_total     1393 non-null   float64
 4   ta_rating              896 non-null    float64
 5   ta_user_ratings_total  896 non-null    float64
 6   price_level_combined   910 non-null    float64
dtypes: float64(7)
memory usage: 88.3 KB


In [139]:
# Dropping all rows without a price level (a NaN compares False against > 0).
X = df[df['price_level_combined'] > 0]

In [140]:
# Dropping all rows without a TripAdvisor rating (a NaN compares False against > 0).
X = X[X['ta_rating'] > 0]

In [141]:
# Dropping all rows without a Google rating (a NaN compares False against > 0).
X = X[X['rating'] > 0]

In [142]:
# Confirm that no missing values remain after the three filters.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 787 entries, 0 to 1612
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   geometry.location.lat  787 non-null    float64
 1   geometry.location.lng  787 non-null    float64
 2   rating                 787 non-null    float64
 3   user_ratings_total     787 non-null    float64
 4   ta_rating              787 non-null    float64
 5   ta_user_ratings_total  787 non-null    float64
 6   price_level_combined   787 non-null    float64
dtypes: float64(7)
memory usage: 49.2 KB


In [117]:
# Target: the combined, normalized price level.
# NOTE(review): this column holds fractional values (e.g. 0.583333), which
# scikit-learn classifiers reject as a 'continuous' label type — see the
# ValueError recorded for the logistic regression. The recorded value_counts
# output further down shows a target named 'price_level' with values 1.0-4.0,
# so earlier runs apparently used a different target. Confirm which is intended.
y = X['price_level_combined']

In [118]:
# Remove the target from the feature frame. Rebinding (instead of inplace=True)
# avoids mutating a filtered slice of df, which can trigger SettingWithCopyWarning.
X = X.drop(columns=['price_level_combined'])

In [119]:
# Min-max scale every feature column.
# NOTE(review): the scaler is fitted on all rows before cross-validation, so the
# held-out folds influence the scaling; consider a Pipeline if that matters.
X_scale = MinMaxScaler().fit_transform(X)

In [120]:
# Inspect the scaled feature matrix.
# NOTE(review): the recorded output has 4 columns, but X should have 6 after
# dropping the target from the 7-column frame — the output looks stale; re-run to confirm.
X_scale

array([[7.33333333e-01, 1.99137494e-02, 6.25000000e-01, 1.47569444e-02],
       [6.33333333e-01, 6.97615424e-03, 7.50000000e-01, 2.17013889e-03],
       [1.00000000e+00, 1.26839168e-04, 8.75000000e-01, 3.03819444e-03],
       ...,
       [7.66666667e-01, 4.75646880e-02, 6.25000000e-01, 5.33854167e-02],
       [9.33333333e-01, 1.77574835e-03, 7.50000000e-01, 1.30208333e-03],
       [7.66666667e-01, 1.57280568e-02, 7.50000000e-01, 6.51041667e-03]])

In [121]:
from sklearn.model_selection import cross_val_score
# Fresh default KNN classifier for the cross-validation and grid search.
knn = KNeighborsClassifier()

In [122]:
# 10-fold cross-validated accuracy of the default KNN, using all cores;
# error_score='raise' makes a failing fold raise instead of being scored as NaN.
cross = cross_val_score(
    knn,
    X_scale,
    y,
    cv=10,
    scoring='accuracy',
    n_jobs=-1,
    error_score='raise',
)



In [123]:
# Accuracy obtained on each of the 10 folds.
cross

array([0.48101266, 0.46835443, 0.40506329, 0.5443038 , 0.4556962 ,
       0.55696203, 0.51898734, 0.48717949, 0.42307692, 0.55128205])

In [96]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [97]:
# Logistic-regression classifier with default hyper-parameters.
logi = LogisticRegression()

In [106]:
# NOTE(review): as recorded below, this raises
# "ValueError: Unknown label type: 'continuous'" because y holds fractional
# values; the target must be discrete class labels before a classifier can fit.
logi.fit(X_scale,y)

ValueError: Unknown label type: 'continuous'

In [104]:
# Grid search over logistic-regression solvers, 7-fold cross-validated.
# Scored with accuracy: LogisticRegression is a classifier, so the regression
# metric 'r2' used before was not meaningful. This also matches the KNN grid.
# Only two solvers are searched; 'liblinear', 'sag' and 'saga' are left out.
grid = GridSearchCV(
    logi,
    param_grid={
        'solver': ['newton-cg', 'lbfgs'],
    },
    n_jobs=-1,
    scoring='accuracy',
    cv=7,
    error_score='raise',
)

In [124]:
# Grid search for KNN: n_neighbors from 2 to 9 and Minkowski power p from 1 to 5,
# scored by accuracy with 7-fold cross-validation on all cores.
grid = GridSearchCV(
    estimator=knn,
    param_grid={
        'n_neighbors': np.arange(2, 10),
        'p': list(range(1, 6)),
    },
    scoring='accuracy',
    cv=7,
    n_jobs=-1,
    error_score='raise',
)

In [125]:
# Run the search: every parameter combination is cross-validated on the scaled features.
grid.fit(X_scale,y)



In [126]:
# Parameter combination with the best mean cross-validated score.
grid.best_params_

{'n_neighbors': 8, 'p': 4}

In [127]:
# Mean cross-validated score of the best parameter combination.
grid.best_score_

0.5210064114141232

In [40]:
# Raw per-candidate timing and score arrays from the search.
# NOTE(review): the recorded output lists 16 candidates, while the KNN grid
# defines 8 x 5 = 40 — the output appears to come from an earlier grid; re-run to refresh.
grid.cv_results_

{'mean_fit_time': array([0.00212921, 0.00195878, 0.00291678, 0.00172339, 0.00198354,
        0.00184563, 0.00144359, 0.00158535, 0.00156345, 0.00188507,
        0.0024411 , 0.00193582, 0.00161968, 0.00181198, 0.00147976,
        0.00124645]),
 'std_fit_time': array([3.75204814e-04, 8.36141726e-05, 2.07281086e-03, 5.15884963e-04,
        5.85054027e-04, 6.28155446e-04, 1.58166011e-04, 2.06862233e-04,
        2.62657486e-04, 4.43643117e-04, 2.52483174e-03, 5.86725057e-04,
        2.12920810e-04, 8.21558686e-04, 3.22535736e-04, 1.92399509e-04]),
 'mean_score_time': array([0.00556653, 0.00506735, 0.00513666, 0.00571881, 0.00545277,
        0.00661421, 0.00507481, 0.004687  , 0.00471221, 0.00417522,
        0.00706962, 0.00562651, 0.00495335, 0.00443046, 0.00501616,
        0.00376119]),
 'std_score_time': array([0.00122196, 0.00108004, 0.0012567 , 0.00203684, 0.0014589 ,
        0.00335495, 0.00070731, 0.00072123, 0.00067703, 0.00060073,
        0.00224701, 0.0011698 , 0.00098573, 0.000339

In [43]:
# Share of each target value (normalize=True returns fractions instead of counts).
# NOTE(review): the recorded output is named 'price_level', not
# 'price_level_combined', so it was produced with a different y; re-run to confirm.
y.value_counts(normalize=True)

2.0    0.784861
1.0    0.117530
3.0    0.083665
4.0    0.013944
Name: price_level, dtype: float64

In [56]:
from sklearn.model_selection import train_test_split

In [59]:
# Fixed random_state so the split, and every score derived from it, is reproducible.
# NOTE(review): this splits the unscaled X, although KNN is distance-based and
# the cross-validation used X_scale — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [61]:
# KNN classifier with 6 neighbours and Minkowski power p=1.
# NOTE(review): these differ from the grid search's recorded best parameters
# ({'n_neighbors': 8, 'p': 4}) — confirm the choice is deliberate.
knn2 = KNeighborsClassifier(n_neighbors=6, p=1)

In [73]:
# Fit before scoring: in notebook order this cell runs before the fit cell, so
# scoring alone fails on a fresh kernel (Restart & Run All).
knn2.fit(X_train, y_train)
# Mean accuracy on the held-out test split.
knn2.score(X_test, y_test)

0.753968253968254

In [63]:
# Fit the classifier on the training split.
knn2.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=6, p=1,
                     weights='uniform')

In [64]:
# Predicted class for every row of the test split.
prediction = knn2.predict(X_test)

In [66]:
# How often each class is predicted.
pd.Series(prediction).value_counts()

2.0    110
1.0      9
3.0      7
dtype: int64

In [68]:
# Side-by-side table of predictions and true values ('valor_real' = actual value).
# The recorded output suggests the rows keep y_test's index labels.
df_analise = pd.DataFrame({"prediction":prediction,"valor_real":y_test})

In [70]:
# Show only the misclassified rows (prediction differs from the actual value).
df_analise.loc[df_analise['prediction'].ne(df_analise['valor_real'])]

Unnamed: 0,prediction,valor_real
335,1.0,2.0
1137,1.0,2.0
806,3.0,2.0
867,2.0,1.0
958,2.0,1.0
756,2.0,1.0
1362,1.0,2.0
868,2.0,3.0
107,2.0,1.0
1612,2.0,1.0
