## Importing

In [35]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

In [8]:
df = pd.read_csv('../raw_data/raw_data_combined_v1.06.csv', encoding='utf-8-sig')
df.columns

Index(['original_index', 'place_id', 'tripadvisor_id', 'fsq_id', 'name',
       'business_status', 'types', 'price_level', 'ta_price_level', 'rating',
       'user_ratings_total', 'ta_rating', 'ta_user_ratings_total',
       ' combined_rating ', 'geometry.location.lat', 'geometry.location.lng',
       'formatted_address', 'zip_code', 'city', 'international_phone_number',
       'website', 'google_category', 'ta_4sq_cuisine1', 'ta_4sq_cuisine2',
       'ta_4sq_cuisine3', 'ta_4sq_cuisine4', 'ta_4sq_cuisine5',
       'main_category_1', 'main_category_2', 'main_category_3',
       'main_category_4', 'combined_main_category', 'sub_category',
       ' neighboorhood_area_km2 ', '  resident_population_total ',
       ' population_age_0_14 ', ' population_15_19 ', ' population_20_24 ',
       ' population_25_34 ', ' population_35_44 ', ' population_45_54 ',
       ' population_54_64 ', ' population_65+ ', ' resident_per_km2 ',
       ' brut_income_annual_per_person '],
      dtype='object')

# Training K-means

In [9]:
knn = KNeighborsClassifier(n_neighbors=2)

In [11]:
#df = df[['geometry.location.lat', 'geometry.location.lng', 'rating',
#         'user_ratings_total','ta_rating','ta_user_ratings_total','price_level']]

In [44]:
df = df[['rating','user_ratings_total','ta_rating','ta_user_ratings_total','price_level']]

In [45]:
df

Unnamed: 0,rating,user_ratings_total,ta_rating,ta_user_ratings_total,price_level
0,4.2,158.0,3.5,34.0,2.0
1,3.7,141.0,2.0,3.0,
2,3.9,56.0,4.0,5.0,
3,4.5,113.0,,,2.0
4,,,,,
...,...,...,...,...,...
1608,4.2,100.0,3.5,31.0,2.0
1609,4.3,376.0,3.5,123.0,2.0
1610,4.3,250.0,,,2.0
1611,4.8,15.0,4.0,3.0,


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1613 entries, 0 to 1612
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   geometry.location.lat  1613 non-null   float64
 1   geometry.location.lng  1613 non-null   float64
 2   rating                 1393 non-null   float64
 3   user_ratings_total     1393 non-null   float64
 4   ta_rating              896 non-null    float64
 5   ta_user_ratings_total  896 non-null    float64
 6   price_level            592 non-null    float64
dtypes: float64(7)
memory usage: 88.3 KB


In [46]:
X = df.loc[df['price_level']>0] #Droping all rows without price level

In [47]:
X = X.loc[X['ta_rating']>0] #Droping all rows without rating (6 rows)

In [48]:
y = X['price_level']

In [49]:
X.drop(columns=['price_level'], inplace=True)

In [50]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 502 entries, 0 to 1612
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   rating                 502 non-null    float64
 1   user_ratings_total     502 non-null    float64
 2   ta_rating              502 non-null    float64
 3   ta_user_ratings_total  502 non-null    float64
dtypes: float64(4)
memory usage: 19.6 KB


In [51]:
X_scale = MinMaxScaler().fit_transform(X)

In [23]:
knn.fit(X,y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                     weights='uniform')

In [24]:
knn.score(X,y)

0.8306772908366534

In [26]:
from sklearn.model_selection import cross_val_score
cross = cross_val_score(knn,X,y,scoring='accuracy',cv=10,n_jobs=-1)



In [27]:
cross

array([0.64705882, 0.66666667, 0.74      , 0.66      , 0.54      ,
       0.72      , 0.74      , 0.7       , 0.62      , 0.68      ])

In [28]:
from sklearn.model_selection import GridSearchCV

In [52]:
grid = GridSearchCV(knn, param_grid={'n_neighbors':np.arange(2,10),'p':[1,2]}, n_jobs=-1, scoring='accuracy', cv=7)

In [53]:
grid.fit(X_scale,y)

GridSearchCV(cv=7, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=2, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': array([2, 3, 4, 5, 6, 7, 8, 9]),
                         'p': [1, 2]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [54]:
grid.best_params_

{'n_neighbors': 6, 'p': 1}

In [55]:
grid.best_score_

0.7769673597138386

In [40]:
grid.cv_results_

{'mean_fit_time': array([0.00212921, 0.00195878, 0.00291678, 0.00172339, 0.00198354,
        0.00184563, 0.00144359, 0.00158535, 0.00156345, 0.00188507,
        0.0024411 , 0.00193582, 0.00161968, 0.00181198, 0.00147976,
        0.00124645]),
 'std_fit_time': array([3.75204814e-04, 8.36141726e-05, 2.07281086e-03, 5.15884963e-04,
        5.85054027e-04, 6.28155446e-04, 1.58166011e-04, 2.06862233e-04,
        2.62657486e-04, 4.43643117e-04, 2.52483174e-03, 5.86725057e-04,
        2.12920810e-04, 8.21558686e-04, 3.22535736e-04, 1.92399509e-04]),
 'mean_score_time': array([0.00556653, 0.00506735, 0.00513666, 0.00571881, 0.00545277,
        0.00661421, 0.00507481, 0.004687  , 0.00471221, 0.00417522,
        0.00706962, 0.00562651, 0.00495335, 0.00443046, 0.00501616,
        0.00376119]),
 'std_score_time': array([0.00122196, 0.00108004, 0.0012567 , 0.00203684, 0.0014589 ,
        0.00335495, 0.00070731, 0.00072123, 0.00067703, 0.00060073,
        0.00224701, 0.0011698 , 0.00098573, 0.000339

In [43]:
y.value_counts(normalize=True)

2.0    0.784861
1.0    0.117530
3.0    0.083665
4.0    0.013944
Name: price_level, dtype: float64

In [56]:
from sklearn.model_selection import train_test_split

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [61]:
knn2 = KNeighborsClassifier(n_neighbors=6, p=1)

In [73]:
knn2.score(X_test,y_test)

0.753968253968254

In [63]:
knn2.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=6, p=1,
                     weights='uniform')

In [64]:
prediction = knn2.predict(X_test)

In [66]:
pd.Series(prediction).value_counts()

2.0    110
1.0      9
3.0      7
dtype: int64

In [68]:
df_analise = pd.DataFrame({"prediction":prediction,"valor_real":y_test})

In [70]:
df_analise[df_analise['prediction'] != df_analise['valor_real']]

Unnamed: 0,prediction,valor_real
335,1.0,2.0
1137,1.0,2.0
806,3.0,2.0
867,2.0,1.0
958,2.0,1.0
756,2.0,1.0
1362,1.0,2.0
868,2.0,3.0
107,2.0,1.0
1612,2.0,1.0
