## Importing

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
#from sklearn.linear_model import LogisticRegression

In [112]:
df = pd.read_csv('../data/data_combined_v1.00.csv', encoding='utf-8-sig')
df.columns

Index(['place_id', 'tripadvisor_id', 'fsq_id', 'name', 'business_status',
       'types', 'price_level', 'ta_price_level', 'rating',
       'user_ratings_total', 'ta_rating', 'ta_user_ratings_total',
       'combined_rating', 'geometry.location.lat', 'geometry.location.lng',
       'formatted_address', 'zip_code', 'city', 'international_phone_number',
       'website', 'google_category', 'ta_4sq_cuisine1', 'ta_4sq_cuisine2',
       'ta_4sq_cuisine3', 'ta_4sq_cuisine4', 'ta_4sq_cuisine5',
       'main_category_1', 'main_category_2', 'main_category_3',
       'main_category_4', 'combined_main_category', 'sub_category',
       'neighboorhood_area_km2', 'resident_population_total',
       'population_age_0_14', 'population_15_19', 'population_20_24',
       'population_25_34', 'population_35_44', 'population_45_54',
       'population_54_64', 'population_65+', 'resident_per_km2',
       'brut_income_annual_per_person', 'cluster'],
      dtype='object')

In [90]:
df.describe()

Unnamed: 0.1,Unnamed: 0,original_index,tripadvisor_id,price_level,ta_price_level,price_level_combined,rating,user_ratings_total,ta_rating,ta_user_ratings_total,...,population_age_0_14,population_15_19,population_20_24,population_25_34,population_35_44,population_45_54,population_54_64,population_65+,resident_per_km2,brut_income_annual_per_person
count,1613.0,1613.0,959.0,592.0,791.0,910.0,1393.0,1393.0,896.0,896.0,...,1613.0,1613.0,1613.0,1613.0,1613.0,1613.0,1613.0,1613.0,1613.0,1613.0
mean,806.0,985.544327,9955535.0,1.959459,1.969659,0.588645,4.298126,194.832735,4.032924,103.012277,...,2941.856169,963.586485,1431.595784,4564.722877,4140.012399,3589.794172,3015.025418,3501.278983,15656.592684,124549.434594
std,465.777307,575.34485,7577631.0,0.534791,0.44901,0.157276,0.494695,385.398577,0.626936,204.095078,...,1203.196211,371.513422,611.197059,2046.078977,1663.05891,1407.847004,1150.857196,1232.8496,6100.611208,33048.914179
min,0.0,0.0,697832.0,1.0,1.0,0.25,1.0,1.0,1.0,0.0,...,522.0,174.0,220.0,313.0,477.0,619.0,534.0,700.0,1174.0,94723.0
25%,403.0,481.0,3393660.0,2.0,2.0,0.583333,4.1,23.0,3.5,11.0,...,2666.0,780.0,996.0,3501.0,3555.0,3028.0,2638.0,3155.0,8608.0,103373.0
50%,806.0,983.0,8308648.0,2.0,2.0,0.583333,4.3,92.0,4.0,41.0,...,3423.0,1156.0,1611.0,5163.0,5389.0,4675.0,3853.0,4059.0,16282.0,103373.0
75%,1209.0,1468.0,15310680.0,2.0,2.0,0.666667,4.6,221.0,4.5,114.25,...,3886.0,1264.0,1937.0,5621.0,5549.0,4812.0,3966.0,4362.0,19143.0,132730.0
max,1612.0,2019.0,24861940.0,4.0,3.0,1.0,5.0,7885.0,5.0,2304.0,...,6055.0,2026.0,2299.0,7285.0,5620.0,5138.0,4618.0,5625.0,23371.0,198684.0


## Merging price levels

In [107]:
def classifier(level):
    if level < .25:
        return 1
    elif level < .5:
        return 2
    elif level < .75:
        return 3
    else:
        return 4

In [175]:
def chooser(row):
    '''
    Combines the price levels from Google and TripAdvisor normalizing both, or choosing the existing value
    '''
    if pd.isna(row['price_level']) and pd.isna(row['ta_price_level']):
        #Activate Only if the model is ready
        if pd.isna(row['combined_rating']):
            pass
        else:
            return classifier(knn2.predict([row[feat]])[0])
        pass
    elif pd.isna(row['price_level']):
        return classifier(row['ta_price_level']/3)
    elif pd.isna(row['ta_price_level']):
        return classifier(row['price_level']/4)
    else:
        return classifier((row['ta_price_level']/3+row['price_level']/4)/2)

In [176]:
df['price_level_combined'] = df.apply(chooser,axis=1)









In [114]:
df.columns

Index(['place_id', 'tripadvisor_id', 'fsq_id', 'name', 'business_status',
       'types', 'price_level', 'ta_price_level', 'rating',
       'user_ratings_total', 'ta_rating', 'ta_user_ratings_total',
       'combined_rating', 'geometry.location.lat', 'geometry.location.lng',
       'formatted_address', 'zip_code', 'city', 'international_phone_number',
       'website', 'google_category', 'ta_4sq_cuisine1', 'ta_4sq_cuisine2',
       'ta_4sq_cuisine3', 'ta_4sq_cuisine4', 'ta_4sq_cuisine5',
       'main_category_1', 'main_category_2', 'main_category_3',
       'main_category_4', 'combined_main_category', 'sub_category',
       'neighboorhood_area_km2', 'resident_population_total',
       'population_age_0_14', 'population_15_19', 'population_20_24',
       'population_25_34', 'population_35_44', 'population_45_54',
       'population_54_64', 'population_65+', 'resident_per_km2',
       'brut_income_annual_per_person', 'cluster', 'price_level_combined'],
      dtype='object')

In [115]:
df = df[['place_id', 'tripadvisor_id', 'fsq_id', 'name', 'business_status',
       'types', 'price_level', 'ta_price_level', 'price_level_combined', 'rating',
       'user_ratings_total', 'ta_rating', 'ta_user_ratings_total',
       'combined_rating', 'geometry.location.lat', 'geometry.location.lng',
       'formatted_address', 'zip_code', 'city', 'international_phone_number',
       'website', 'google_category', 'ta_4sq_cuisine1', 'ta_4sq_cuisine2',
       'ta_4sq_cuisine3', 'ta_4sq_cuisine4', 'ta_4sq_cuisine5',
       'main_category_1', 'main_category_2', 'main_category_3',
       'main_category_4', 'combined_main_category', 'sub_category',
       'neighboorhood_area_km2', 'resident_population_total',
       'population_age_0_14', 'population_15_19', 'population_20_24',
       'population_25_34', 'population_35_44', 'population_45_54',
       'population_54_64', 'population_65+', 'resident_per_km2',
       'brut_income_annual_per_person', 'cluster']]

In [189]:
df.to_csv('../data/data_combined_v1.01.csv', encoding='utf-8-sig')

# Training K-means

In [117]:
df.isnull().sum()

place_id                            0
tripadvisor_id                    654
fsq_id                            936
name                                0
business_status                     0
types                               0
price_level                      1021
ta_price_level                    819
price_level_combined              703
rating                            220
user_ratings_total                220
ta_rating                         717
ta_user_ratings_total             717
combined_rating                   197
geometry.location.lat               0
geometry.location.lng               0
formatted_address                   0
zip_code                            0
city                                0
international_phone_number        219
website                           963
google_category                     0
ta_4sq_cuisine1                     6
ta_4sq_cuisine2                    66
ta_4sq_cuisine3                   158
ta_4sq_cuisine4                   193
ta_4sq_cuisi

In [118]:
feat = ['geometry.location.lat', 'geometry.location.lng',
        'combined_rating','zip_code',
        'neighboorhood_area_km2', 'resident_population_total',
        'population_age_0_14', 'population_15_19', 'population_20_24',
        'population_25_34', 'population_35_44', 'population_45_54',
        'population_54_64', 'population_65+', 'resident_per_km2',
        'brut_income_annual_per_person'
       ]

targ = ['price_level_combined']

In [134]:
X = df[feat]
y = df[targ]

In [135]:
y = y.loc[X['combined_rating']>0] #Droping all rows without rating 
X = X.loc[X['combined_rating']>0]

In [136]:
X = X.loc[y['price_level_combined']>0] #Dropping all the rows witouth price level
y = y.loc[y['price_level_combined']>0]

In [122]:
X.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
geometry.location.lat,908.0,46.204179,0.007552,46.182515,46.200023,46.203367,46.209768,46.229425
geometry.location.lng,908.0,6.144197,0.009713,6.113736,6.138609,6.14444,6.14892,6.175007
combined_rating,908.0,4.235573,0.332212,2.3,4.1,4.2,4.5,5.0
zip_code,908.0,1204.987885,6.502384,1201.0,1201.0,1204.0,1205.0,1292.0
neighboorhood_area_km2,908.0,1.581344,0.637841,1.06,1.35,1.43,1.56,7.69
resident_population_total,908.0,24024.323789,9473.541058,3790.0,20878.0,29787.0,33448.0,35778.0
population_age_0_14,908.0,2925.263216,1203.642776,522.0,2666.0,3423.0,3886.0,6055.0
population_15_19,908.0,958.830396,370.951166,174.0,780.0,1156.0,1264.0,2026.0
population_20_24,908.0,1430.302863,616.312836,220.0,996.0,1611.0,1937.0,2299.0
population_25_34,908.0,4549.694934,2071.918937,313.0,3501.0,5163.0,5621.0,7285.0


In [137]:
y.describe().T

AttributeError: 'numpy.ndarray' object has no attribute 'describe'

In [123]:
X.isnull().sum(),y.isnull().sum()

(geometry.location.lat            0
 geometry.location.lng            0
 combined_rating                  0
 zip_code                         0
 neighboorhood_area_km2           0
 resident_population_total        0
 population_age_0_14              0
 population_15_19                 0
 population_20_24                 0
 population_25_34                 0
 population_35_44                 0
 population_45_54                 0
 population_54_64                 0
 population_65+                   0
 resident_per_km2                 0
 brut_income_annual_per_person    0
 dtype: int64,
 price_level_combined    0
 dtype: int64)

In [138]:
X_scale = MinMaxScaler().fit_transform(X)

In [139]:
knn = KNeighborsClassifier(n_neighbors=5)

In [141]:
knn.fit(X_scale,np.ravel(y))

In [143]:
cross = cross_val_score(knn,X_scale,y,cv=10,n_jobs=-1, error_score='raise', scoring='accuracy')

In [144]:
cross

array([0.75824176, 0.64835165, 0.7032967 , 0.74725275, 0.54945055,
       0.57142857, 0.62637363, 0.61538462, 0.66666667, 0.67777778])

In [104]:
grid = GridSearchCV(logi, param_grid={
    'solver':['newton-cg', 'lbfgs', 
#               'liblinear', 'sag', 'saga'
             ],
}, n_jobs=-1, 
                    scoring='r2', 
                    cv=7, error_score='raise')

In [157]:
grid = GridSearchCV(knn, param_grid={
                        'n_neighbors':np.arange(2,50),
                        'p':[1,2],
                        'weights':['uniform', 'distance'],
                    },
                    n_jobs=-1, 
                    scoring='accuracy', 
                    cv=7,
                    error_score='raise')

In [158]:
grid.fit(X_scale,y)

In [159]:
grid.best_params_

{'n_neighbors': 28, 'p': 2, 'weights': 'uniform'}

In [160]:
grid.best_score_

0.7533009626032882

In [161]:
grid.cv_results_

{'mean_fit_time': array([0.00047554, 0.00058143, 0.0006907 , 0.00048975, 0.00046744,
        0.00050507, 0.00052582, 0.00046689, 0.00044833, 0.00044363,
        0.00060977, 0.00049799, 0.00050943, 0.0003548 , 0.00044462,
        0.00050017, 0.00046907, 0.00050129, 0.00053419, 0.00050412,
        0.00048872, 0.00048201, 0.00048791, 0.0003672 , 0.00033518,
        0.00031686, 0.00038062, 0.00041103, 0.00051645, 0.00046263,
        0.00053859, 0.00049455, 0.00049363, 0.00049465, 0.00047244,
        0.00046219, 0.00051441, 0.00050742, 0.00067977, 0.00047558,
        0.00050276, 0.00049942, 0.00056216, 0.00047289, 0.00074768,
        0.00050552, 0.00045395, 0.00046069, 0.0004405 , 0.00051621,
        0.00033689, 0.00036785, 0.00044979, 0.00053195, 0.00050371,
        0.00049867, 0.00048075, 0.00055603, 0.00058559, 0.00038954,
        0.00048344, 0.00060592, 0.00057234, 0.00049441, 0.00052541,
        0.00064087, 0.00056434, 0.00051165, 0.00054703, 0.00045409,
        0.00049206, 0.00039295,

In [162]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [163]:
knn2 = KNeighborsRegressor(n_neighbors=28, p=2, weights='uniform')

In [164]:
knn2.fit(X_train,y_train)

In [165]:
knn2.score(X_test,y_test)

0.047198240303584105

In [59]:
prediction = knn2.predict(X_test)

## Filling Up the missing values

In [168]:
def chooser2(row):
    '''
    Combines the price levels from Google and TripAdvisor normalizing both, or choosing the existing value
    '''
    if pd.isna(row['price_level']) and pd.isna(row['ta_price_level']):
        if pd.isna(row['rating']) or pd.isna(row['ta_rating']):
            pass
        else:
            return knn2.predict([row[feat]])[0]
    else:
        pass

In [169]:
df['price_level_combined'] = df.apply(chooser2,axis=1)



In [99]:
df = df[['original_index', 'place_id', 'tripadvisor_id', 'fsq_id', 'name',
       'business_status', 'types', 'price_level', 'ta_price_level',
       'price_level_combined', 'price_level_predicted', 'rating', 'user_ratings_total', 'ta_rating',
       'ta_user_ratings_total', ' combined_rating ', 'geometry.location.lat',
       'geometry.location.lng', 'formatted_address', 'zip_code', 'city',
       'international_phone_number', 'website', 'google_category',
       'ta_4sq_cuisine1', 'ta_4sq_cuisine2', 'ta_4sq_cuisine3',
       'ta_4sq_cuisine4', 'ta_4sq_cuisine5', 'main_category_1',
       'main_category_2', 'main_category_3', 'main_category_4',
       'combined_main_category', 'sub_category', ' neighboorhood_area_km2 ',
       '  resident_population_total ', ' population_age_0_14 ',
       ' population_15_19 ', ' population_20_24 ', ' population_25_34 ',
       ' population_35_44 ', ' population_45_54 ', ' population_54_64 ',
       ' population_65+ ', ' resident_per_km2 ',
       ' brut_income_annual_per_person ']]

In [105]:
df.to_csv('../raw_data/raw_data_combined_v1.08.csv', encoding='utf-8-sig')

In [188]:
df.loc[~df['price_level_combined'].isna() & df['ta_price_level'].isna() & df['price_level'].isna()]

Unnamed: 0,place_id,tripadvisor_id,fsq_id,name,business_status,types,price_level,ta_price_level,price_level_combined,rating,...,population_15_19,population_20_24,population_25_34,population_35_44,population_45_54,population_54_64,population_65+,resident_per_km2,brut_income_annual_per_person,cluster
1,ChIJX68uubZkjEcRKKxpZ7OrgqU,13322506.0,,Café Restaurant Palma,OPERATIONAL,"['restaurant', 'point_of_interest', 'food', 'e...",,,4.0,3.7,...,1547,1937,5163,5549,4924,4290,5625,16282,109266,2
5,ChIJW8zU8rZkjEcR_Ke8t7c7hVc,,,Canteen De La Residence Gardens Du Rhône,OPERATIONAL,"['restaurant', 'food', 'point_of_interest', 'e...",,,4.0,5.0,...,1547,1937,5163,5549,4924,4290,5625,16282,109266,2
12,ChIJoxrpttF7jEcRkUB1EIDg3w4,,,Café Simeoni Fleurs - Espace Gourmand,OPERATIONAL,"['restaurant', 'food', 'point_of_interest', 'e...",,,4.0,5.0,...,1956,2148,4721,5127,5017,3956,5342,7187,123472,2
16,ChIJj3bA8zRljEcREidF78D_1A4,,94399f95f3f54a395db2022d,Café Tulipe,OPERATIONAL,"['restaurant', 'food', 'point_of_interest', 'e...",,,4.0,4.1,...,2026,2299,5081,5204,5138,4618,5432,4654,110332,2
20,ChIJ7UScbe5ljEcRzKMw2YXEW94,,,La Chamade,OPERATIONAL,"['restaurant', 'meal_takeaway', 'point_of_inte...",,,4.0,4.5,...,1547,1937,5163,5549,4924,4290,5625,16282,109266,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1592,ChIJFfqNfd9ljEcRv6Zp7VmfiYc,,5e577f832884f40008ed5a4f,Five Guys,OPERATIONAL,"['meal_takeaway', 'restaurant', 'food', 'point...",,,4.0,3.2,...,1156,1611,5621,5389,4675,3853,4059,19143,103373,0
1596,ChIJ5a1lgRRljEcRu2olk1HqT7Q,24086293.0,,El Rincon Caleño,OPERATIONAL,"['restaurant', 'food', 'point_of_interest', 'e...",,,4.0,3.7,...,780,996,3839,3699,3048,2638,3503,15694,132730,0
1598,ChIJL46f7F9ljEcRHBLsdIMBEww,23983964.0,,Sushi Shop,OPERATIONAL,"['restaurant', 'food', 'point_of_interest', 'e...",,,4.0,,...,780,996,3839,3699,3048,2638,3503,15694,132730,0
1599,ChIJlRj-eIdljEcRT0LzfPisvto,,,Brezelkönig,OPERATIONAL,"['restaurant', 'food', 'store', 'point_of_inte...",,,4.0,5.0,...,780,996,3839,3699,3048,2638,3503,15694,132730,0
