In [1]:
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances, euclidean_distances, rbf_kernel, laplacian_kernel
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [None]:
# bad metrics for this task: linear_kernel, polynomial_kernel, sigmoid_kernel
# good metrics for this task: cosine_similarity/cosine_distances, euclidean_distances, rbf_kernel, laplacian_kernel

In [2]:
# Load the raw city table.
DATA_PATH = "numbeo.csv"
df = pd.read_csv(DATA_PATH)

# Composite key per row: "<CountryName>#<CityName>".
df['id'] = df['CountryName'] + "#" + df['CityName']

# Scale every remaining column; the three label columns are left out of the features.
label_columns = ['CountryName', 'CityName', 'id']
raw_feature_values = df.drop(label_columns, axis=1).values
scaler = preprocessing.StandardScaler()
cities_features = scaler.fit_transform(raw_feature_values)

cities_features.shape

(764, 30)

In [3]:
def _distance_to_similarity(distances):
    """Divide a distance matrix by its maximum and flip it so larger means closer."""
    largest = np.max(distances)
    if largest == 0:
        # Every pairwise distance is zero (e.g. one row, or identical rows):
        # dividing would give 0/0 = NaN, so report full similarity instead.
        return np.ones_like(distances, dtype=float)
    return 1.0 - (distances / largest)


def get_similarity_matr(features):
    """Build a combined pairwise similarity matrix from a feature matrix.

    Four measures of ``features`` against itself are computed and averaged
    element-wise with equal weight:

    * cosine distance, divided by its maximum and subtracted from 1;
    * RBF kernel, divided by its maximum;
    * Laplacian kernel, divided by its maximum;
    * Euclidean distance, divided by its maximum and subtracted from 1.

    Parameters
    ----------
    features : array-like
        Input accepted by the sklearn pairwise functions
        (presumably shaped ``(n_samples, n_features)`` - confirm against caller).

    Returns
    -------
    numpy.ndarray
        Float64 matrix with one row and one column per input row; entry
        ``[i, j]`` is the mean of the four normalized measures for rows i and j.
    """
    # computing normalized desired similarities from the given features
    cos_sim = _distance_to_similarity(cosine_distances(features, features))

    rbf_ker = rbf_kernel(features, features)
    rbf_ker = rbf_ker / np.max(rbf_ker)

    lap_ker = laplacian_kernel(features, features)
    lap_ker = lap_ker / np.max(lap_ker)

    euc_sim = _distance_to_similarity(euclidean_distances(features, features))

    # Element-wise mean in one array expression (same summation order as the
    # former per-cell loop); cast keeps the float64 result the loop produced.
    averaged = (cos_sim + rbf_ker + lap_ker + euc_sim) / 4.0
    return np.asarray(averaged, dtype=np.float64)

In [4]:
# Pairwise city-to-city similarity computed from the scaled feature matrix.
similarity_matrix = get_similarity_matr(features=cities_features)

In [5]:
# Lookup table: "Country#City" id -> index label of that city's row in df.
mapping = pd.Series(data=df.index, index=df['id'])
print(mapping)

id
Albania#Tirana                0
Algeria#Algiers               1
Angola#Luanda                 2
Argentina#Buenos Aires        3
Argentina#Cordoba             4
                           ... 
Venezuela#Caracas           759
Vietnam#Da Nang             760
Vietnam#Hanoi               761
Vietnam#Ho Chi Minh City    762
Zimbabwe#Harare             763
Length: 764, dtype: int64


In [10]:
def get_similarity_score(id, mapping, similarity_matrix):
    """Return one city's similarity to every city as ``(index, score)`` pairs.

    Parameters
    ----------
    id : hashable
        City key to look up in ``mapping``. (The name shadows the builtin
        ``id`` but is kept so keyword callers keep working.)
    mapping : mapping-like
        Maps a city key to its row index in ``similarity_matrix``.
    similarity_matrix : indexable
        ``similarity_matrix[row]`` must yield the iterable of scores for that row.

    Returns
    -------
    list of tuple or int
        ``[(0, score_0), (1, score_1), ...]`` in row order, or the integer
        ``1`` (after printing an error message) when the key cannot be looked up.
    """
    try:
        index = mapping[id]
    except (KeyError, IndexError, TypeError):
        # Only lookup failures mean "unknown city"; any other exception
        # (KeyboardInterrupt, genuine bugs) must propagate instead of being hidden.
        print("ERROR: Wrong input city ID")
        return 1
    return list(enumerate(similarity_matrix[index]))

In [17]:
# Scores of every city relative to Moscow, as (row index, score) pairs.
similarity_score = get_similarity_score(
    id='Russia#Moscow',
    mapping=mapping,
    similarity_matrix=similarity_matrix,
)

In [31]:
def recsys_top_results(similarity_score, df, user_cities_list, top_n=None):
    """Rank cities by score and drop the ones the user already picked.

    Parameters
    ----------
    similarity_score : iterable of (index, score)
        Pairs whose first item is a positional row index into ``df`` and whose
        second item is the score to rank by. The input is not modified.
    df : pandas.DataFrame
        Must contain an ``'id'`` column; ids are read with ``.iloc``.
    user_cities_list : container
        City ids to leave out of the result.
    top_n : int or None, optional
        Maximum number of recommendations to return after exclusion.
        ``None`` (default) returns the full ranking, as before.

    Returns
    -------
    list of [id, score]
        Highest score first.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n is not None and top_n < 0:
        raise ValueError("top_n must be a non-negative integer or None")

    ranked = sorted(similarity_score, key=lambda pair: pair[1], reverse=True)
    city_indices = [pair[0] for pair in ranked]
    ranked_ids = df['id'].iloc[city_indices]

    top_cities = [
        [city_id, pair[1]]
        for city_id, pair in zip(ranked_ids, ranked)
        if city_id not in user_cities_list
    ]

    if top_n is None:
        return top_cities
    return top_cities[:top_n]

In [32]:
# Display only the 25 best matches rather than the full ranking of every city.
recsys_top_results(similarity_score, df, ['Russia#Moscow'])[:25]

[['Russia#Saint Petersburg', 0.8678341134574866],
 ['Russia#Sochi', 0.7992148345002578],
 ['Russia#Krasnodar', 0.7524996042717704],
 ['Russia#Kazan', 0.7426671531056478],
 ['Russia#Samara', 0.7371541580031618],
 ['Russia#Vladivostok', 0.7349797890743488],
 ['China#Hangzhou', 0.721555804011487],
 ['Italy#Rome', 0.7165138337758609],
 ['Poland#Warsaw', 0.7141223441268015],
 ['Russia#Perm', 0.7125010299094581],
 ['Russia#Novosibirsk', 0.7124843160568393],
 ['China#Xiamen', 0.7102706706208064],
 ['China#Zhuhai', 0.6969064859744706],
 ['Russia#Khabarovsk', 0.6956529501108862],
 ['Russia#Yekaterinburg', 0.6926701114754679],
 ['Panama#Panama City', 0.6926076165009112],
 ['China#Tianjin', 0.6923483017655068],
 ['Russia#Rostov-na-donu', 0.6868923030488955],
 ['China#Guangzhou', 0.6863319090583895],
 ['Thailand#Bangkok', 0.6819921543221635],
 ['Russia#Voronezh', 0.6790397289451091],
 ['Mexico#Mexico City', 0.6779784684211583],
 ['Qatar#Doha', 0.6778220753329463],
 ['China#Shanghai', 0.67500501720

## Example usage

User input data

In [48]:
# Session state for the walkthrough below; every container starts out empty.
INPUT_USER_CITIES_LIST = list()    # city ids chosen by the user so far
PRIORITIES = list()                # for future work
USER_SIMILARITY_SCORES = list()    # accumulated (index, score) pairs

Cities data

In [49]:
# Shared lookup structures passed to every update_scores call below.
CITIES_SIM_MATR = get_similarity_matr(features=cities_features)
CITIES_MAPPING = pd.Series(data=df.index, index=df['id'])

Function that updates the accumulated scores each time the user adds another city

In [61]:
def update_scores(new_city, user_city_list=None,
                  prev_sim_scores=None,
                  cities_sim_matr=None,
                  cities_mapping=None):
    """Add one city to the user's selection and fold its scores into the total.

    Parameters
    ----------
    new_city : hashable
        City id to add; looked up through ``cities_mapping``.
    user_city_list : list, optional
        Cities chosen so far. Defaults to the current ``INPUT_USER_CITIES_LIST``.
        NOTE: on success the list is extended in place and also returned.
    prev_sim_scores : list of (index, score), optional
        Accumulated scores, ordered by city index. Defaults to the current
        ``USER_SIMILARITY_SCORES``.
    cities_sim_matr : optional
        Similarity matrix. Defaults to the current ``CITIES_SIM_MATR``.
    cities_mapping : optional
        City id -> row index lookup. Defaults to the current ``CITIES_MAPPING``.

    Returns
    -------
    tuple
        ``(user_city_list, new_scores)`` on success, where ``new_scores[i]`` is
        ``(i, previous score + score for new_city)``. Returns ``(1, 1)`` when
        ``new_city`` is unknown; callers must check for this before
        overwriting their state.

    Notes
    -----
    There is no de-duplication: adding the same city twice counts it twice.
    """
    # Resolve omitted arguments at call time. Binding the module globals as
    # defaults in the signature would freeze whatever objects existed when the
    # function was defined and miss every later rebinding of those names.
    if user_city_list is None:
        user_city_list = INPUT_USER_CITIES_LIST
    if prev_sim_scores is None:
        prev_sim_scores = USER_SIMILARITY_SCORES
    if cities_sim_matr is None:
        cities_sim_matr = CITIES_SIM_MATR
    if cities_mapping is None:
        cities_mapping = CITIES_MAPPING

    new_sim_scores = get_similarity_score(new_city, cities_mapping, cities_sim_matr)

    # get_similarity_score signals an unknown city with the integer 1.
    if isinstance(new_sim_scores, int):
        return 1, 1

    # Accumulate only when there is something to accumulate onto; with empty
    # previous scores the new city's scores are used as they are instead of
    # collapsing the result to an empty list.
    if len(user_city_list) != 0 and len(prev_sim_scores) != 0:
        new_sim_scores = [
            (i, new_sim_scores[i][1] + previous[1])
            for i, previous in enumerate(prev_sim_scores)
        ]

    user_city_list.append(new_city)

    return user_city_list, new_sim_scores

User chooses first city

In [62]:
user_input = 'Russia#Moscow'
update_result = update_scores(new_city=user_input,
                              user_city_list=INPUT_USER_CITIES_LIST,
                              prev_sim_scores=USER_SIMILARITY_SCORES,
                              cities_sim_matr=CITIES_SIM_MATR,
                              cities_mapping=CITIES_MAPPING)

# update_scores returns (1, 1) for an unknown city id; keep the previous state
# in that case instead of overwriting both globals with the integer 1.
if update_result != (1, 1):
    INPUT_USER_CITIES_LIST, USER_SIMILARITY_SCORES = update_result

And gets new recommendations

In [63]:
# Display only the 25 best matches rather than the full ranking of every city.
recsys_top_results(USER_SIMILARITY_SCORES, df, INPUT_USER_CITIES_LIST)[:25]

[['Russia#Saint Petersburg', 1.582311755080607],
 ['Russia#Perm', 1.5732477930286806],
 ['Russia#Samara', 1.5426427607400055],
 ['Russia#Rostov-na-donu', 1.536369515000918],
 ['Russia#Kazan', 1.5330478551598508],
 ['Russia#Novosibirsk', 1.5221777257562796],
 ['Kyrgyzstan#Bishkek', 1.5058497633620913],
 ['Russia#Yekaterinburg', 1.5056831503761814],
 ['Russia#Krasnodar', 1.49983967938288],
 ['Ukraine#Kiev (Kyiv)', 1.4826589975459914],
 ['Russia#Omsk', 1.4794339675212909],
 ['Russia#Voronezh', 1.4744646677060276],
 ['Belarus#Minsk', 1.465073638791628],
 ['Brazil#Vitoria', 1.4622264886736285],
 ['Russia#Vladivostok', 1.4601574435647953],
 ['Mexico#Mexico City', 1.4498773474503421],
 ['Ukraine#Dnipro', 1.4478039030021954],
 ['Ukraine#Kharkiv', 1.4405910043460683],
 ['Russia#Kaliningrad', 1.4384801368072837],
 ['Mexico#Monterrey', 1.437420356902229],
 ['Azerbaijan#Baku', 1.4372001055621166],
 ['Ecuador#Quito', 1.4363201717376064],
 ['Mexico#Leon', 1.4326077642290738],
 ['Fiji#Suva', 1.430351

User chooses another city

In [64]:
user_input = 'Russia#Krasnoyarsk'
update_result = update_scores(new_city=user_input,
                              user_city_list=INPUT_USER_CITIES_LIST,
                              prev_sim_scores=USER_SIMILARITY_SCORES,
                              cities_sim_matr=CITIES_SIM_MATR,
                              cities_mapping=CITIES_MAPPING)

# update_scores returns (1, 1) for an unknown city id; keep the previous state
# in that case instead of overwriting both globals with the integer 1.
if update_result != (1, 1):
    INPUT_USER_CITIES_LIST, USER_SIMILARITY_SCORES = update_result

And gets new recommendations

In [65]:
# Display only the 25 best matches rather than the full ranking of every city.
recsys_top_results(USER_SIMILARITY_SCORES, df, INPUT_USER_CITIES_LIST)[:25]

[['Russia#Perm', 2.433038624880756],
 ['Russia#Omsk', 2.421505153994494],
 ['Russia#Novosibirsk', 2.417624605692053],
 ['Russia#Rostov-na-donu', 2.381301550528926],
 ['Russia#Samara', 2.3668043208409535],
 ['Russia#Saint Petersburg', 2.3651569488187913],
 ['Russia#Yekaterinburg', 2.360996122100799],
 ['Russia#Kazan', 2.3542776035222914],
 ['Russia#Krasnodar', 2.3089435903456357],
 ['Russia#Voronezh', 2.3058307923049046],
 ['Kyrgyzstan#Bishkek', 2.3050643975910226],
 ['Russia#Vladivostok', 2.291961506936048],
 ['Ukraine#Dnipro', 2.2572318808554415],
 ['Azerbaijan#Baku', 2.255398370622588],
 ['Cambodia#Phnom Penh', 2.254907158636258],
 ['Indonesia#Jakarta', 2.2541437160148146],
 ['Ukraine#Kiev (Kyiv)', 2.252446235268007],
 ['Georgia#Tbilisi', 2.2497714125647374],
 ['Brazil#Vitoria', 2.2417542018773347],
 ['Mongolia#Ulaanbaatar', 2.227650936167268],
 ['Egypt#Cairo', 2.22619833687247],
 ['Russia#Irkutsk', 2.2153100980708667],
 ['Mexico#Monterrey', 2.208240339329269],
 ['Brazil#Sao Paulo', 

And another city...

In [66]:
user_input = 'Kazakhstan#Almaty'
update_result = update_scores(new_city=user_input,
                              user_city_list=INPUT_USER_CITIES_LIST,
                              prev_sim_scores=USER_SIMILARITY_SCORES,
                              cities_sim_matr=CITIES_SIM_MATR,
                              cities_mapping=CITIES_MAPPING)

# update_scores returns (1, 1) for an unknown city id; keep the previous state
# in that case instead of overwriting both globals with the integer 1.
if update_result != (1, 1):
    INPUT_USER_CITIES_LIST, USER_SIMILARITY_SCORES = update_result

And gets new recommendations

In [67]:
# Display only the 25 best matches rather than the full ranking of every city.
recsys_top_results(USER_SIMILARITY_SCORES, df, INPUT_USER_CITIES_LIST)[:25]

[['Russia#Perm', 3.2937853879999786],
 ['Russia#Rostov-na-donu', 3.2307787624809485],
 ['Russia#Omsk', 3.22934199011223],
 ['Russia#Novosibirsk', 3.2273180153914933],
 ['Kyrgyzstan#Bishkek', 3.2128130667550754],
 ['Russia#Yekaterinburg', 3.1740091610015124],
 ['Russia#Samara', 3.172292923577797],
 ['Russia#Kazan', 3.1446583055764945],
 ['Ukraine#Dnipro', 3.115697752813271],
 ['Russia#Voronezh', 3.101255731065823],
 ['Russia#Saint Petersburg', 3.079634590441912],
 ['Azerbaijan#Baku', 3.079243914702579],
 ['Ukraine#Kiev (Kyiv)', 3.0768840118550944],
 ['Mongolia#Ulaanbaatar', 3.0643564454009162],
 ['Cambodia#Phnom Penh', 3.056878829101345],
 ['Russia#Krasnodar', 3.0562836654567453],
 ['Brazil#Vitoria', 3.056207895600143],
 ['Indonesia#Jakarta', 3.04469372425144],
 ['Georgia#Tbilisi', 3.0392565808559477],
 ['Egypt#Cairo', 3.0326437861413353],
 ['Indonesia#Denpasar', 3.0244350165752367],
 ['Ukraine#Kharkiv', 3.0225558064533287],
 ['Belarus#Minsk', 3.0219895684288693],
 ['Russia#Vladivostok'