In [175]:
# imports
import pandas as pd
import numpy as np
import itertools
import random

In [176]:
# read the MoveHub quality-of-life table into `cities` and preview the first rows
cities = pd.read_csv(filepath_or_buffer='../data/movehubqualityoflife.csv')
cities.head(n=5)

Unnamed: 0,City,Movehub Rating,Purchase Power,Health Care,Pollution,Quality of Life,Crime Rating
0,Caracas,65.18,11.25,44.44,83.45,8.61,85.7
1,Johannesburg,84.08,53.99,59.98,47.39,51.26,83.93
2,Fortaleza,80.17,52.28,45.46,66.32,36.68,78.65
3,Saint Louis,85.25,80.4,77.29,31.33,87.51,78.13
4,Mexico City,75.07,24.28,61.76,18.95,27.91,77.86


In [177]:
def generate_dataset(multiplier = 5):
    """Build a synthetic dataset of preference weights and the city chosen for them.

    Every combination of five importance levels (0 is none, 1 is low, 2 is mid,
    3 is high) is enumerated, the full list is repeated `multiplier` times, and
    `get_city` is called once per row to pick a city for those weights.

    Parameters
    ----------
    multiplier : int, default 5
        How many times the 4**5 weight combinations are repeated.

    Returns
    -------
    pandas.DataFrame
        Columns 'Weights' (tuple of five ints) and 'City' (value returned by
        `get_city`), with 4**5 * multiplier rows.
    """
    # 0 is none, 1 is low, 2 is mid, 3 is high -- one level per feature, five features
    levels = range(4)

    # create all possible combinations and repeat them to create bigger ds
    combinations = list(itertools.product(levels, repeat=5)) * multiplier

    # Collect the cities in a plain list instead of assigning into the rows
    # yielded by DataFrame.iterrows(): those rows may be copies, so such writes
    # are not guaranteed to reach the frame.
    chosen_cities = [get_city(np.array(weights)) for weights in combinations]

    return pd.DataFrame({'Weights': pd.Series(combinations), 'City': chosen_cities})

In [178]:
def get_city(weights):
    """Randomly pick a city, favouring those that score well for `weights`.

    Each city's 'Score' (from `rank_cities`) is multiplied by an independent
    uniform(0, 1) draw and the city with the largest product is returned. If
    every product is zero, a city is picked uniformly at random instead.

    Parameters
    ----------
    weights : array-like
        Feature weights, passed straight to `rank_cities`.

    Returns
    -------
    The 'City' value of the selected row.

    Raises
    ------
    ValueError
        If `rank_cities` returns no rows.
    """
    # rank cities according to the weights given by the person
    ranked_cities = rank_cities(weights).reset_index(drop=True)
    n_cities = len(ranked_cities)
    if n_cities == 0:
        raise ValueError('no cities available to choose from')

    # sample from uniform distribution and "randomly" select a city
    sample = np.random.uniform(0, 1, n_cities)
    prob_score = sample * ranked_cities['Score'].values

    if prob_score.max() != 0:
        # the winner is the first row holding the maximum product
        chosen_idx = int(np.argmax(prob_score))
    else:
        # randrange excludes its upper bound; randint(0, n) could return n,
        # which is one past the last valid row
        chosen_idx = random.randrange(n_cities)

    return ranked_cities['City'].values[chosen_idx]

In [179]:
def rank_cities(weights):
    """Score every city against a person's feature weights and rank them.

    Parameters
    ----------
    weights : array-like of 5 numbers
        Importance given to, in order, 'Purchase Power', 'Health Care',
        'Quality of Life', 'Pollution' and 'Crime Rating'. The argument is
        not modified.

    Returns
    -------
    pandas.DataFrame
        The module-level `cities` frame sorted by descending 'Score', with
        missing values replaced by 0.

    Notes
    -----
    The 'Score' column is written onto the module-level `cities` frame as a
    side effect. When all cities get the same weighted sum (e.g. all-zero
    weights) the min-max normalisation divides 0 by 0, so 'Score' is NaN and
    comes back as 0 after `fillna(0)`.
    """
    features = ['Purchase Power', 'Health Care', 'Quality of Life', 'Pollution', 'Crime Rating']

    # pollution and crime rating have a negative impact, whilst the other features have a positive one
    signs = np.array([1, 1, 1, -1, -1])
    # build a new array instead of `weights *= ...` so the caller's object is
    # left untouched and plain lists/tuples are accepted as well
    signed_weights = np.asarray(weights) * signs

    norm = lambda xs: (xs-xs.min())/(xs.max()-xs.min())
    # e^(10x) on the normalised sum, to increase the probability of getting the most compatible city
    cities['Score'] = np.exp(norm(cities[features].dot(signed_weights)) * 10)

    return cities.sort_values('Score', ascending=False).fillna(0)

In [181]:
# generate the synthetic people dataset, repeating every weight combination 5 times
people = generate_dataset(multiplier=5)

In [182]:
# display the generated weights/city pairs
people

Unnamed: 0,Weights,City
0,"(0, 0, 0, 0, 0)",Varna
1,"(0, 0, 0, 0, 1)",Malaga
2,"(0, 0, 0, 0, 2)",Malaga
3,"(0, 0, 0, 0, 3)",Abu Dhabi
4,"(0, 0, 0, 1, 0)",Glasgow
...,...,...
5115,"(3, 3, 3, 2, 3)",Dresden
5116,"(3, 3, 3, 3, 0)",Zurich
5117,"(3, 3, 3, 3, 1)",Aachen
5118,"(3, 3, 3, 3, 2)",Zurich
