In [22]:
# imports
import pandas as pd
import numpy as np
import itertools
import random
from tqdm import tqdm
from geopy.distance import geodesic

import warnings
warnings.filterwarnings('ignore')

In [2]:
# load the four raw tables used by this notebook, in order: Movehub city ratings,
# the city -> country lookup, the happiness index and the UNESCO sites list
MHQoL, cities, HappinessIndex, UnescoSites = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/movehubqualityoflife.csv',
        '../data/cities.csv',
        '../data/2019.csv',
        '../data/whc-sites-2019.csv',
    )
)

In [3]:
# preview the Movehub table: one row per city with its ratings and coordinates
MHQoL.head()

Unnamed: 0,City,Movehub Rating,Purchase Power,Health Care,Pollution,Quality of Life,Crime Rating,lat,lng
0,Caracas,65.18,11.25,44.44,83.45,8.61,85.7,10.480594,-66.903606
1,Johannesburg,84.08,53.99,59.98,47.39,51.26,83.93,-26.204103,28.047305
2,Fortaleza,80.17,52.28,45.46,66.32,36.68,78.65,-3.732714,-38.526998
3,Saint Louis,85.25,80.4,77.29,31.33,87.51,78.13,38.627003,-90.199404
4,Mexico City,75.07,24.28,61.76,18.95,27.91,77.86,19.432608,-99.133208


In [4]:
# preview the city -> country lookup table
cities.head()

Unnamed: 0,City,Country
0,Oakland,United States
1,Oakville,Canada
2,Oaxaca de Juárez,Mexico
3,Oberhausen,Germany
4,Obihiro,Japan


In [5]:
# preview the happiness index (one row per country or region)
HappinessIndex.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298


In [6]:
# preview the UNESCO sites table (includes the latitude/longitude used by get_unesco)
UnescoSites.head()

Unnamed: 0,category,states_name_en,region_en,unique_number,id_no,rev_bis,name_en,short_description_en,justification_en,date_inscribed,...,date_end,danger_list,longitude,latitude,area_hectares,criteria_txt,category_short,iso_code,udnp_code,transboundary
0,Cultural,Afghanistan,Asia and the Pacific,230,208,Rev,Cultural Landscape and Archaeological Remains ...,The cultural landscape and archaeological rem...,<em>Criterion (i):</em> The Buddha statues an...,2003,...,,Y 2003,67.82525,34.84694,158.9265,(i)(ii)(iii)(iv)(vi),C,af,afg,0
1,Cultural,Afghanistan,Asia and the Pacific,234,211,Rev,Minaret and Archaeological Remains of Jam,"The 65m-tall Minaret of Jam is a graceful, so...",<em>Criterion (ii):</em> The innovative archi...,2002,...,,Y 2002,64.515889,34.396417,70.0,(ii)(iii)(iv),C,af,afg,0
2,Cultural,Albania,Europe and North America,1590,569,Bis,Historic Centres of Berat and Gjirokastra,Berat and Gjirokastra are inscribed as rare e...,,2005,...,,,20.133333,40.069444,58.9,(iii)(iv),C,al,alb,0
3,Cultural,Albania,Europe and North America,1563,570,ter,Butrint,"Inhabited since prehistoric times, Butrint ha...",,1992,...,2005.0,P 1997-2005,20.026111,39.751111,,(iii),C,al,alb,0
4,Cultural,Algeria,Arab States,111,102,,Al Qal'a of Beni Hammad,In a mountainous site of extraordinary beauty...,,1980,...,,,4.78684,35.81844,150.0,(iii),C,dz,dza,0


In [7]:
def get_unesco(city, radius_km=100):
    """Count the UNESCO sites lying within ``radius_km`` of a city.

    Parameters
    ----------
    city : mapping-like (e.g. a DataFrame row)
        Must provide the city's coordinates under the keys ``'lat'`` and ``'lng'``.
    radius_km : float, default 100
        Search radius in kilometres; a site counts only if its geodesic
        distance to the city is strictly smaller than this value.

    Returns
    -------
    int
        Number of rows of the global ``UnescoSites`` table inside the radius.
    """
    city_coords = (city['lat'], city['lng'])
    # iterate over the two coordinate columns directly instead of a row-wise
    # DataFrame.apply; this also works when the sites table is empty
    site_coords = zip(UnescoSites['latitude'], UnescoSites['longitude'])
    return sum(
        geodesic(site, city_coords).kilometers < radius_km
        for site in site_coords
    )


In [8]:
# attach a country to every Movehub city (joined on the city name only),
# keeping just the Movehub columns that are used further down
City_Country = MHQoL[
    ['City', 'Purchase Power', 'Health Care', 'Pollution', 'Quality of Life', 'Crime Rating', 'lat', 'lng']
].merge(cities, on='City')

# give the happiness table the same column naming as the rest of the data
HappinessIndex = HappinessIndex.rename(
    columns={'Country or region': 'Country', 'Score': 'Happiness_Score'}
)

# add the country-level happiness score to every city of that country
City_Country_Happiness = City_Country.merge(
    HappinessIndex[['Country', 'Happiness_Score']],
    on='Country',
)

# number of UNESCO sites near each city, computed row by row
City_Country_Happiness['Unesco'] = City_Country_Happiness.apply(get_unesco, axis=1)

City_Country_Happiness.head()

Unnamed: 0,City,Purchase Power,Health Care,Pollution,Quality of Life,Crime Rating,lat,lng,Country,Happiness_Score,Unesco
0,Caracas,11.25,44.44,83.45,8.61,85.7,10.480594,-66.903606,Venezuela,4.707,1
1,Barcelona,45.68,58.47,71.75,47.18,41.47,36.088106,-95.924131,Venezuela,4.707,0
2,Valencia,49.11,72.07,35.24,64.89,26.04,28.521792,-81.463577,Venezuela,4.707,0
3,Johannesburg,53.99,59.98,47.39,51.26,83.93,-26.204103,28.047305,South Africa,4.722,0
4,Cape Town,60.36,71.67,75.98,78.73,68.06,-33.924869,18.424055,South Africa,4.722,2


In [9]:
features = ['Purchase Power', 'Health Care', 'Quality of Life', 'Pollution', 'Crime Rating', 'Happiness_Score', 'Unesco']


def norm(xs):
    """Min-max scale ``xs`` to [0, 1] (column by column when given a DataFrame)."""
    return (xs - xs.min()) / (xs.max() - xs.min())


# bring every feature onto a common 0-100 scale
City_Country_Happiness[features] = norm(City_Country_Happiness[features]) * 100

# single well-being feature: average of the city's quality of life and its country's happiness score
City_Country_Happiness['QoL_H'] = (City_Country_Happiness['Quality of Life'] + City_Country_Happiness['Happiness_Score']) / 2

# reorder columns; take an explicit copy so that later column assignments on
# cities_ds are not chained assignments on a slice of City_Country_Happiness
cities_ds = City_Country_Happiness[['City', 'Purchase Power', 'Health Care', 'Pollution', 'QoL_H', 'Crime Rating', 'Unesco', 'lat', 'lng', 'Country']].copy()

cities_ds.head()

Unnamed: 0,City,Purchase Power,Health Care,Pollution,QoL_H,Crime Rating,Unesco,lat,lng,Country
0,Caracas,5.697906,31.425529,90.294309,16.12885,100.0,9.090909,10.480594,-66.903606,Venezuela
1,Barcelona,45.981046,50.099827,77.634711,38.103554,40.900588,0.0,36.088106,-95.924131,Venezuela
2,Valencia,49.99415,68.201784,38.130275,48.193572,20.283271,0.0,28.521792,-81.463577,Venezuela
3,Johannesburg,55.703756,52.109677,51.27678,40.603268,97.634955,0.0,-26.204103,28.047305,South Africa
4,Cape Town,63.156663,67.669373,82.211643,56.253906,76.429717,18.181818,-33.924869,18.424055,South Africa


In [10]:
# persist the normalised city features (the DataFrame index is written as the first column)
cities_ds.to_csv('../data/cities_ds.csv')

In [23]:
def generate_dataset(multiplier = 5):
    """Build a synthetic dataset of preference weights and the city chosen for them.

    Every combination of six weights, each in {0, 1, 2, 3}, is generated
    (4**6 combinations) and the whole list is repeated ``multiplier`` times.
    For each entry a city is drawn with ``get_city``.

    Parameters
    ----------
    multiplier : int, default 5
        Number of times the full list of combinations is repeated.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Weights'`` (tuple of six ints) and ``'City'`` (value
        returned by ``get_city``), one row per generated combination.
    """
    # 0 is none, 1 is low, 2 is mid, 3 is high
    choices = [[0, 1, 2, 3]] * 6

    # create all possible combinations and repeat them to create bigger ds
    combinations = list(itertools.product(*choices)) * multiplier

    # Collect the cities in a list and build the frame once: assigning into the
    # row yielded by DataFrame.iterrows() is not guaranteed to write back to
    # the frame, so the original loop could leave the 'City' column empty.
    chosen_cities = [get_city(np.array(weights)) for weights in tqdm(combinations)]

    return pd.DataFrame({
        'Weights': pd.Series(combinations),
        'City': chosen_cities,
    })

In [24]:
def get_city(weights):
    """Randomly pick a city, favouring those that score high for ``weights``.

    The cities are ranked with ``rank_cities``; every city's score is then
    multiplied by an independent uniform(0, 1) draw and the city with the
    largest product is returned. When every product is zero (e.g. all scores
    are zero) a city is picked uniformly at random instead.

    Parameters
    ----------
    weights : array-like
        Preference weights, passed through to ``rank_cities``.

    Returns
    -------
    The ``'City'`` value of the selected row.

    Raises
    ------
    ValueError
        If ``rank_cities`` returns no rows.
    """
    # rank cities according to the weights given by the person
    ranked_cities = rank_cities(weights).reset_index(drop=True)
    n_cities = len(ranked_cities)
    if n_cities == 0:
        raise ValueError('no cities available to choose from')

    # sample from uniform distribution and "randomly" select a city
    sample = np.random.uniform(0, 1, n_cities)
    prob_score = sample * ranked_cities['Score'].to_numpy()

    if prob_score.max() != 0:
        # first position of the maximum, i.e. the same row the original
        # int(score / maximum) one-hot encoding selected
        chosen_idx = int(np.argmax(prob_score))
    else:
        # randrange excludes its upper bound; random.randint(0, n) could
        # return n and raise IndexError
        chosen_idx = random.randrange(n_cities)

    return ranked_cities['City'].iloc[chosen_idx]

In [25]:
def rank_cities(weights):
    """Score every city in ``cities_ds`` for the given weights and sort by score.

    Parameters
    ----------
    weights : array-like of six numbers
        Importance of, in order: Purchase Power, Health Care, Pollution,
        QoL_H, Crime Rating, Unesco. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``cities_ds`` plus ``'Score'``, sorted
        by ``'Score'`` in descending order, with missing values replaced by 0.
        ``cities_ds`` itself is left untouched.
    """
    features = ['Purchase Power', 'Health Care', 'Pollution', 'QoL_H', 'Crime Rating', 'Unesco']

    # pollution and crime rating have a negative impact, whilst the other features have a positive one
    # (computed into a new array so the caller's weights are never changed in place)
    signed_weights = np.asarray(weights) * np.array([2, 2, -2, 2, -2, 1])

    compatibility = cities_ds[features].dot(signed_weights)
    # min-max scale to [0, 1]; when all cities tie (e.g. all-zero weights) this
    # is 0/0 = NaN, which the final fillna(0) turns into a Score of 0
    scaled = (compatibility - compatibility.min()) / (compatibility.max() - compatibility.min())

    # exp(10 * x) to increase the probability of getting the most compatible city
    ranked = cities_ds.assign(Score=np.exp(scaled * 10))

    return ranked.sort_values('Score', ascending=False).fillna(0)

In [26]:
# build the synthetic "people" dataset: every weight combination repeated 10 times
# (slow: one get_city call, and therefore one full city ranking, per row)
people = generate_dataset(10)

40960it [02:01, 336.36it/s]


In [27]:
# preview the generated (weights, chosen city) pairs
people.head()

Unnamed: 0,Weights,City
0,"(0, 0, 0, 0, 0, 0)",Newark
1,"(0, 0, 0, 0, 0, 1)",Brussels
2,"(0, 0, 0, 0, 0, 2)",Jerusalem
3,"(0, 0, 0, 0, 0, 3)",Brussels
4,"(0, 0, 0, 0, 1, 0)",Aachen


In [28]:
# persist the generated dataset (the DataFrame index is written as the first column)
people.to_csv('../data/people_ds.csv')