In [45]:
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import euclidean_distances
from sklearn.metrics import pairwise_distances
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.cluster import DBSCAN
from sklearn.cluster import SpectralClustering
from scipy.cluster.hierarchy import dendrogram, linkage
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
import plotly.express as px
from datetime import datetime
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import imageio.v2 as imageio
import os
from sklearn.cluster import AffinityPropagation

In [62]:
# Read the feature table and discard its "Unnamed: 0" column
# (presumably an index written by an earlier to_csv call - confirm).
categorized_df = pd.read_csv("clustered_features.csv").drop(columns=["Unnamed: 0"])

In [63]:
def cluster_data(df, exclude_columns, n_clusters=10):
    """Cluster rows with K-means and renumber the clusters by centroid size.

    Every column of ``df`` that is not listed in ``exclude_columns`` is used
    as a clustering feature (columns are taken in the sorted order produced
    by ``Index.difference``). After fitting, each centroid is min-max
    normalised per feature using the data's own min/max, the normalised
    coordinates are summed, and clusters are renumbered so that label 0 has
    the smallest sum and label ``n_clusters - 1`` the largest.

    Side effect: the ordered labels are written to ``df['cluster']`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; modified in place (a ``'cluster'`` column is set).
    exclude_columns : list-like of str
        Column names to leave out of the clustering features. Names that are
        not present in ``df`` are ignored.
    n_clusters : int, default 10
        Number of K-means clusters.

    Returns
    -------
    tuple
        ``(df['cluster'], data_for_clustering, kmeans, label_mapping)`` where
        ``label_mapping`` maps the raw K-means label to the ordered label.
    """
    clustering_columns = df.columns.difference(exclude_columns)
    data_for_clustering = df[clustering_columns]
    print(clustering_columns)

    # Fixed random_state keeps the fit reproducible between runs.
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(data_for_clustering)

    initial_cluster_labels = kmeans.labels_
    centroids = kmeans.cluster_centers_

    min_vals = data_for_clustering.min(axis=0).to_numpy()
    value_range = data_for_clustering.max(axis=0).to_numpy() - min_vals
    # A constant feature has a zero range; dividing by it would turn every
    # centroid sum into NaN and make the ordering below meaningless. Using 1
    # instead makes such a feature contribute 0 to every centroid.
    safe_range = np.where(value_range == 0, 1, value_range)
    centroids_normalized = (centroids - min_vals) / safe_range

    centroid_sums = centroids_normalized.sum(axis=1)

    # Position in the ascending sort becomes the new, ordered cluster label.
    sorted_indices = np.argsort(centroid_sums)
    label_mapping = {sorted_index: new_label for new_label, sorted_index in enumerate(sorted_indices)}

    new_labels = np.array([label_mapping[label] for label in initial_cluster_labels])

    df['cluster'] = new_labels

    return df['cluster'], data_for_clustering, kmeans, label_mapping

# Feature columns. Later cells read feature vectors positionally in this
# order, so it must match the column order of the frame built below.
labels = [
    'roads',
    'cities',
    'parks',
    'banks',
    'churches',
    'edu',
    'elevs',
    'hotels',
    'kindergartens',
    'libraries',
    'medicine',
    'shops',
    'mobile',
    'post',
]

# Park sub-columns: excluded from clustering and dropped from the final frame.
PARK_DETAIL_COLUMNS = ['loc_parks', 'nat_parks', 'reg_parks', 'combined_parks']
# 'quality' and 'cluster' are listed so that re-running this cell does not
# feed previously derived columns back into the clustering.
NON_FEATURE_COLUMNS = PARK_DETAIL_COLUMNS + ['id', 'quality', 'cluster']

# Weights of the quality score.
# NOTE(review): 1.055555 and 0.055555 look like truncated 19/18 and 1/18,
# which would cap the score at 1.0 when the feature mean and the cluster
# label are both 9 - confirm before changing the constants.
QUALITY_MEAN_WEIGHT = 1.055555 * 0.1
QUALITY_CLUSTER_WEIGHT = 0.055555 * 0.1

categorized_df['cluster'], dt_for_clustering, k_model, label_map = cluster_data(categorized_df, NON_FEATURE_COLUMNS)

# Row-wise mean over the feature columns (same set as `labels`).
average_categories = categorized_df[labels].mean(axis=1)

categorized_df['quality'] = QUALITY_MEAN_WEIGHT * average_categories + QUALITY_CLUSTER_WEIGHT * categorized_df['cluster']

df = categorized_df.drop(PARK_DETAIL_COLUMNS, axis=1).copy()

Index(['banks', 'churches', 'cities', 'edu', 'elevs', 'hotels',
       'kindergartens', 'libraries', 'medicine', 'mobile', 'parks', 'post',
       'roads', 'shops'],
      dtype='object')


In [64]:
# Preview the first rows of the frame built above (park sub-columns dropped,
# 'cluster' and 'quality' added).
df.head()

Unnamed: 0,id,roads,cities,parks,banks,churches,edu,elevs,hotels,kindergartens,libraries,medicine,shops,mobile,post,cluster,quality
0,UA2124886201,0,2,5,9,0,0,4,0,1,1,1,1,7,0,1,0.239286
1,UA2124883302,1,2,5,9,0,1,5,1,1,4,3,1,9,1,1,0.329762
2,UA2124883301,1,2,5,9,0,0,5,0,1,4,2,1,8,1,1,0.299603
3,UA2124881503,2,4,0,9,0,0,5,0,1,4,2,1,9,1,1,0.292063
4,UA2124881502,1,4,0,9,0,0,5,0,0,4,2,1,5,1,1,0.246825


In [65]:
# Persist the frame with its cluster and quality columns. With the default
# index=True the row index is written as an unnamed first CSV column.
df.to_csv("clustered_featured_and_quality.csv")

In [82]:
def _shortfall_distances(vector, vectors):
    """Return an (n, 1) column of one-sided Euclidean distances to ``vector``.

    For every candidate row the difference ``candidate - vector`` is taken and
    its positive components are clipped to zero, so only the features where
    the candidate is *below* ``vector`` contribute. The result is the
    Euclidean norm of that clipped difference, one value per candidate.

    Inputs are cast to float explicitly, so an object-dtype ``vector`` (as
    produced by taking a row of a mixed-dtype DataFrame) is accepted.

    Raises
    ------
    ValueError
        If ``vectors`` is not a 2-D array with at least one row.
    """
    target = np.asarray(vector, dtype=float)
    candidates = np.asarray(vectors, dtype=float)
    if candidates.ndim != 2 or candidates.shape[0] == 0:
        raise ValueError("vectors must be a non-empty 2-D array of candidate rows")

    deficits = np.minimum(candidates - target, 0)
    # keepdims keeps the (n, 1) layout, so indexing a row yields a length-1
    # array and callers can keep reading the distance as dist[0].
    return np.linalg.norm(deficits, axis=1, keepdims=True)


def find_closest(vector, vectors):
    """Find the candidate row with the smallest one-sided distance to ``vector``.

    Parameters
    ----------
    vector : array-like, shape (d,)
        Reference feature vector.
    vectors : array-like, shape (n, d)
        Candidate rows.

    Returns
    -------
    tuple
        ``(index, distance)`` where ``index`` is the row position in
        ``vectors`` and ``distance`` is a length-1 array holding its distance.
        Ties resolve to the first minimal row.
    """
    distances = _shortfall_distances(vector, vectors)
    index = np.argmin(distances)

    return index, distances[index]


def find_farest(vector, vectors):
    """Find the candidate row with the largest one-sided distance to ``vector``.

    Same distance definition and return shape as ``find_closest``; ties
    resolve to the first maximal row.
    """
    distances = _shortfall_distances(vector, vectors)
    index = np.argmax(distances)

    return index, distances[index]

In [52]:
# NOTE(review): this cell's execution count (52) is lower than that of the
# clustering cell (63), and its saved output shows different cluster/quality
# values than the In [64] preview - the stored output appears stale. Re-run
# the notebook top to bottom (Restart Kernel -> Run All) before relying on it.
df.head()

Unnamed: 0,id,roads,cities,parks,banks,churches,edu,elevs,hotels,kindergartens,libraries,medicine,shops,mobile,post,cluster,quality
0,UA2124886201,0,2,5,9,0,0,4,0,1,1,1,1,7,0,0,0.23373
1,UA2124883302,1,2,5,9,0,1,5,1,1,4,3,1,9,1,0,0.324206
2,UA2124883301,1,2,5,9,0,0,5,0,1,4,2,1,8,1,0,0.294047
3,UA2124881503,2,4,0,9,0,0,5,0,1,4,2,1,9,1,0,0.286508
4,UA2124881502,1,4,0,9,0,0,5,0,0,4,2,1,5,1,0,0.24127


In [97]:
# Build, for every row of `df`, one comparison record per target cluster:
#   * a row in ordered cluster c > 0 is compared against each lower cluster
#     0..c-1, and the reference is the candidate chosen by find_closest;
#   * a row in cluster 0 is compared against cluster 0 itself, and the
#     reference is the candidate chosen by find_farest.
# Each record stores both rows' ids, clusters and qualities, the distance,
# and per-feature value / reference value / difference (reference - row).

# Derive the number of clusters from the data instead of hard-coding it.
cluster_count = int(df["cluster"].max()) + 1

per_cluster = {}
for cluster_id in range(cluster_count):
    subdf = df[df["cluster"] == cluster_id]
    per_cluster[cluster_id] = {
        "info": subdf[["id", "cluster", "quality"]],
        # Select features by name so the vector layout always follows `labels`.
        "vectors": subdf[labels].to_numpy(),
    }

output = {
    "pcode": [],
    "cluster": [],
    "quality": [],
    "ref_pcode": [],
    "ref_cluster": [],
    "ref_quality": [],
    "distance": []
}

for label in labels:
    output[label] = []
    output["ref_" + label] = []
    output["diff_" + label] = []

# One numeric row per settlement, columns ordered as `labels`.
feature_matrix = df[labels].to_numpy()

for pcode, cluster, quality, vector in zip(df["id"], df["cluster"], df["quality"], feature_matrix):
    for target in range(max(1, cluster)):
        info = per_cluster[target]["info"]
        vectors = per_cluster[target]["vectors"]

        if len(vectors) == 0:
            # No candidates in this cluster; nothing to compare against.
            continue

        if cluster > 0:
            index, dist = find_closest(vector, vectors)
        else:
            index, dist = find_farest(vector, vectors)

        reference_info = info.iloc[index]
        reference = vectors[index]
        diff = reference - vector

        output["pcode"].append(pcode)
        output["cluster"].append(cluster)
        output["quality"].append(quality)
        output["ref_pcode"].append(reference_info['id'])
        output["ref_cluster"].append(target)
        output["ref_quality"].append(reference_info['quality'])
        # The helpers return the distance as a length-1 array.
        output["distance"].append(dist[0])

        for position, label in enumerate(labels):
            output[label].append(vector[position])
            output["ref_" + label].append(reference[position])
            output["diff_" + label].append(diff[position])

inter_df = pd.DataFrame(output)
inter_df.head()

Unnamed: 0,pcode,cluster,quality,ref_pcode,ref_cluster,ref_quality,distance,roads,ref_roads,diff_roads,...,diff_medicine,shops,ref_shops,diff_shops,mobile,ref_mobile,diff_mobile,post,ref_post,diff_post
0,UA2124886201,1,0.239286,UA5625886002,0,0.150794,5.0,0,0,0,...,-1,1,0,-1,7,6,-1,0,0,0
1,UA2124883302,1,0.329762,UA6122481001,0,0.24127,6.244998,1,0,-1,...,0,1,2,1,9,9,0,1,0,-1
2,UA2124883301,1,0.299603,UA2621255301,0,0.23373,6.244998,1,0,-1,...,0,1,2,1,8,9,1,1,0,-1
3,UA2124881503,1,0.292063,UA5122084801,0,0.286508,6.324555,2,1,-1,...,1,1,2,1,9,9,0,1,0,-1
4,UA2124881502,1,0.246825,UA0722183603,0,0.24127,5.567764,1,0,-1,...,1,1,1,0,5,5,0,1,0,-1


In [98]:
# Save the row-vs-reference comparison table. With the default index=True
# the row index is written as an unnamed first CSV column.
inter_df.to_csv("policy_planning.csv")