# Electrify_Clusters

### Enter all input data here

In [2]:
# Path to the clusters file that is read with geopandas below.
# It must contain polygons with the attributes pop_sum, area_m2 and connected.
input_file = "in_files/GHS_clusters_joined.shp"

# Only clusters whose pop_sum is strictly greater than this value are kept.
minimum_pop = 0

### Then we do all the necessary Python imports

In [3]:
%matplotlib inline
import time
from math import sqrt
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import LineString
from astroML.clustering import HierarchicalClustering, get_graph_segments

In [4]:
# Remember when the run started so the total duration can be reported at the end.
start = time.time()

### Read in the clusters file, convert to desired CRS (ostensibly better for distances) and convert to points, filter on population along the way

In [5]:
clusters = gpd.read_file(input_file)
# Africa Albers Equal Area Conic (code 102022), expressed as a PROJ string
epsg102022 = '+proj=aea +lat_1=20 +lat_2=-23 +lat_0=0 +lon_0=25 +x_0=0 +y_0=0 +ellps=WGS84 +datum=WGS84 +units=m +no_defs'
clusters = clusters.to_crs(epsg102022)

# Sort by population so that the biggest (and thus connected) city gets index=0
clusters = clusters.sort_values('pop_sum', ascending=False)
clusters = clusters.reset_index().drop(columns=['index'])
clusters = clusters.reset_index()  # this adds the index as a column again, properly ordered

# Filter on population and take an explicit copy of the result, so that the
# column assignments below act on an independent frame instead of a slice of
# `clusters` (which would raise pandas chained-assignment warnings).
clusters_points = clusters.loc[clusters['pop_sum'] > minimum_pop].copy()

# Replace each polygon by its centroid and expose the coordinates as columns
clusters_points.geometry = clusters_points['geometry'].centroid
clusters_points['X'] = clusters_points.geometry.x
clusters_points['Y'] = clusters_points.geometry.y

### We then take all the clusters and calculate the optimum network that connects them all together. The ML model returns T_x and T_y containing the start and end points of each new arc created

In [6]:
df = pd.DataFrame(clusters_points)
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# array of X/Y coordinates and is what the rest of this notebook already uses.
points = df[['X', 'Y']].values

# min_cluster set like this to group everything into one network
# (lower settings could be worth exploring)
min_cluster = len(df.index) - 1
model = HierarchicalClustering(n_neighbors=min_cluster, edge_cutoff=0.9, min_cluster_size=min_cluster)
model.fit(points)

# T_x and T_y hold the x and y coordinates of the start ([0]) and end ([1])
# point of every arc of the fitted tree.
T_x, T_y = get_graph_segments(model.X_train_, model.full_tree_)

### This point and line data is then copied into two arrays, called *nodes* and *network*, containing the clusters and lines, respectively. Each element represents a single cluster or joining arc, and has data within describing the coordinates and more.

**Structure for nodes**  
0   index  
1   x  
2   y  
3   area_m2  
4   pop_sum  
5   connected  
6   new_conn  
7   off_grid_cost  
8   [connected arc indices]  

**Structure for network**  
0   index  
1   xs  
2   ys  
3   xe  
4   ye  
5   node index first point  
6   node index last point  
7   existing  
8   arc length  
9   whether enabled

In [7]:
# 'new_conn' starts out as a copy of 'connected'; the off-grid cost is filled in later
df['new_conn'] = df['connected']
df['off_grid_cost'] = 0

# reset_index() places the frame index at position 0, ahead of the listed
# fields; every value is then cast to int and converted to nested lists.
node_fields = ['X', 'Y', 'area_m2', 'pop_sum', 'connected', 'new_conn', 'off_grid_cost']
nodes = df[node_fields].reset_index().values.astype(int).tolist()

# position 8 of each node is the (initially empty) list of connected arc indices
for node in nodes:
    node.append([])

# one entry per arc: integer end points, no attached nodes yet, and both the
# 'existing' and 'enabled' flags set to 1
network = []
for counter, segment in enumerate(zip(T_x[0], T_y[0], T_x[1], T_y[1])):
    xs, ys, xe, ye = map(int, segment)
    length = int(sqrt((xe - xs)**2 + (ye - ys)**2))
    network.append([counter, xs, ys, xe, ye, None, None, 1, length, 1])

### Then we need to tell each arc which nodes it is connected to, and likewise for each node
Each arc connects two nodes, each node can have 1+ arcs connected to it

In [8]:
def connect_network(nodes, network, index):
    """Attach every reachable arc to the two nodes at its end points.

    Starting from ``nodes[index]``, the network is walked depth-first. Each
    arc that has no nodes assigned yet and touches the current node gets the
    current node's index stored at position 5. The first node in ``nodes``
    (other than the current one) whose coordinates equal the arc's opposite
    end is stored at position 6, and the walk continues from that node.
    If no node matches the opposite end, position 6 is left as ``None``.

    The walk uses an explicit stack instead of recursion, so long chains of
    arcs cannot exceed Python's recursion limit. Coordinate lookups are
    indexed once up front instead of rescanning every arc and node per step.

    Parameters
    ----------
    nodes : list of list
        Node records; position 0 is the node index, 1 and 2 are x and y.
    network : list of list
        Arc records; positions 1-4 are xs, ys, xe, ye and positions 5 and 6
        receive the node indices. Arcs are modified in place.
    index : int
        Position in ``nodes`` of the node to start from.

    Returns
    -------
    tuple
        The same ``nodes`` and ``network`` objects that were passed in.
    """
    # nodes grouped by coordinate, preserving their order in `nodes`
    nodes_at = {}
    for node in nodes:
        nodes_at.setdefault((node[1], node[2]), []).append(node)

    # arcs grouped by each end-point coordinate, preserving network order
    arcs_at = {}
    for arc in network:
        start, end = (arc[1], arc[2]), (arc[3], arc[4])
        arcs_at.setdefault(start, []).append(arc)
        if end != start:  # a zero-length arc must only be listed once
            arcs_at.setdefault(end, []).append(arc)

    def _pending(node):
        # a node paired with an iterator over the arcs touching it, so the
        # scan can resume where it stopped after a downstream excursion
        return node, iter(arcs_at.get((node[1], node[2]), ()))

    stack = [_pending(nodes[index])]
    while stack:
        cur_node, arcs = stack[-1]
        for arc in arcs:
            if arc[5] is not None or arc[6] is not None:
                continue  # this arc already has connected nodes

            # `far` is the position of the x coordinate of the opposite end:
            # 1 (xs) when the arc ends at this node, otherwise 3 (xe)
            ends_here = (arc[3], arc[4]) == (cur_node[1], cur_node[2])
            far = 1 if ends_here else 3

            arc[5] = cur_node[0]  # tell this arc that this node is its starting point

            candidates = nodes_at.get((arc[far], arc[far + 1]), ())
            other = next((n for n in candidates if n[0] != cur_node[0]), None)
            if other is not None:
                arc[6] = other[0]  # tell this arc that this node is its ending point
                stack.append(_pending(nodes[other[0]]))  # and investigate downstream
                break
        else:
            # every arc touching this node has been handled
            stack.pop()

    return nodes, network

# walk the whole network starting from node 0
nodes, network = connect_network(nodes, network, 0)

# register every arc with the node at its start and then the node at its end
for arc in network:
    for end_node in (arc[5], arc[6]):
        nodes[end_node][8].append(arc[0])

# any arc touching a node that is not connected is marked as neither
# existing nor enabled; all remaining arcs keep their initial value of 1
for node in nodes:
    if node[5] != 0:
        continue
    for arc_index in node[8]:
        network[arc_index][7] = 0
        network[arc_index][9] = 0

### First calculate the off-grid cost for each unconnected settlement

In [9]:
# off-grid costs
demand_per_person_kwh_month = 2 # 6kWh/month = MTF Tier 2
demand_per_person_kw_peak = demand_per_person_kwh_month / (4*30)  # 130 4hours/day*30days/month based on MTF numbers, should use a real demand curve
mg_gen_cost_per_kw = 7000
mg_cost_per_m2 = 2

for node in nodes:
    if node[5] == 0:
        node[7] = node[4]*demand_per_person_kw_peak*mg_gen_cost_per_kw + node[3]*mg_cost_per_m2

# grid costs
cost_wire_per_m = 39
grid_cost_per_m2 = 2

### Then we're ready to calculate the optimum grid extension.
This is done by expanding out from each already connected node, finding the optimum connection of nearby nodes. This is then compared to the off-grid cost and if better, these nodes are marked as connected. Then the loop continues until no new connections are found.

In [10]:
# This function recurses through the network, dragging a current c_ values along with it.
# These aren't returned, so are left untouched by aborted side-branch explorations.
# The best b_ values are returned, and are updated whenever a better configuration is found.
# Thus these will remember the best solution including all side meanders.

def find_best(nodes, network, index, prev_arc, b_pop, b_length, b_nodes, b_arcs, c_pop, c_length, c_nodes, c_arcs):
    """Search outward from a node for the path with the best population per length.

    The ``c_`` arguments describe the path walked so far. They are extended
    locally with this node and the arc used to reach it, and handed down to
    neighbours, so abandoned side branches never affect the caller's values.
    The ``b_`` arguments describe the best path seen so far. Whenever the
    current path has a higher ``pop / length`` ratio, ``b_pop`` and
    ``b_length`` are replaced and the ``b_nodes`` and ``b_arcs`` lists are
    overwritten in place.

    Nodes whose position 6 is not 0 (already connected) stop the search.
    Only arcs whose position 9 is 0 (not enabled) are followed, and never the
    arc the search arrived on.

    Returns ``(nodes, network, b_pop, b_length, b_nodes, b_arcs)``.
    """
    here = nodes[index]
    if here[6] != 0:
        # don't do anything with already connected nodes
        return nodes, network, b_pop, b_length, b_nodes, b_arcs

    # extend the current path with this node and the arc that led to it;
    # new lists are created so the caller's lists are left untouched
    c_pop = c_pop + here[4]
    c_length = c_length + network[prev_arc][8]
    c_nodes = c_nodes + [index]
    c_arcs = c_arcs + [prev_arc]

    # keep the current path if it beats the best one found so far
    if c_pop / c_length > b_pop / b_length:
        b_pop, b_length = c_pop, c_length
        b_nodes[:] = c_nodes
        b_arcs[:] = c_arcs

    for arc_index in here[8]:
        arc = network[arc_index]
        if arc[9] != 0 or arc[0] == prev_arc:
            continue

        # make sure we look at the other end of the arc
        far_node = arc[6] if arc[5] == index else arc[5]
        nodes, network, b_pop, b_length, _, _ = find_best(
            nodes, network, far_node, arc[0], b_pop, b_length, b_nodes, b_arcs,
            c_pop, c_length, c_nodes, c_arcs)

    return nodes, network, b_pop, b_length, b_nodes, b_arcs

In [11]:
# Repeatedly expand the grid: in each pass every connected node probes its
# not-yet-enabled arcs, the best downstream group of nodes is costed against
# its off-grid alternative, and the cheaper-by-grid groups are connected.
while True:  # keep looping until no further connections are added
    to_be_connected = []

    for node in nodes:
        if node[6] == 1:  # only start searches from currently connected nodes

            for arc_index in node[8]:
                arc = network[arc_index]
                if arc[9] == 0:
                    goto = 6 if arc[5] == node[0] else 5

                    # find_best returns (nodes, network, b_pop, b_length, b_nodes, b_arcs);
                    # unpack in exactly that order so each name holds what it says
                    nodes, network, b_pop, b_length, b_nodes, b_arcs = find_best(
                        nodes, network, arc[goto], arc[0], 0, 1e-9, [], [], 0, 1e-9, [], [])

                    # calculate the mg and grid costs of the resultant configuration
                    best_nodes = [nodes[i] for i in b_nodes]
                    best_arcs = [network[i] for i in b_arcs]
                    mg_cost = sum(best_node[7] for best_node in best_nodes)
                    grid_cost = (cost_wire_per_m * sum(best_arc[8] for best_arc in best_arcs) +
                                 grid_cost_per_m2 * sum(best_node[3] for best_node in best_nodes))

                    if grid_cost < mg_cost:
                        # rank candidates by metres of line per person: lower is better
                        length_per_person = b_length / b_pop

                        # check if any nodes are already in to_be_connected
                        # NOTE(review): only the first overlapping candidate is
                        # compared; later overlapping entries are left as they are.
                        add = True
                        for index, item in enumerate(to_be_connected):
                            if set(b_nodes).intersection(item[1]):
                                if length_per_person < item[0]:
                                    # the new candidate is better, so it replaces the existing one
                                    del to_be_connected[index]
                                else:
                                    add = False  # if the existing one is better, we don't add the new one
                                break

                        if add:
                            to_be_connected.append((length_per_person, b_nodes, b_arcs))

    if not to_be_connected:
        break  # exit the loop once nothing is added

    # mark all to_be_connected as actually connected
    for item in to_be_connected:
        for node_index in item[1]:
            nodes[node_index][6] = 1
        for arc_index in item[2]:
            network[arc_index][9] = 1

### And then do a join to get the results back into a polygon shapefile

In [12]:
# prepare nodes and join with original clusters gdf
nodes_df = pd.DataFrame(columns=['index', 'X', 'Y', 'area_m2', 'pop_sum', 'connected', 'new_conn',
                                  'og_cost', 'arcs'], data=nodes)
nodes_df = nodes_df[['index', 'new_conn', 'og_cost']]
clusters_joined = clusters.merge(nodes_df, how='left', on='index')

# do the same for the network array
network_df = pd.DataFrame(columns=['index', 'xs', 'ys', 'xe', 'ye', 'node_start', 'node_end',
                                   'existing', 'length', 'enabled'], data=network)
network_geometry = [LineString([(arc[1], arc[2]), (arc[3], arc[4])]) for arc in network]
network_gdf = gpd.GeoDataFrame(network_df, crs=clusters.crs, geometry=network_geometry)

In [13]:
# Write the cluster polygons (with the joined results) and the arcs to shapefiles
clusters_joined.to_file('GHS_calc_clusters.shp')
network_gdf.to_file('GHS_calc_arcs.shp')

### And display some summary results

In [25]:
# clusters that ended up connected but were not connected to begin with
newly_connected = (clusters_joined['new_conn'] == 1) & (clusters_joined['connected'] == 0)
new_conns = clusters_joined.loc[newly_connected]

# clusters that stay off-grid
og = clusters_joined.loc[clusters_joined['new_conn'] == 0]

# arcs that did not exist before and have been enabled
newly_built = (network_df['existing'] == 0) & (network_df['enabled'] == 1)
arcs = network_df.loc[newly_built]

# total cost: off-grid clusters, plus new wire, plus per-area cost of newly connected clusters
off_grid_total = og['og_cost'].sum()
wire_total = cost_wire_per_m * arcs['length'].sum()
area_total = grid_cost_per_m2 * new_conns['area_m2'].sum()
cost = off_grid_total + wire_total + area_total

print(f'{len(new_conns)} connected')
print(f'{len(og)} off-grid')
print(f'{len(arcs)} lines')
print()
print(f'${cost:,.0f}')

3882 connected
2220 off-grid
3883 lines

$8,768,671,181


In [15]:
# Format the time elapsed since `start` as hours, minutes and seconds
time.strftime('It took %Hh %Mm %Ss', time.gmtime(time.time() - start))

'It took 00h 01m 02s'