# Algorithm X application to constituency data

Previously we found all groups of 2 / 3 / 4 neighbouring constituencies, i.e. constituencies which share a border. We shall call each such group a set, identified by a unique `set_no`. We will now apply Algorithm X to these sets and find (a subset of) the solutions in which every constituency is selected once and only once. We shall do this on a region-by-region basis for two reasons:

1. it will substantially reduce the number of possible combinations
1. it also (mostly) ensures consistency of political parties, e.g. we would not merge one constituency in England with one in Wales, which could potentially halve the Plaid Cymru vote.

There are often times when the total number of constituencies in a region is not divisible by 2 / 3 / 4. For these cases we shall remove sets of a different size until the remaining number is divisible, e.g. the North East has 29 constituencies, so to find all solutions where we merge 2 constituencies we pick at random one of the sets where 3 constituencies have been merged and remove its constituencies from our initial analysis. We shall repeat this, removing another of the 3-way merged sets, until we get a large enough sample.

For some of the sets we have a large number of solutions, so we will only keep a subset of them. When there is a large number of solutions we shall rerun the analysis with the dataframe resampled, since this can change which solutions are found first.

The (sampled) solutions will be saved as csv files.

All functions used are stored in the `algox_modules.py` file.


In [1]:
import numpy as np
import pandas as pd
from AlgorithmX import *
from joblib import Parallel, delayed
from random import random, sample
from algox_modules import *
import os

In [2]:
# Load the sets of 2, 3 and 4 neighbouring constituencies, in that order.
const_pairs, const_tris, const_quads = map(
    pd.read_csv,
    (
        "../Analysis/Data/const_pairs.csv.gz",
        "../Analysis/Data/const_tris.csv.gz",
        "../Analysis/Data/const_quads.csv.gz",
    ),
)

In [3]:
# Distinct region names found in the pairs data (np.unique returns them sorted).
regions = np.unique(const_pairs['region'])

In [4]:
# Remove any files that were created in a previous run
!rm Logs/solns/soln_*.csv
!rm Logs/log_*.log
!rm Logs/DataFrames/df_*.csv.gz
!rm Solutions/solns_*.csv.gz
!rm Logs/check/solns_*.csv

rm: cannot remove ‘Solutions/solns_*.csv.gz’: No such file or directory
rm: cannot remove ‘Logs/check/solns_*.csv’: No such file or directory


In [None]:
# Command to run with joblib.
# Calls get_solns once per (region, seats) combination, seats in 2, 3, 4, spread over
# n_jobs=4 workers. Whatever each call returns is collected into element_information.
element_information = Parallel(n_jobs=4, verbose=10)(
    delayed(get_solns)(const_pairs, const_tris, const_quads, seats, region, max_solns=2.5e5) 
        for region in regions for seats in [2,3,4] )

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed: 21.5min
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed: 35.2min


In [None]:
# One-off run: 3 seats, East region only.
get_solns(const_pairs, const_tris, const_quads, 3, "East", max_solns=5e5) 

In [None]:
import datetime
# Serial alternative to the joblib run: one get_solns call per (region, seats)
# combination, printing a start timestamp before each call.
for region in regions:
    for seats in [2,3,4]:
        print(f"Region {region} with {seats} seats. Start time {datetime.datetime.now()}")
        get_solns(const_pairs, const_tris, const_quads, seats, region, max_solns=5e5) 
        

In [None]:
# Command to run with joblib.
# Same fan-out over every (region, seats) combination as before, with max_solns=5e5.
element_information = Parallel(n_jobs=4, verbose=10)(
    delayed(get_solns)(const_pairs, const_tris, const_quads, seats, region, max_solns=5e5) 
        for region in regions for seats in [2,3,4] )

In [None]:
import sys
import importlib
# Reload so that edits to algox_modules.py are picked up without restarting the kernel.
importlib.reload(sys.modules['algox_modules'])
# reload() does not rebind names that were star-imported earlier, so get_solns etc.
# would still be the old objects; import them again to pick up the new code.
# NOTE: this also replaces any same-named function that was redefined in the notebook.
from algox_modules import *

In [None]:
# Command to run with joblib.
# Repeat of the (region, seats) fan-out with max_solns=2.5e5.
element_information = Parallel(n_jobs=4, verbose=10)(
    delayed(get_solns)(const_pairs, const_tris, const_quads, seats, region, max_solns=2.5e5) 
        for region in regions for seats in [2,3,4] )

In [None]:
import glob

# Collect the East / 3-seat solution files for each index i in 0..14.
# An index is stored either as one file soln_East_3_d_{i}.csv or as several
# part files soln_East_3_d_{i}_{j}.csv, which are stacked into one frame.
soln_dict = {}
for i in range(15):
    files = glob.glob(f"Logs/solns/soln_East_3_d_{i}.csv")
    if len(files) == 1:
        soln_dict[i] = pd.read_csv(files[0])
        continue

    files = glob.glob(f"Logs/solns/soln_East_3_d_{i}_*.csv")
    d = {}
    for file in files:
        # The part number j is the second "_"-separated field after the common prefix.
        j = int(file.replace(".csv", "").replace("Logs/solns/soln_East_3_d_", "").split("_")[1])
        d[j] = pd.read_csv(file)
    if d:
        soln_dict[i] = pd.concat(d, ignore_index=True)
    else:
        # pd.concat raises on an empty mapping; report the gap instead of
        # hiding every possible error behind a bare except.
        print(f"For i = {i} cannot concatenate")

In [None]:
# Print the (rows, columns) of every loaded frame. Iterate over the keys that
# actually exist: indices that failed to load leave gaps, so range(len(...))
# would raise KeyError.
for i in soln_dict:
    print(soln_dict[i].shape)

In [None]:
# Show the first ten rows stored under key 6.
soln_dict[6].head(10)

In [None]:
# Show the first ten rows stored under key 1.
soln_dict[1].head(10)

In [None]:
# Stack all per-key frames; with ignore_index left off, the dict keys become
# the outer level of the resulting index.
solns = pd.concat(soln_dict) #, ignore_index=True)

In [None]:
# (rows, columns) of the combined solutions frame.
solns.shape

In [None]:
# Sanity check of the part-number parsing used when loading solution files:
# for this name the expression evaluates to 11.
file = "Logs/solns/soln_East_3_d_10_11.csv"
int(file.replace(".csv", "").replace("Logs/solns/soln_East_3_d_", "").split("_")[1])

In [None]:
# Report how many 4-seat solutions were saved for each region.
# Regions whose solutions file has not been written yet are skipped silently.
# Only a missing file is tolerated; any other read error is allowed to surface.
for region in regions:
    r = region.replace(" ", "_")
    try:
        test4 = pd.read_csv(f"Solutions/solns_{r}_4.csv.gz")
    except FileNotFoundError:
        continue
    print(f"We have {test4.shape[0]:,} solutions for the {region} region for 4 seats respectively.")

In [None]:
# One-off run: 4 seats, East Midlands region only.
get_solns(const_pairs, const_tris, const_quads, 4, "East Midlands", max_solns=5e5) 


In [None]:
# Load the saved Scotland / 3-seat solutions and show their (rows, columns).
# Plain string: the path has no placeholders, so no f-prefix is needed.
test = pd.read_csv("Solutions/solns_Scotland_3.csv.gz")
test.shape

In [None]:
# Command to run with joblib.
# Repeat of the (region, seats) fan-out with max_solns=5e5.
element_information = Parallel(n_jobs=4, verbose=10)(
    delayed(get_solns)(const_pairs, const_tris, const_quads, seats, region, max_solns=5e5) 
        for region in regions for seats in [2,3,4] )

In [None]:
import glob

# List solution files whose seats field and index are both a single digit
# ([0-9] matches exactly one character) and show the first ten.
# Plain string: the pattern has no placeholders, so no f-prefix is needed.
all_files = glob.glob("Logs/solns/soln_*_[0-9]_d_[0-9].csv")
all_files[:10]

In [None]:
# Count the West Midlands / 3-seat solution files with a single-digit index
# (the [0-9] glob class matches exactly one character).
r = 'West_Midlands'
s = 3
files = glob.glob(f"Logs/solns/soln_{r}_{s}_d_[0-9].csv")
len(files)

In [None]:
# Imported here so the cell runs on its own; previously these names were only
# imported by a later cell.
import glob
import re
from ast import literal_eval

# Load the single-digit-index solution files for Wales (2 and 3 seats) and stack
# them, parsing the 'soln' column back into Python lists.
for region in ['Wales']: # regions:
    r = region.replace(" ", "_")
    for s in [2, 3]:
        files = glob.glob(f"Logs/solns/soln_{r}_{s}_d_[0-9].csv")
        print(f"{region}: {s} has {len(files)} files.")
        if files:
            # Start from an empty mapping for each (region, seats) so frames
            # left over from other cells or iterations are not concatenated too.
            d = {}
            for file in files:
                # Key each frame by the index digit that follows "_d_".
                i = re.search(r"_d_([0-9])\.csv$", file).group(1)
                d[i] = pd.read_csv(file, converters={'soln': literal_eval})
            test = pd.concat(d)


In [None]:
import glob
import re
from ast import literal_eval

# Captures the whole index part of "..._d_<i>.csv" or "..._d_<i>_<j>.csv".
# Taking a single digit instead would make e.g. _d_1, _d_10 and _d_10_11 share
# one key and silently overwrite each other.
suffix_pattern = re.compile(r"_d_([0-9]+(?:_[0-9]+)?)\.csv$")

# For every region, load all 3-seat solution files and stack them, parsing the
# 'soln' column back into Python lists. `test` holds the last stacked frame.
for region in regions:
    print(region)
    r = region.replace(" ", "_")
    for seats in [3]:
        print(seats)
        files = glob.glob(f"Logs/solns/soln_{r}_{seats}_d_*.csv")
        d = {}
        for file in files:
            print(file)
            match = suffix_pattern.search(file)
            if match is None:
                # Not one of the expected solution file names; skip it.
                continue
            i = match.group(1)
            print(i)
            d[i] = pd.read_csv(file, converters={'soln': literal_eval})

        if d:
            test = pd.concat(d)
        else:
            # pd.concat raises on an empty mapping, so report and move on.
            print(f"{region}: {seats} has no solution files.")

In [None]:
# Show the IPython help for re.findall.
?re.findall

In [None]:
# Follow the East / 3-seat log as it is written.
# NOTE: tail -f does not exit by itself; interrupt the kernel to stop this cell.
!tail -f "Logs/log_East_3.log"

In [None]:
def const_mapper(df):
    """
    Build the AlgorithmX solver for the sets held in df.

    As the AlgorithmX code requires inputs starting from zero, every distinct value found in the
    name columns of df is mapped to an int 0..n-1 (in np.unique order). Each row of df is then
    appended to the solver as the list of its mapped name-column values, tagged with its 'set_no'.

    df - dataframe with the name columns reported by get_name_cols and a 'set_no' column.
    Returns the populated AlgorithmX solver.

    Note: this function neither modifies nor resamples df. Rows are appended in the order given,
    so callers wanting a different row order must resample df before calling.
    """
    name_cols = get_name_cols(df)
    const_list = np.unique(df[name_cols].stack())
    mapping = {const: idx for idx, const in enumerate(const_list)}
    # One nested-dict replace covers every name column; replace returns a new frame.
    df = df.replace({col: mapping for col in name_cols})
    solver = AlgorithmX(len(const_list))
    for _, row in df.iterrows():
        solver.appendRow(list(row[name_cols]), row['set_no'])
    return solver

def return_solutions(df, max_soln = 1e7, resampled=False, log_df_name=None):
    """
    This function returns the (possibly sampled) solutions from the AlgorithmX code.

    df - dataframe of sets, passed to const_mapper to build the solver
    max_soln - maximum number of solutions to derive; reaching it stops the search early and marks
               the result as resampled
    resampled - is this solution being rerun; if True only a small sample (0.25% of max_soln,
                at least one) of the solutions is kept
    log_df_name - not used by this function; kept so existing callers keep working

    Returns a tuple (soln_returned, sampled_solns, resampled):
      * (True, DataFrame with a single 'soln' column of sorted lists, resampled flag) on success
      * (False, None, None) if no solution was found or the search hit the 90 second time limit
    """
    # random.sample needs an integer size, so keep this as an int rather than 2.5e6.
    max_returned = 2_500_000

    solver = const_mapper(df)
    solutions = []
    try:
        with timeout(90, exception=RuntimeError):
            # Stop calculations if taking too long, either there is no solution or having difficulty finding first one
            for solution in solver.solve():
                solutions.append(solution)
                # >= rather than == so that a non-integer max_soln still stops the search.
                if len(solutions) >= max_soln:
                    resampled = True # As we will be rerunning this with a 'resampled' data frame
                    break
    except RuntimeError:
        return False, None, None

    if not solutions:
        return False, None, None

    # If the result is too big take a sample. If the solution is going to be resampled take a small
    # proportion, otherwise take a larger one.
    if resampled:
        sample_size = max(1, int(max_soln * 0.0025))
    else:
        sample_size = max_returned
    # Only sample when it actually shrinks the result; asking random.sample for more items than
    # exist raises ValueError.
    if sample_size < len(solutions):
        solutions = sample(solutions, sample_size)

    # Sort out the solutions at this point to save time later.
    sampled_solns = pd.DataFrame({'soln': [list(np.sort(s)) for s in solutions]})
    return True, sampled_solns, resampled

In [None]:
East
East Midlands
London
North West
South East


In [None]:
# Manual debugging set-up: read the logged 3-seat dataframe for London and
# build its solver directly.
region = 'London'
r = region.replace(" ", "_")
df = pd.read_csv(f"Logs/DataFrames/df_{r}_3.csv.gz")
solver = const_mapper(df)

In [None]:

# Restrict each set table to the region chosen above (`region` is read from the
# notebook namespace via @region).
const_pairs2 = const_pairs.query("region == @region")
const_tris2 = const_tris.query("region == @region")
const_quads2 = const_quads.query("region == @region")
name_cols = get_name_cols(const_tris2)
# How many times should we rerun Algorithm X when we cannot return all solutions.
RERUN_COUNTER = 5 #* (1 + (seats >= 4))
# How many times should we rerun Algorithm X when we have to remove different sized sets.
COUNTER = 5 #* (1 + (seats >= 4))

seats = 3
# Uses `df` as left in the notebook namespace by an earlier cell.
n = get_n(df, name_cols)

# Output, log and dataframe-dump paths for this (region, seats) combination.
file_name = f"Solutions/solns_{r}_{seats}.csv.gz"
log_file_name = f"Logs/log_{r}_{seats}.log"
log_df_name = f"Logs/DataFrames/df_{r}_{seats}.csv.gz"
log = custom_logger(log_file_name)
log.info(f'Starting code for region {region} with {seats} seats.')
max_solns = 1e5

In [None]:
# Display the value returned by get_n for this dataframe.
n

In [None]:
# Get the solutions multiple times with different random elements removed.
# NOTE: the remove_random_const step is currently disabled (see the commented line
# in the loop), so every pass uses the same df and the placeholder `removed` below.
soln_dict = {}
i = 0
removed = {}
removed['triplet'] = [123]
# Stop after this many failed attempts so the loop cannot spin forever when
# return_solutions never produces a solution.
MAX_FAILURES = 3 * COUNTER
failures = 0
while i < COUNTER and failures < MAX_FAILURES:
#     df, removed = remove_random_const(const_pairs2, const_tris2, const_quads2, seats, region, n)
    soln_returned, soln_dict[i], resampled = return_solutions(df, resampled=False, max_soln=max_solns, log_df_name=log_df_name)
    if soln_returned and resampled:
        # Not every solution could be returned: rerun and pool the samples.
        d = {0: soln_dict[i].copy()}
        j = 1
        while j < RERUN_COUNTER and soln_returned:
            j += 1
            soln_returned, d[j], resampled = return_solutions(df, resampled=True, max_soln=max_solns, log_df_name=log_df_name)
        if soln_returned:
            soln_dict[i] = pd.concat(d)
    if not soln_returned:
        # Drop the failed placeholder and try this slot again.
        soln_dict.pop(i, None)
        failures += 1
        continue
    # Add in the set_no's that were removed from the solutions
    removed_col, removed_sets = next(iter(removed.items()))
    soln_dict[i][removed_col] = str(removed_sets)
    i += 1

# Combine once at the end; pd.concat raises on an empty mapping.
solns = pd.concat(soln_dict) if soln_dict else pd.DataFrame()
if len(solns) > 0:
    solns = solns.assign(region = region)

In [None]:
# Serial run of the 2-seat case for every region, printing each region name first.
for region in regions:
    print(region)
    get_solns(const_pairs, const_tris, const_quads, 2, region, max_solns=1e5)

In [None]:
# One-off run: 2 seats, South East region only.
get_solns(const_pairs, const_tris, const_quads, 2, "South East", max_solns=1e5)

In [None]:
import sys
import importlib
# Reload so that edits to algox_modules.py are picked up without restarting the kernel.
importlib.reload(sys.modules['algox_modules'])
# reload() does not rebind names that were star-imported earlier, so get_solns etc.
# would still be the old objects; import them again to pick up the new code.
# NOTE: this also replaces any same-named function that was redefined in the notebook.
from algox_modules import *

In [None]:
# Command to run with joblib.
# Fan-out over every (seats, region) combination with max_solns=1e7. Here seats is
# the outer loop, so all 2-seat tasks are generated before any 3-seat task.
element_information = Parallel(n_jobs=4, verbose=10)(
    delayed(get_solns)(const_pairs, const_tris, const_quads, seats, region, max_solns=1e7) 
        for seats in [2,3,4] for region in regions)


In [None]:
# One-off run: 3 seats, Yorkshire and the Humber only.
get_solns(const_pairs, const_tris, const_quads, 3, 'Yorkshire and the Humber', max_solns=1e5)

In [None]:
# Run London for each seat count in turn (i is passed as the seats argument).
for i in [2,3,4]:
    get_solns(const_pairs, const_tris, const_quads, i, 'London', max_solns=1e5)

In [None]:
# One-off run: 3 seats, Wales only, with max_solns=1e7.
get_solns(const_pairs, const_tris, const_quads, 3, 'Wales', max_solns=1e7)