# STEP 2: Compute partition candidates

This is the second step in the computation. We will try to use the pre-partitioned shapes of municipalities to find feasible candidates of cell partitions. It is required to complete STEP 0 before beginning.



# Packages

Load modules 

In [1]:
import os

import numpy as np
import pandas as pd
import time

from tqdm import tqdm
import geopandas as gpd

from sqr.core.config import years, years_hh, years_pers, cell_label, minimum_cols
from sqr.core.network import local_graph
from sqr.core.scoring import partition_score
from sqr.core.shape import make_gdf_square_data
from sqr.main_assign import get_assignment, data_cols
from sqr.miscellaneous import read_parse_mun

# Prepare input

The configuration of how to run the assignment is set below. Subsequently a list of jobs with inputs for making the partitions is created.

Recall that some municipalities are split into subparts. As a consequence, we run the procedure at the municipal level (i.e., 'mun') but also at the sub-municipality level (i.e., 'submun') for municipalities with larger areas.



In [9]:
# load data
gdf_kommuner = read_parse_mun()

# parameters
trade = True    # passed through unchanged as the last element of every job tuple
num_iter = 4    # wanted number of stored candidate rows per (sub)municipality key

# init job list
job_list = []

# get input/output references
for level in ('mun', 'submun'):
    # The two levels differ in their input/output stores and in which
    # municipalities they cover: below 5000 cells vs. 5000 cells and above.
    if level == 'mun':
        in_file = 'data/parsed/sqr_mun.hdf'
        out_file = 'data/candidates.hdf'
        selection = (gdf_kommuner.to_assign) & (gdf_kommuner.cell_count<5000)
        mun_indices = gdf_kommuner[selection].index.tolist()

    elif level == 'submun':
        in_file = 'data/parsed/sqr_mun_sub.hdf'
        out_file = 'data/candidates_sub.hdf'
        selection = (gdf_kommuner.to_assign) & (gdf_kommuner.cell_count>=5000)
        mun_indices = gdf_kommuner[selection].index.tolist()

    # fetch keys
    # The context manager closes the store even when listing the keys fails.
    with pd.HDFStore(in_file) as datastore:
        datakeys = datastore.keys()

    # Per store key: 'mun_idx' is the part before the first '_' with the first
    # six characters removed, 'idx' is the key without its first six characters
    # and 'key' is the key without its first character.
    # NOTE(review): presumably keys are a leading '/' plus a five-letter prefix
    # followed by the index - verify against the step that writes in_file.
    keys = pd.DataFrame(data = [(k.split('_')[0][6:], k[6:], k[1:]) for k in datakeys],
                        columns = ['mun_idx','idx','key'])
    keys = keys[keys.mun_idx.astype(int).isin(mun_indices)]

    # Bookkeeping for the current level only (both are reset for every level).
    mun_pop = {'pers':{}, 'hh':{}}
    mun_cell_count = {}

    # fill up job list
    # load input for processing
    for (i,row) in keys.iterrows():
        df = pd.read_hdf(in_file, key=row.key)

        # inputs with at most one row are skipped
        if df.shape[0]>1:
            try:
                # rows already stored under this key count as finished iterations
                accomplished = pd.read_hdf(out_file, key='munidx%s' % row.idx).shape[0]
            except (KeyError, OSError):
                # Missing key (KeyError) or missing/unreadable output file
                # (FileNotFoundError is an OSError): nothing has been computed
                # yet. Other errors are no longer swallowed.
                remain_num_iter = num_iter
            else:
                remain_num_iter = max(0, num_iter - accomplished)

            if remain_num_iter > 0:
                G = local_graph(df)
                big_G = local_graph(df, max_dist=3)

                # density: mean over the year columns of the per-column mean (NaN -> 0)
                # count: smallest yearly column total
                pers_density = df[years_pers].fillna(0).mean().mean()
                pers_count = df[years_pers].sum(0).min()
                hh_density = df[years_hh].fillna(0).mean().mean()
                hh_count = df[years_hh].sum(0).min()

                mun_pop['pers'][row.idx] = pers_count
                mun_pop['hh'][row.idx] = hh_count
                mun_cell_count[row.idx] = df.shape[0]

                # only queue areas whose smallest yearly totals reach
                # 100 persons and 50 households
                if (pers_count>=100)&(hh_count>=50):
                    # one identical job per iteration that is still missing
                    job_list += [(row.idx,df,G,big_G,pers_density,hh_density,trade)
                                 for _ in range(remain_num_iter)
                                ]

# Randomise the job order in place (unseeded, so the order differs between runs).
np.random.shuffle(job_list)

# Run partition algorithm
Below we run the algorithm for partitioning the municipal data. There are two options: use a single core for the computation, or parallelize it across several cores.

In [4]:
# Make the folder for temporary output (the parsing cells further down read
# their input files from it). exist_ok=True makes this a no-op when the folder
# is already there, so the cell can be re-run safely.
os.makedirs('data/temp_output', exist_ok=True)

Single core computation

In [4]:
# Disabled by default: uncomment the lines below to process the jobs one at a
# time in this process, printing the municipality name (KOMNAVN) before each job.
# for i in range(len(job_list[:])):
#     idx = job_list[i][0].split('_')[0]
#     print(gdf_kommuner.KOMNAVN[int(idx)])
#     get_assignment(job_list[i])

Multi core computation

In [11]:
from joblib import Parallel, delayed

# Hand every entry of the job list to get_assignment, using two joblib workers.
# `proc` collects whatever get_assignment returns for each job, in job order.
proc = Parallel(n_jobs=2)(
    delayed(get_assignment)(job_args)
    for job_args in job_list[:]
)

# Parse temporary files
Check for available files

In [12]:
# Names of all entries currently in the temporary output folder.
files = os.listdir('data/temp_output')
# Displayed as the cell output: how many files are waiting to be parsed.
len(files)

5

Parse data

In [13]:
def file_loader(f):
    """Read one temporary result file from ``data/temp_output`` as a DataFrame.

    Parameters
    ----------
    f : str
        File name relative to ``data/temp_output``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents, or an empty DataFrame when the file is
        missing, empty or cannot be parsed. A warning is emitted in that case
        so the failure is visible.
    """
    import warnings  # local import keeps this cell runnable on its own

    try:
        return pd.read_csv('data/temp_output/%s' % f)
    except Exception as err:
        # Best effort: one bad file must not abort the whole parse step.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so the run can still be stopped.
        warnings.warn('could not parse data/temp_output/%s (%s: %s)'
                      % (f, type(err).__name__, err))
        return pd.DataFrame()

In [14]:
# Merge the temporary result files into the candidate stores, then delete them.
files = os.listdir('data/temp_output')

if len(files)>0:

    # Stack every temporary file; files that could not be read come back as
    # empty frames and therefore contribute no rows.
    output = pd.concat([file_loader(f) for f in files], axis=0)

    if 'mun_idx' not in output.columns:
        # No file yielded a 'mun_idx' column, so there is nothing to group.
        # Keep the files on disk so they can be inspected rather than lost.
        print(len(files), 'files found but none could be parsed; nothing removed')

    else:
        by_mun_idx = output.groupby('mun_idx')

        # Collect the keys already present in either candidate store.
        old_keys = []
        for out_file_name in ('candidates', 'candidates_sub'):
            out_file = f'data/{out_file_name}.hdf'
            # The context manager closes the store even when listing keys fails.
            with pd.HDFStore(out_file) as datastore:
                old_keys += [k[1:] for k in datastore.keys()]

        for idx, df_input in by_mun_idx:
            # Indices containing '_' go to the sub-municipality store.
            out_file = 'data/candidates_sub.hdf' if ('_' in str(idx)) else 'data/candidates.hdf'
            print (idx, out_file)

            key = 'munidx%s' % idx

            if key in old_keys:
                # Combine new and stored rows, keeping one row per 'finish_ts'
                # so re-parsing the same result does not duplicate it.
                existing = pd.read_hdf(out_file, key=key)
                merged = pd\
                        .concat([df_input, existing])\
                        .drop_duplicates(subset=['finish_ts'])

            else:
                merged = df_input

            # A separate name is used for the per-key frame so that `output`
            # keeps referring to the full concatenated table.
            merged.to_hdf(out_file, key=key)

        print(len(files), 'files processed')
        # NOTE: every listed file is removed, including any that failed to parse.
        for f in files:
            os.remove('data/temp_output/%s' % f)

179_18 data/candidates_sub.hdf
179_19 data/candidates_sub.hdf
179_20 data/candidates_sub.hdf
179_8 data/candidates_sub.hdf
5 files processed
