# STEP 2: Compute partition candidates

This is the second step in the computation. We will try to use the pre-partitioned shapes of municipalities to find feasible candidates of cell partitions. It is required to complete STEP 0 before beginning.



# Packages

Set up the multiprocessing module (note that the cluster needs to be started first, see [here](https://ipyparallel.readthedocs.io/en/latest/process.html)).

In [None]:
import os

from ipyparallel import Client
# Connect to the running ipyparallel cluster and get a view on all engines
rc = Client()
dview = rc[:]

# Derive the engine count from the cluster instead of hard-coding it, so that
# exactly one chdir task is created per connected engine and every engine
# ends up in the notebook's working directory.
number_of_engines = len(rc.ids)
dview.map(os.chdir, [os.getcwd()]*number_of_engines)

Load core modules across cores

In [None]:
%%px --local
import numpy as np
import pandas as pd
import time

from sqr.core.network import local_graph
from sqr.main_assign import get_assignment, data_cols

Load remaining modules for single core

In [None]:
from tqdm import tqdm
import geopandas as gpd
from sqr.core.scoring import partition_score
from sqr.core.shape import make_gdf_square_data
from sqr.miscellaneous import read_parse_mun

# Multiple engine approach

### Settings

The configuration of how to run the assignment is set below. 

Recall that some municipalities are split into subparts. As a consequence we need to run the procedure at the municipal level (i.e., 'mun') but also at the sub-municipality level (i.e., 'submun') for municipalities with larger areas.

In [None]:
# Municipality table used to pick which municipalities are processed
kommuner = read_parse_mun()

# Granularity of this run: 'mun' or 'submun'
level = 'submun'

# Run options shared by the cells below
trade = True
job_list = []
num_iter = 1

# Reject unknown levels before any file reference is set
if level not in ('mun', 'submun'):
    raise ValueError('Must specify level')

# Input/output files and the row filter depend on the level:
# 'mun' takes municipalities with fewer than 5000 cells,
# 'submun' takes those with 5000 cells or more.
if level == 'mun':
    in_file, out_file = 'data/parsed/sqr_mun.hdf', 'data/candidates.hdf'
    selection = (kommuner.to_assign) & (kommuner.cell_count<5000)
else:
    in_file, out_file = 'data/parsed/sqr_mun_sub.hdf', 'data/candidates_sub.hdf'
    selection = (kommuner.to_assign) & (kommuner.cell_count>=5000)

# Index labels of the selected municipalities
mun_indices = kommuner[selection].index.tolist()

### Prepare input data

Get the keys of the input file and keep only those that belong to the selected municipalities.

In [None]:
# The input file is only read here: open it read-only (so a missing file
# fails loudly instead of being created empty) and let the context manager
# close the store even if listing the keys raises.
with pd.HDFStore(in_file, mode='r') as datastore:
    datakeys = datastore.keys()

# Split every store key into three views:
#   mun_idx - text after the 6-character prefix, up to the first '_'
#   idx     - full key without the 6-character prefix
#   key     - key without the leading '/', as passed to pd.read_hdf
# NOTE(review): assumes keys have the form '/sqidx<mun>[_<sub>]' - confirm.
keys = pd.DataFrame(data = [(k.split('_')[0][6:], k[6:], k[1:]) for k in datakeys],
                    columns = ['mun_idx','idx','key'])

# Keep only keys of municipalities selected in the settings cell
keys = keys[keys.mun_idx.astype(int).isin(mun_indices)]

load file info and make joblist

In [None]:
# Minimum yearly population and number of cells per processed key
mun_pop = {}
mun_cell_count = {}

# Year columns '1986' ... '2015' of the input tables
years = list(map(str,range(1986,2016)))

# Build one job per remaining iteration for every selected input table
for (i,row) in keys.iterrows():
    df = pd.read_hdf(in_file, key=row.key)

    # Tables with at most one row are skipped
    if df.shape[0] <= 1:
        continue

    # Count runs already stored in the output file. Only a missing file or
    # a missing key means "nothing done yet"; other errors should surface.
    try:
        accomplished = pd.read_hdf(out_file, key='munidx%s' % row.idx).shape[0]
    except (KeyError, OSError):
        remain_num_iter = num_iter
    else:
        remain_num_iter = max(0, num_iter - accomplished)

    if remain_num_iter <= 0:
        continue

    # Mean over cells and years (missing values counted as 0), and the
    # smallest yearly total across all year columns
    pop_density = df[years].fillna(0).mean().mean()
    pop_count = df[years].sum(0).min()

    mun_pop[row.idx] = pop_count
    mun_cell_count[row.idx] = df.shape[0]

    if pop_count>=100:
        # Build the graphs only for tables that are actually queued
        G = local_graph(df)
        big_G = local_graph(df, max_dist=3)
        job_list.extend((row.idx,df,G,big_G,pop_density,trade)
                        for _ in range(remain_num_iter))

# `pd.np` is deprecated and removed in pandas 2.0; use numpy directly
# (imported as `np` in the `%%px --local` cell).
np.random.shuffle(job_list)
print(len(job_list))

# make output folder
os.makedirs('data/temp_output', exist_ok=True)

### Execute joblist
Single core computation

In [None]:
# get_assignment(job_list[0])

Multi core computation

In [None]:
# Keep a handle on the asynchronous result so that progress and engine-side
# exceptions can be inspected later (e.g. async_result.ready(),
# async_result.get()); the bare name still displays it as the cell output.
async_result = dview.map_async(get_assignment, job_list)
async_result

# Parse temporary files
Check for available files

In [None]:
# List the temporary result files currently on disk and show how many exist
temp_output_dir = 'data/temp_output'
files = os.listdir(temp_output_dir)
len(files)

Parse data

In [None]:
def file_loader(f):
    """Read one temporary result file into a DataFrame.

    Parameters
    ----------
    f : str
        File name inside ``data/temp_output``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV content, or an empty DataFrame when the file cannot
        be opened or parsed.
    """
    try:
        return pd.read_csv('data/temp_output/%s' % f)
    except (OSError, ValueError):
        # OSError: missing or unreadable file. ValueError: pandas' empty-data
        # and parser errors as well as decoding errors. Unlike a bare
        # `except:`, this lets KeyboardInterrupt and SystemExit propagate.
        return pd.DataFrame()

In [None]:
files = os.listdir('data/temp_output')

if len(files)>0:

    # Load every file once and remember where each frame came from, so that
    # only files whose content was actually read are deleted afterwards.
    loaded = [(f, file_loader(f)) for f in files]
    parsed = [(f, frame) for (f, frame) in loaded if not frame.empty]

    # Without any readable frame there is no 'mun_idx' column to group on
    if parsed:
        output = pd.concat([frame for (_, frame) in parsed], axis=0)

        by_mun_idx = output.groupby('mun_idx')

        # The context manager closes the store even if listing keys fails
        with pd.HDFStore(out_file) as datastore:
            old_keys = {k[1:] for k in datastore.keys()}

        for idx, df_input in by_mun_idx:
            key = 'munidx%s' % idx

            if key in old_keys:
                # Merge with earlier runs; rows sharing a finish timestamp
                # are treated as the same run and stored once
                existing = pd.read_hdf(out_file, key=key)
                merged = pd\
                        .concat([df_input, existing])\
                        .drop_duplicates(subset=['finish_ts'])
            else:
                merged = df_input

            merged.to_hdf(out_file, key=key)

    print(len(files))

    skipped = len(files) - len(parsed)
    if skipped:
        print('%i unreadable or empty file(s) left in data/temp_output' % skipped)

    # Remove only the files whose content has been stored above; files that
    # could not be read (e.g. still being written) are kept for a later pass.
    for f, _ in parsed:
        os.remove('data/temp_output/%s' % f)

# Diagnostics
Get output statistics

In [None]:
# Collect one row of run statistics per municipality that has stored runs
output_stat = []
for idx in kommuner.index:
    df = pd.read_hdf('data/parsed/sqr_mun.hdf', 'sqidx%i' % idx)

    # Only municipalities with a positive total of the `minimum` column
    if not df.minimum.sum()>0:
        continue

    # A missing output file or key means no runs are stored yet for this
    # municipality; any other error should surface.
    try:
        run_info = pd.read_hdf('data/candidates.hdf', key='munidx%i' % idx)
    except (KeyError, OSError):
        continue

    # NOTE: all runs are used; filtering on `run_info.trade` is disabled.
    if run_info.shape[0]>0:
        output_stat.append([idx, run_info.shape[0], run_info.delta_t.median(),
                            df.minimum.shape[0], int(df.minimum.sum())])

output_stat = pd.DataFrame(output_stat, columns=['idx','run_count','run_time' ,'cell_count','pop_count'])

Plot running time

In [None]:
import seaborn as sns
%matplotlib inline

# output_stat['oko5'] = output_stat.cell_count>3000

# Second-order polynomial fit of median run time against cell count
fig = sns.lmplot(y='run_time',x='cell_count',order=2,data=output_stat)

# `sns.plt` no longer exists in current seaborn releases; set the lower axis
# limits through the FacetGrid returned by lmplot (upper limits unchanged).
fig.set(xlim=(0, None), ylim=(0, None))
# fig.savefig('runtime_cellcount.pdf')