In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd

%matplotlib inline

In [2]:
# Load the flow table. It is read with header=None, so the positional columns
# 0-8 are renamed here. The meaning of the seven age bands is not known, hence
# the generic labels Age1..Age7.
flow_column_names = ["From", "To"] + ["Age{}".format(band) for band in range(1, 8)]
df = pd.read_csv("MSOAflows.csv", header=None)
df = df.rename(columns=dict(enumerate(flow_column_names)))
df.head()

Unnamed: 0,From,To,Age1,Age2,Age3,Age4,Age5,Age6,Age7
0,95AA01S1,95AA01S1,15,1,6,7,1,0,0
1,95AA01S1,95AA01S2,207,42,105,60,0,0,0
2,95AA01S1,95AA01S3,3,0,2,1,0,0,0
3,95AA01S1,95AA02W1,1,0,0,1,0,0,0
4,95AA01S1,95AA04W1,5,1,2,2,0,0,0


## Shapefile merging, read from internet sources. Areas not included in flow data

Outputs `geom`: the combined boundary geometry for all UK countries. It has more rows than there are areas in the flow dataset. It is unclear how boundaries that do not appear in the flow data should be coded, so they have been left enumerated at the bottom; they simply have no counterpart in the flows data.

In [3]:
# Boundary layer downloaded as a zipped shapefile; the outSR query parameter
# requests EPSG:27700. Presumably the England & Wales MSOA layer - confirm.
ewmsoa_url = 'https://opendata.arcgis.com/datasets/02aa733fc3414b0ea4179899e499918d_0.zip?outSR=%7B%22latestWkid%22%3A27700%2C%22wkid%22%3A27700%7D'
EWMSOA = gpd.read_file(ewmsoa_url)

In [4]:
# Boundary layer downloaded as a zipped shapefile (2011 Intermediate Zone
# boundaries, going by the file name). Note this source is plain http.
siz_url = 'http://sedsh127.sedsh.gov.uk/Atom_data/ScotGov/ZippedShapefiles/SG_IntermediateZoneBdry_2011.zip'
SIZ = gpd.read_file(siz_url)

In [5]:
# Boundary layer downloaded as a zipped shapefile (2011 SOA boundaries from
# NISRA, going by the URL).
nisoa_url = 'https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/SOA2011_Esri_Shapefile_0.zip'
NISOA = gpd.read_file(nisoa_url)

In [6]:
# Reproject all three boundary layers to the same coordinate reference system
# (EPSG:4326) so they can be combined into a single layer.
target_crs = "EPSG:4326"
EWMSOA = EWMSOA.to_crs(target_crs)
NISOA = NISOA.to_crs(target_crs)
SIZ = SIZ.to_crs(target_crs)

In [7]:
# Give each layer's area-code column the common name "cd" so the layers can be
# stacked and joined on one key.
EWMSOA = EWMSOA.rename(columns={"msoa11cd": "cd"})
NISOA = NISOA.rename(columns={"SOA_CODE": "cd"})
SIZ = SIZ.rename(columns={"InterZone": "cd"})

In [8]:
# Stack the three boundary layers into one GeoDataFrame holding only the area
# code ("cd") and its geometry, in the order EWMSOA, NISOA, SIZ.
#
# Columns are selected with a list rather than a set: set indexers are rejected
# by pandas >= 2.0 and never guaranteed a column order. The layers are combined
# with pd.concat because DataFrame.append was removed in pandas 2.0.
boundary_columns = ["cd", "geometry"]
geom = gpd.GeoDataFrame(
    pd.concat(
        [layer[boundary_columns] for layer in (EWMSOA, NISOA, SIZ)],
        ignore_index=True,
    )
)
geom.shape

(9370, 2)

### Create Master lookup for True IDs (TID) including empty-cells and non UK destinations

In [9]:
# Master lookup: one row per area code, taken from the union of the destination
# codes in the flow table and the codes in the boundary layer.
lookup = pd.DataFrame({"CD": list(df.To.unique())})
lookup = lookup.merge(geom, left_on="CD", right_on="cd", how="outer")

# Destinations with no boundary have no "cd" after the outer merge; fall back to
# the flow-table code. Assigned back explicitly: a chained in-place fillna on
# lookup.cd does not modify the frame under pandas Copy-on-Write.
lookup["cd"] = lookup["cd"].fillna(lookup["CD"])

# True IDs are 1-based and sized from the frame itself, not a hard-coded row count.
# NOTE(review): TIDs follow the row order returned by the outer merge - confirm
# that ordering is what downstream consumers expect on the pandas version in use.
lookup["TID"] = np.arange(1, len(lookup) + 1)

In [10]:
# Area code -> True ID mapping used to recode every table below.
lookupdict = {code: tid for code, tid in zip(lookup["cd"], lookup["TID"])}

In [11]:
# Replace the origin and destination area codes with their True IDs.
# Codes absent from lookupdict become NaN. This cell is not idempotent:
# re-running it maps the already-converted IDs and yields NaN.
for endpoint in ("From", "To"):
    df[endpoint] = df[endpoint].map(lookupdict)

In [12]:
# Write the age-banded flows, now keyed by True ID, to disk.
# The DataFrame index is written as the first (unnamed) column.
ageflows_path = 'ageflows.csv'
df.to_csv(ageflows_path)

In [13]:
# Preview the recoded flow table (From/To now hold True IDs).
df.head(n=5)

Unnamed: 0,From,To,Age1,Age2,Age3,Age4,Age5,Age6,Age7
0,1,1,15,1,6,7,1,0,0
1,1,2,207,42,105,60,0,0,0
2,1,3,3,0,2,1,0,0,0
3,1,4,1,0,0,1,0,0,0
4,1,5,5,1,2,2,0,0,0


In [14]:
# Attach the True ID to every boundary, order the layer by it, and export the
# result as a shapefile.
geom["ID"] = geom["cd"].map(lookupdict)
geom = geom.sort_values(by="ID")
geom.to_file("fullgeom.shp")

In [15]:
## Generating Work and Play Files

In [16]:
# "Work" size per origin area: the flows of every age band summed over all
# destinations, written out as (From, sum).
#
# Columns are selected with a list: indexing a DataFrame with a set raises in
# pandas >= 2.0 and never guaranteed a column order.
age_columns = ["Age1", "Age2", "Age3", "Age4", "Age5", "Age6", "Age7"]
work = df[["From"] + age_columns].groupby("From").sum()

# After the groupby, "From" is the index, so every remaining column is an age band.
work["sum"] = work[age_columns].sum(axis=1)
work = work.reset_index()[["From", "sum"]]
work.to_csv('worksize.csv')

## Generating Play file - take the MSOA, IZ and SOA population estimates from government sources and compile them into a master list.
 Merge this with the work file and subtract the work population flow from the total population to get an estimate of "play".
 
 Note: 571 areas have recorded "from" flows that are at least as large as the estimated population, giving zero or negative
 estimates for the play population.

In [17]:
# Scottish population estimates -> two-column DataFrame: code (cd), total population (tot).
# The file is read without a header (the first 3 lines are skipped), so columns
# are addressed by position.
# NOTE(review): column 0 is presumably the year, 1 the area code and 4 the
# all-ages total - confirm against the source file.
scotpop = pd.read_csv(
    "https://www.opendata.nhs.scot/dataset/7f010430-6ce1-4813-b25c-f7f335bdc4dc/resource/93df4c88-f74b-4630-abd8-459a19b12f47/download/iz2011-pop-est_02042020.csv",
    skiprows=3,
    header=None,
)
scotpop1 = scotpop[scotpop[0] == 2018]

# Keep only the code and total columns, selected with a list (set indexers raise
# in pandas >= 2.0). Column 3 used to be selected too but never reached the
# summed result; excluding it stops newer pandas from trying to sum it.
scotpop1 = scotpop1[[1, 4]]

# Drop the first two 2018 rows.
# NOTE(review): presumably national-level rows rather than zones - confirm.
scotpop1 = scotpop1.iloc[2:]

# One row per code, summing the remaining rows that share it.
# NOTE(review): confirm the summed rows do not include an "all" category that
# would double count.
scotpop1 = scotpop1.groupby(1).sum().reset_index()
scotpop1 = scotpop1.rename(columns={1: "cd", 4: "tot"})

In [18]:
# Northern Irish population estimates -> two-column DataFrame: code (cd), total population (tot).
nipop = pd.read_csv("https://www.opendatani.gov.uk/dataset/2135b9bf-eb38-4c81-b614-7b07e8fdfc82/resource/fc3f91dd-a0c7-488a-9996-bd057a7cfffe/download/super-output-areas-soas-by-gender-and-broad-age-bands-mid-2001-to-mid-2018.csv")

# Keep only the all-ages, all-persons rows for the 2018 mid-year estimate.
is_total_2018 = (
    (nipop['Age_Group'] == "All ages")
    & (nipop['Mid_Year_Ending'] == 2018)
    & (nipop['Gender'] == "All persons")
)

# Columns are selected with a list: set indexers raise in pandas >= 2.0 and
# never guaranteed a column order.
nipop1 = nipop.loc[is_total_2018, ["Geo_Code", "Population_Estimate"]]
nipop1 = nipop1.rename(columns={"Geo_Code": "cd", "Population_Estimate": "tot"})

In [19]:
## File from https://www.ons.gov.uk/file?uri=%2fpeoplepopulationandcommunity%2fpopulationandmigration%2fpopulationestimates%2fdatasets%2fmiddlesuperoutputareamidyearpopulationestimatesnationalstatistics%2fmid2018sape21dt14a/sape21dt15mid2018msoaquinaryestimatesunformatted.zip
# England and Wales population estimates -> two-column DataFrame: code (cd), total population (tot).
# Read from a local copy rather than the link, because indexing a sub-sheet
# inside a zipped xlsx is lengthy. Can be updated.
ewpop = pd.read_excel(
    "SAPE21DT15-mid-2018-msoa-quinary-estimates-unformatted.xlsx",
    sheet_name="Mid-2018 Persons",
    skiprows=4,
)

# Columns are selected with a list: set indexers raise in pandas >= 2.0 and
# never guaranteed a column order.
ewpop = ewpop[["Area Codes", "All Ages"]]
ewpop = ewpop.rename(columns={"Area Codes": "cd", "All Ages": "tot"})

In [20]:
# Master population estimate DataFrame (cd, tot), stacked in the order
# England & Wales, Northern Ireland, Scotland.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
totpop = pd.concat([ewpop, nipop1, scotpop1], ignore_index=True)

# Attach the True ID; codes missing from lookupdict get NaN.
totpop['ID'] = totpop.cd.map(lookupdict)

# Display only: show the areas ordered by population (totpop itself is not reordered).
totpop.sort_values(by=['tot'])

Unnamed: 0,tot,cd,ID
8065,382,95YY15S1,673
7679,881,95MM14S1,1889
8780,885,S02001925,9560
7201,1026,95AA01S1,1
7795,1038,95QQ03W1,1744
...,...,...,...
4330,19374,E02000891,331
243,19855,E02001731,929
242,19868,E02001730,1758
5103,21716,E02003475,2759


#### Finally generate playsize.csv by merging total population file with work file and generating "play" column

In [21]:
# Join the per-area "work" totals onto the population estimates by True ID.
# Left join: every area with a population estimate is kept.
newpop = totpop.merge(work, left_on="ID", right_on="From", how="left")

# Areas with no recorded outgoing flow have no work total; treat that as 0.
# Assigned back explicitly: a chained in-place replace on newpop['sum'] does not
# modify the frame under pandas Copy-on-Write.
newpop['sum'] = newpop['sum'].fillna(0)

# "Play" is the estimated population minus the work flow; it can be negative
# where recorded flows exceed the population estimate.
newpop['play'] = newpop['tot'] - newpop['sum']

playpop = newpop[['ID', 'play']].sort_values(by=['ID'])
playpop.to_csv('playsize.csv')

In [22]:
# Count the areas whose play estimate is zero or negative (first element of the shape).
playpop.loc[playpop['play'].le(0)].shape

(571, 2)