# Prepare FOI data for analysis

Prepare the FOI data for analysis.

TODO:
- Sort out the colours on all the charts, publish and password-protect them

In [4]:
import pandas as pd
import pycountry_convert as pc

## Concatenate and tidy the raw files

In [5]:
files = ["2010-01", "2012-01", "2014-01", "2016-01", "2018-01", "2020-01", "2021-08"]
df = pd.concat((pd.read_csv("./data/count-by-country-and-la-%s.csv" % f)
                 .set_index(['COUNTRY_NAME','DIST_NAME_VAR', 'TENURE'])
        .add_suffix("_%s" % f) for f in files), axis=1)
df = df.reset_index()

Some of the files have rogue whitespace, deal with this.

In [6]:
df.COUNTRY_NAME = df.COUNTRY_NAME.str.strip()
df.DIST_NAME_VAR = df.DIST_NAME_VAR.str.strip()
df.TENURE = df.TENURE.str.strip()

### Tidy up countries

Tidy up Ireland, which looks like the only significant duplicate country name.

In [7]:
df.replace({"EIRE": "IRELAND"}, inplace=True)

In [8]:
Add regions for countries.

SyntaxError: invalid syntax (851070858.py, line 1)

In [9]:
# df['CONTINENT'] = df.COUNTRY_NAME.apply(lambda x: pc.country_name_to_country_alpha2(x, cn_name_format="upper"))
# df.head()
def map_continent(country):
    british_territories_and_channel_islands = [
        'ANGUILLA', 'BERMUDA', 'BRITISH VIRGIN ISLANDS', 'CAYMAN ISLANDS', 
        'ASCENSION ISLAND', 'BRITISH ANTARCTIC TERRITORY', 'ST HELENA',
        'JERSEY', 'GUERNSEY', 'ALDERNEY', 'CHANNEL ISLANDS', 'SARK'
    ]
    continents = {
        'NA': 'North America',
        'SA': 'South America', 
        'AS': 'Asia',
        'OC': 'Australia',
        'AF': 'Africa',
        'EU': 'Europe',
        '': ''
    }
    missing_countries = {
        'ANTIGUA & BARBUDA': "North America",
        'ST LUCIA': "North America",
        'ST MARTIN': "North America",
        'ST VINCENT & GRENADINES': "North America",
        "CURACAO": "North America",
        "ST KITTS & NEVIS": "North America",
        'BOSNIA & HERZEGOVINA': 'Europe',
        'SERBIA AND MONTENEGRO': 'Europe',
        'FYR MACEDONIA': 'Europe',
        'KOSOVO': 'Europe',
        'THE PEOPLES REPUBLIC OF KOREA': 'Asia',
        'REPUBLIC OF KOREA': 'Asia',
        "EAST TIMOR": "Asia",
        'NIUE ISLAND': 'Asia',
        'PEOPLES REPUBLIC OF CONGO': 'Africa',
        'DEMOCRATIC REPUBLIC OF CONGO': 'Africa',
        "COTE D'IVOIRE": "Africa",
        # Not sure what to do about these tax havens(?), but the numbers are 
        # small so it doesn't matter very much for the purposes of the chart.
        'NETHERLANDS ANTILLES': 'Europe',
        'REUNION': 'Europe', 
        'SPANISH TERRITORY OF NORTH AFRICA': 'Eruope',
        'ST BARTHELEMY': 'Europe',
        'AUSTRALIAN ANTARCTIC TERRITORY': 'Australia',
        "US VIRGIN ISLANDS": "North America"
    }
    continent = ''
    if country in british_territories_and_channel_islands:
        continent = 'BOTs'
    else:
        try:
            country_code = pc.country_name_to_country_alpha2(country, cn_name_format="upper")
            continent_name = pc.country_alpha2_to_continent_code(country_code)
            continent = continents[continent_name]
        except KeyError:
            continent = missing_countries[country]
    return continent

continent_name = pc.country_alpha2_to_continent_code("AS")
df['CONTINENT'] = df.COUNTRY_NAME.apply(map_continent)
df.head()

Unnamed: 0,COUNTRY_NAME,DIST_NAME_VAR,TENURE,NO_TITLES_2010-01,NO_TITLES_2012-01,NO_TITLES_2014-01,NO_TITLES_2016-01,NO_TITLES_2018-01,NO_TITLES_2020-01,NO_TITLES_2021-08,CONTINENT
0,AFGHANISTAN,ASHFIELD,L,,,,,,,1.0,Asia
1,AFGHANISTAN,EAST RIDING OF YORKSHIRE,F,,,,,,,1.0,Asia
2,AFGHANISTAN,ASHFIELD,L,,,,,,1.0,,Asia
3,AFGHANISTAN,EAST RIDING OF YORKSHIRE,F,1.0,1.0,1.0,1.0,1.0,1.0,,Asia
4,ALBANIA,CORNWALL,L,,,,,,,1.0,Europe


### Tidy up local authorities

Tidy up local authority names - this is because there were administative changes during the period covered by the data.

In [10]:
df.replace({
    "SOUTH BUCKS": "BUCKINGHAMSHIRE",
    "CHILTERN": "BUCKINGHAMSHIRE",
    "WYCOMBE": "BUCKINGHAMSHIRE",
    "AYLESBURY VALE": "BUCKINGHAMSHIRE",
    "NORTH DORSET": "DORSET",
    "WEST DORSET": "DORSET",
    "EAST DORSET": "DORSET",
    "WEYMOUTH AND PORTLAND": "DORSET",
    "PURBECK": "DORSET",
    "BOURNEMOUTH": "BOURNEMOUTH, CHRISTCHURCH AND POOLE",
    "CHRISTCHURCH": "BOURNEMOUTH, CHRISTCHURCH AND POOLE",
    "POOLE": "BOURNEMOUTH, CHRISTCHURCH AND POOLE",
    "CORBY": "NORTH NORTHAMPTONSHIRE",
    "EAST NORTHAMPTONSHIRE": "NORTH NORTHAMPTONSHIRE",
    "KETTERING": "NORTH NORTHAMPTONSHIRE",
    "WELLINGBOROUGH": "NORTH NORTHAMPTONSHIRE",
    "DAVENTRY": "WEST NORTHAMPTONSHIRE",
    "NORTHAMPTON": "WEST NORTHAMPTONSHIRE",
    "SOUTH NORTHAMPTONSHIRE": "WEST NORTHAMPTONSHIRE",
    "ST EDMUNDSBURY": "WEST SUFFOLK",
    "FOREST HEATH": "WEST SUFFOLK",
    "SUFFOLK COASTAL": "EAST SUFFOLK",
    "WAVENEY": "EAST SUFFOLK",
    "SHEPWAY": "FOLKESTONE AND HYTHE",
    "TAUNTON DEANE": "SOMERSET WEST AND TAUNTON",
    "WEST SOMERSET": "SOMERSET WEST AND TAUNTON",
    "PENWITH": "CORNWALL",
    "KERRIER": "CORNWALL",
    "CARRICK": "CORNWALL",
    "RESTORMEL": "CORNWALL",
    "CARADON": "CORNWALL",
    "NORTH CORNWALL": "CORNWALL",
    "DURHAM": "COUNTY DURHAM",
    "EASINGTON": "COUNTY DURHAM",
    "SEDGEFIELD": "COUNTY DURHAM",
    "TEESDALE": "COUNTY DURHAM",
    "WEAR VALLEY": "COUNTY DURHAM",
    "DERWENTSIDE": "COUNTY DURHAM",
    "CHESTER-LE-STREET": "COUNTY DURHAM",
    "BLYTH VALLEY": "NORTHUMBERLAND",
    "WANSBECK": "NORTHUMBERLAND",
    "CASTLE MORPETH": "NORTHUMBERLAND",
    "TYNEDALE": "NORTHUMBERLAND",
    "ALNWICK": "NORTHUMBERLAND",
    "BERWICK-UPON-TWEED": "NORTHUMBERLAND",
    "NORTH SHROPSHIRE": "SHROPSHIRE",
    "OSWESTRY": "SHROPSHIRE",
    "SHREWSBURY AND ATCHAM": "SHROPSHIRE",
    "SOUTH SHROPSHIRE": "SHROPSHIRE",
    "BRIDGNORTH": "SHROPSHIRE",
    "SALISBURY": "WILTSHIRE",
    "WEST WILTSHIRE": "WILTSHIRE",
    "KENNET": "WILTSHIRE",
    "NORTH WILTSHIRE": "WILTSHIRE",
    "ELLESMERE PORT AND NESTON": "CHESHIRE WEST AND CHESTER",
    "VALE ROYAL": "CHESHIRE WEST AND CHESTER",
    "CHESTER": "CHESHIRE WEST AND CHESTER",
    "CREWE AND NANTWICH": "CHESHIRE EAST",
    "CONGLETON": "CHESHIRE EAST",
    "MACCLESFIELD": "CHESHIRE EAST",
    "SOUTH BEDFORDSHIRE": "CENTRAL BEDFORDSHIRE",
    "MID BEDFORDSHIRE": "CENTRAL BEDFORDSHIRE",
    "BEDFORDSHIRE": "CENTRAL BEDFORDSHIRE"
}, inplace=True)

In [11]:
df = df.groupby(["COUNTRY_NAME", "DIST_NAME_VAR", "TENURE"]).sum().reset_index()

In [12]:
df.fillna(0, inplace=True)

In [13]:
for f in files:
    df["TITLES_%s" % f.replace("-", "_")] = df["NO_TITLES_%s" % f].astype(int)
    df.drop("NO_TITLES_%s" % f, axis=1, inplace=True)
df.rename(columns={"COUNTRY_NAME": "COUNTRY", "DIST_NAME_VAR": "DIST"}, inplace=True)

In [14]:
df.head()

Unnamed: 0,COUNTRY,DIST,TENURE,TITLES_2010_01,TITLES_2012_01,TITLES_2014_01,TITLES_2016_01,TITLES_2018_01,TITLES_2020_01,TITLES_2021_08
0,AFGHANISTAN,ASHFIELD,L,0,0,0,0,0,1,1
1,AFGHANISTAN,EAST RIDING OF YORKSHIRE,F,1,1,1,1,1,1,1
2,ALBANIA,CORNWALL,L,0,0,0,0,0,1,1
3,ALBANIA,ENFIELD,L,0,0,0,0,0,1,0
4,ALBANIA,LAMBETH,L,0,0,0,0,0,1,1


In [15]:
df.to_csv("all_data.csv", index=False)

## Create a country-only version

It's useful to have a separate CSV file for easy review, and to save fiddling in the analysis notebook.

In [16]:
df_by_country = df.groupby("COUNTRY").sum().reset_index()
df_by_country["abs_change_10_21"] = df_by_country["TITLES_2021_08"] - df_by_country["TITLES_2010_01"]
df_by_country["percent_change_10_21"] = df_by_country["abs_change_10_21"] / df_by_country["TITLES_2010_01"] * 100.0

In [17]:
df_by_country.sort_values("TITLES_2021_08", ascending=False).to_csv("by_country.csv", index=False)

Long format, for use in the Flourish heatmap.

In [18]:
df_by_country\
    .drop(["abs_change_10_21", "percent_change_10_21"], axis=1)\
    .sort_values(by="TITLES_2021_08", ascending=False)\
    .head(15)\
    .melt(id_vars=["COUNTRY"])\
    .to_csv("by_country_long.csv", index=False)

Pivoted version, for use in the bar chart.

In [19]:
df_by_country.head()

Unnamed: 0,COUNTRY,TITLES_2010_01,TITLES_2012_01,TITLES_2014_01,TITLES_2016_01,TITLES_2018_01,TITLES_2020_01,TITLES_2021_08,abs_change_10_21,percent_change_10_21
0,AFGHANISTAN,1,1,1,1,1,2,2,1,100.0
1,ALBANIA,0,0,0,0,0,6,5,5,inf
2,ALDERNEY,31,37,51,49,52,49,47,16,51.612903
3,ALGERIA,16,25,37,37,40,54,55,39,243.75
4,ANDORRA,25,46,75,89,95,97,100,75,300.0


In [20]:
df_by_country\
    .drop(["abs_change_10_21", "percent_change_10_21"], axis=1)\
    .sort_values("TITLES_2021_08", ascending=False).set_index('COUNTRY')\
    .transpose().to_csv("barchart.csv", encoding='utf-8')

## Create a local authority-only version

For use in the grid of charts.

In [21]:
df_by_dist = df.groupby("DIST").sum().reset_index()
df_by_dist["abs_change_10_21"] = df_by_dist["TITLES_2021_08"] - df_by_dist["TITLES_2010_01"]
df_by_dist["percent_change_10_21"] = df_by_dist["abs_change_10_21"] / df_by_dist["TITLES_2010_01"] * 100.0
df_by_dist.sort_values("TITLES_2021_08", ascending=False).to_csv("by_dist.csv", index=False)

Long format, for use in the Flourish heatmap.

In [22]:
df_by_dist\
    .drop(["abs_change_10_21", "percent_change_10_21"], axis=1)\
    .sort_values(by="TITLES_2021_08", ascending=False)\
    .head(15)\
    .melt(id_vars=["DIST"])\
    .to_csv("by_dist_long.csv", index=False)

In [25]:
# Remapping for the Flourish LA maps.


    
    "WESTMINSTER": "CITY OF WESTMINSTER",
    
    
name_map = {
    "KINGSTON UPON HULL, CITY OF": "CITY OF KINGSTON UPON HULL",
    "NOTTINGHAM": "CITY OF NOTTINGHAM",
    "PETERBOROUGH": "CITY OF PETERBOROUGH",    
    "PLYMOUTH": "CITY OF PLYMOUTH",
    "BRISTOL, CITY OF": "CITY OF BRISTOL",
    "DERBY": "CITY OF DERBY", 
    "HEREFORDSHIRE, COUNTY OF": "HEREFORDSHIRE",
    "RHONDDA CYNON TAF": "RHONDDA CYNON TAFF",
    "ST. HELENS": "ST HELENS",
    "VALE OF GLAMORGAN": "THE VALE OF GLAMORGAN",
    "TELFORD AND WREKIN": "WREKIN",
}
# ROWS IN LAND REG DATA:

# North Northamptonshire was crated by the merger of the four existing non-metropolitan districts 
# of Corby, East Northamptonshire, Kettering, and Wellingborough.
# CORBY
# EAST NORTHAMPTONSHIRE
# KETTERING
# WELLINGBOROUGH

# West Northamptonshire was formed on 1 April 2021 through the merger of the three existing non-metropolitan 
# districts of Daventry, Northampton, and South Northamptonshire, 
# it absorbed the functions of these districts, plus those of the abolished Northamptonshire County Council.
# DAVENTRY
# DERBY
# NORTHAMPTON
# SOUTH NORTHAMPTONSHIRE

df_by_dist_flourish = df_by_dist.copy()
df_by_dist_flourish.DIST.replace(name_map, inplace=True)
df_by_dist_flourish.head(50)
# df_by_dist_flourish.to_csv("df_by_dist_flourish.csv", index=False)

Unnamed: 0,DIST,TITLES_2010_01,TITLES_2012_01,TITLES_2014_01,TITLES_2016_01,TITLES_2018_01,TITLES_2020_01,TITLES_2021_08,abs_change_10_21,percent_change_10_21
0,ADUR,41,61,71,79,86,95,96,55,134.146341
1,ALLERDALE,71,90,122,158,183,202,194,123,173.239437
2,AMBER VALLEY,44,67,90,129,137,161,163,119,270.454545
3,ARUN,196,258,290,326,366,446,438,242,123.469388
4,ASHFIELD,49,86,109,204,271,311,315,266,542.857143
5,ASHFORD,195,334,490,550,583,649,663,468,240.0
6,BABERGH,83,118,129,131,134,152,146,63,75.903614
7,BARKING AND DAGENHAM,169,219,273,316,289,321,311,142,84.023669
8,BARNET,777,1170,1553,1762,2056,2534,2579,1802,231.917632
9,BARNSLEY,120,156,289,352,420,574,463,343,285.833333


## Create a country and local authority version

In [69]:
df_by_country_and_dist = df.groupby(["COUNTRY", "DIST"]).sum().reset_index()
df_by_country_and_dist["abs_change_10_21"] = \
    df_by_country_and_dist["TITLES_2021_08"] - df_by_country_and_dist["TITLES_2010_01"]
df_by_country_and_dist["percent_change_10_21"] = \
    df_by_country_and_dist["abs_change_10_21"] / df_by_country_and_dist["TITLES_2010_01"] * 100.0
df_by_country_and_dist\
    .sort_values("TITLES_2021_08", ascending=False)\
    .to_csv("df_by_country_and_dist.csv", index=False)

## Have a quick look at tenures

In [70]:
df_by_tenure = df.groupby("TENURE").sum().reset_index()
df_by_tenure.sort_values("TITLES_2021_08", ascending=False).head()

Unnamed: 0,TENURE,TITLES_2010_01,TITLES_2012_01,TITLES_2014_01,TITLES_2016_01,TITLES_2018_01,TITLES_2020_01,TITLES_2021_08
0,F,50747,67976,84926,100991,112356,122611,123621
1,L,37063,52066,67377,83009,103213,121034,123395


In [71]:
# Rename cols (no longer used, but keeping in case useful)
# df_by_la_country_columns = df_by_la_country_columns.rename(columns=lambda x: x.strip())
# cols = [
#     "DIST_NAME_VAR", 
#      "GUERNSEY", "HONG KONG", "JERSEY", "ISLE OF MAN", "MALAYSIA",
#     "SINGAPORE", "UNITED ARAB EMIRATES", "UNITED STATES OF AMERICA",
# ]

In [72]:
# Pivot up the countries (no longer used, but keeping in case useful)
# df_by_la_country_columns = \
#      df_by_country_and_la.pivot(index="DIST_NAME_VAR", columns="COUNTRY_NAME", values="NO_TITLES")\
#.     .fillna(0).reset_index()
# df_by_la_country_columns.head()