# Preparation of Mapping

### Mapping Bricks to Region.xlsx
This notebook prepares the mapping from `Mapping Bricks to Region.xlsx`.

In [1]:
# Load required packages
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os

## Load data

In [2]:
# Relative path of the raw brick-to-region Excel file
raw_mapping_file = "../../0_raw_data/novartis_data/Mapping Bricks to Region.xlsx"

# Load the Excel file into a data frame
mapping = pd.read_excel(raw_mapping_file)

# Show the loaded data frame
mapping

Unnamed: 0,Country,Brick/ County,Sweden Breast Cancer,Sweden Melanoma Territory,Population
0,SE,02 Norrtälje,Stockholm,Stockholm ONCO,56845.0
1,SE,04 Uppsala,Uppsala,Uppsala ONCO,262099.0
2,SE,03 Enköping,Uppsala,Uppsala ONCO,54106.0
3,SE,05 Nyköping,Sörmland-Eskilstuna,Sörmland-Eskilstuna ONCO,86530.0
4,SE,06 Katrineholm,Sörmland-Eskilstuna,Sörmland-Eskilstuna ONCO,57921.0
...,...,...,...,...,...
75,SE,91 Malmö,Skåne-Lund,Skåne ONCO,312994.0
76,SE,92 Lund,Skåne-Lund,Skåne ONCO,206173.0
77,SE,93 Trelleborg,Skåne-Lund,Skåne ONCO,96711.0
78,SE,999 Unknown,SE-other,SE-other ONCO,


## Preparatory steps

In [3]:
# Replace the original Excel headers with short snake_case column names
column_names = {
    "Country": "country",
    "Brick/ County": "brick",
    "Sweden Breast Cancer": "sweden_bc",
    "Sweden Melanoma Territory": "sweden_me",
    "Population": "population",
}
mapping = mapping.rename(columns=column_names)

The `population` column is dropped because it does not contain the population information we want to use; we instead work with the population information from 'Sweden Population by brick 2022'. The `country` column is dropped in the same step.

In [4]:
# Drop irrelevant columns.
# The result is reassigned instead of using inplace=True, consistent with the
# rename step above, and `columns=` makes the dropped axis explicit.
mapping = mapping.drop(columns=["country", "population"])

In [5]:
# Store the brick and the two region label columns as pandas categoricals
categorical_columns = ["brick", "sweden_bc", "sweden_me"]
mapping = mapping.astype({column: "category" for column in categorical_columns})

In [6]:
# Inspect the data frame after renaming, dropping columns and casting dtypes
mapping

Unnamed: 0,brick,sweden_bc,sweden_me
0,02 Norrtälje,Stockholm,Stockholm ONCO
1,04 Uppsala,Uppsala,Uppsala ONCO
2,03 Enköping,Uppsala,Uppsala ONCO
3,05 Nyköping,Sörmland-Eskilstuna,Sörmland-Eskilstuna ONCO
4,06 Katrineholm,Sörmland-Eskilstuna,Sörmland-Eskilstuna ONCO
...,...,...,...
75,91 Malmö,Skåne-Lund,Skåne ONCO
76,92 Lund,Skåne-Lund,Skåne ONCO
77,93 Trelleborg,Skåne-Lund,Skåne ONCO
78,999 Unknown,SE-other,SE-other ONCO


In [7]:
# Remove rows with brick == '999 Unknown' and brick == '99 Unknown'
unknown_bricks = ["999 Unknown", "99 Unknown"]
mapping = mapping[~mapping["brick"].isin(unknown_bricks)].reset_index(drop=True)

# Filtering rows of a categorical column does not remove the corresponding
# categories: labels that no longer occur in any row would otherwise stay
# attached to the columns (and be saved with the frame). Drop them explicitly.
for column in mapping.select_dtypes(include="category").columns:
    mapping[column] = mapping[column].cat.remove_unused_categories()

In [8]:
# Inspect the data frame after removing the 'Unknown' bricks
mapping

Unnamed: 0,brick,sweden_bc,sweden_me
0,02 Norrtälje,Stockholm,Stockholm ONCO
1,04 Uppsala,Uppsala,Uppsala ONCO
2,03 Enköping,Uppsala,Uppsala ONCO
3,05 Nyköping,Sörmland-Eskilstuna,Sörmland-Eskilstuna ONCO
4,06 Katrineholm,Sörmland-Eskilstuna,Sörmland-Eskilstuna ONCO
...,...,...,...
73,85 Kungälv,Västra Götaland-Göteborg,Västra Götaland-Göteborg ONCO
74,86 Lerum/Alingsås,Västra Götaland-Alingsås,Västra Götaland-SÄS ONCO
75,91 Malmö,Skåne-Lund,Skåne ONCO
76,92 Lund,Skåne-Lund,Skåne ONCO


In [9]:
# Save the prepared data frame
route0 = "../processed_data"
mapping_file = f"{route0}/mapping.pkl"

# makedirs(..., exist_ok=True) also creates missing parent directories and
# does not raise if the directory already exists, so no separate existence
# check is needed.
os.makedirs(route0, exist_ok=True)

print("saving file corresponding to mapping.pkl")
mapping.to_pickle(mapping_file)

# Read the file back and display it to confirm what was written
pd.read_pickle(mapping_file)

saving file corresponding to mapping.pkl


Unnamed: 0,brick,sweden_bc,sweden_me
0,02 Norrtälje,Stockholm,Stockholm ONCO
1,04 Uppsala,Uppsala,Uppsala ONCO
2,03 Enköping,Uppsala,Uppsala ONCO
3,05 Nyköping,Sörmland-Eskilstuna,Sörmland-Eskilstuna ONCO
4,06 Katrineholm,Sörmland-Eskilstuna,Sörmland-Eskilstuna ONCO
...,...,...,...
73,85 Kungälv,Västra Götaland-Göteborg,Västra Götaland-Göteborg ONCO
74,86 Lerum/Alingsås,Västra Götaland-Alingsås,Västra Götaland-SÄS ONCO
75,91 Malmö,Skåne-Lund,Skåne ONCO
76,92 Lund,Skåne-Lund,Skåne ONCO
