# Preparation of Breast Cancer Statistics

### Statistikdatabasen_Breast Cancer_17_09_2021 14_15_07.xlsx
This notebook prepares the breast cancer statistics data from `Statistikdatabasen_Breast Cancer_17_09_2021 14_15_07.xlsx`.

In [1]:
# Load required packages
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os

## Load data

In [2]:
# Relative path of the raw Excel export
raw_xlsx = "../../0_raw_data/public_data/Statistikdatabasen_Breast Cancer_17_09_2021 14_15_07.xlsx"

# Load the workbook into a data frame
stats_bc = pd.read_excel(raw_xlsx)

# Show the loaded frame as the cell output
stats_bc

Unnamed: 0,"Number of new cancer cases, Age: 0-85+, Diagnos:170 Breast, irrespective of tumour type",Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 48,Unnamed: 49,Unnamed: 50,Unnamed: 51
0,Region,Sex,1970.0,1971.0,1972.0,1973.0,1974.0,1975.0,1976.0,1977.0,...,2010.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0,2018.0,2019.0
1,Entire Sweden,Men,41.0,34.0,33.0,28.0,32.0,29.0,22.0,31.0,...,33.0,45.0,41.0,51.0,61.0,61.0,63.0,42.0,63.0,64.0
2,Entire Sweden,Woman,3392.0,3444.0,3563.0,3611.0,3769.0,3681.0,3935.0,4199.0,...,7985.0,8474.0,8603.0,9215.0,9771.0,9423.0,9164.0,10557.0,10135.0,10829.0
3,Stockholms län,Men,7.0,2.0,4.0,4.0,4.0,6.0,9.0,4.0,...,6.0,10.0,7.0,11.0,7.0,14.0,6.0,8.0,14.0,14.0
4,Stockholms län,Woman,690.0,706.0,698.0,744.0,770.0,746.0,782.0,888.0,...,1612.0,1621.0,1738.0,1686.0,1814.0,1719.0,1609.0,2255.0,2077.0,2192.0
5,Uppsala län,Men,2.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,...,1.0,1.0,2.0,3.0,1.0,1.0,1.0,3.0,1.0,2.0
6,Uppsala län,Woman,85.0,102.0,103.0,107.0,113.0,79.0,95.0,110.0,...,273.0,258.0,306.0,347.0,329.0,293.0,258.0,371.0,279.0,315.0
7,Södermanlands län,Men,0.0,2.0,1.0,2.0,0.0,1.0,1.0,1.0,...,0.0,3.0,2.0,0.0,2.0,2.0,0.0,2.0,1.0,1.0
8,Södermanlands län,Woman,99.0,95.0,103.0,100.0,116.0,120.0,112.0,102.0,...,215.0,242.0,166.0,220.0,228.0,216.0,298.0,296.0,286.0,261.0
9,Östergötlands län,Men,4.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,5.0,3.0,2.0,0.0,1.0,0.0,3.0,1.0,1.0,1.0


## Preparatory steps

In [3]:
# Use the first row (position 0) as the column labels
header_row = stats_bc.iloc[0]
stats_bc.columns = header_row

In [4]:
# Keep only the rows labelled 1 through 44 (label slicing with .loc includes
# both ends); row 0 and every row after 44 are dropped
first_row, last_row = 1, 44
stats_bc = stats_bc.loc[first_row:last_row]

In [5]:
# Rename all columns: lower-case the two identifier columns and replace the
# float year labels (1970.0 ... 2019.0) with strings ("1970" ... "2019")
column_names = {"Region": "region", "Sex": "sex"}
column_names.update({float(year): str(year) for year in range(1970, 2020)})
stats_bc = stats_bc.rename(columns=column_names)

In [6]:
# Cast to appropriate data type
# 'region' and 'sex' identify a row; every other column is a per-year column.
id_cols = ["region", "sex"]
year_cols = [col for col in stats_bc.columns if col not in id_cols]

# The previous loop only picked columns that were already float and cast them
# to float again, so it never converted anything. Casting every year column
# explicitly makes sure they are float even if they were read as object.
dtype_map = {col: "category" for col in id_cols}
dtype_map.update({col: "float" for col in year_cols})
stats_bc = stats_bc.astype(dtype_map)

In [7]:
# Display the data frame (still one column per year) as the cell output
stats_bc

Unnamed: 0,region,sex,1970,1971,1972,1973,1974,1975,1976,1977,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
1,Entire Sweden,Men,41.0,34.0,33.0,28.0,32.0,29.0,22.0,31.0,...,33.0,45.0,41.0,51.0,61.0,61.0,63.0,42.0,63.0,64.0
2,Entire Sweden,Woman,3392.0,3444.0,3563.0,3611.0,3769.0,3681.0,3935.0,4199.0,...,7985.0,8474.0,8603.0,9215.0,9771.0,9423.0,9164.0,10557.0,10135.0,10829.0
3,Stockholms län,Men,7.0,2.0,4.0,4.0,4.0,6.0,9.0,4.0,...,6.0,10.0,7.0,11.0,7.0,14.0,6.0,8.0,14.0,14.0
4,Stockholms län,Woman,690.0,706.0,698.0,744.0,770.0,746.0,782.0,888.0,...,1612.0,1621.0,1738.0,1686.0,1814.0,1719.0,1609.0,2255.0,2077.0,2192.0
5,Uppsala län,Men,2.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,...,1.0,1.0,2.0,3.0,1.0,1.0,1.0,3.0,1.0,2.0
6,Uppsala län,Woman,85.0,102.0,103.0,107.0,113.0,79.0,95.0,110.0,...,273.0,258.0,306.0,347.0,329.0,293.0,258.0,371.0,279.0,315.0
7,Södermanlands län,Men,0.0,2.0,1.0,2.0,0.0,1.0,1.0,1.0,...,0.0,3.0,2.0,0.0,2.0,2.0,0.0,2.0,1.0,1.0
8,Södermanlands län,Woman,99.0,95.0,103.0,100.0,116.0,120.0,112.0,102.0,...,215.0,242.0,166.0,220.0,228.0,216.0,298.0,296.0,286.0,261.0
9,Östergötlands län,Men,4.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,5.0,3.0,2.0,0.0,1.0,0.0,3.0,1.0,1.0,1.0
10,Östergötlands län,Woman,156.0,162.0,155.0,167.0,187.0,156.0,181.0,191.0,...,379.0,381.0,408.0,413.0,503.0,454.0,410.0,410.0,405.0,500.0


In [8]:
# Reshape from wide to long: the non-identifier columns become a 'year'
# column and their values go into 'count_bc'
stats_bc = stats_bc.melt(
    id_vars=["region", "sex"],
    var_name="year",
    value_name="count_bc",
)

In [9]:
# Display the data frame after reshaping to long format
stats_bc

Unnamed: 0,region,sex,year,count_bc
0,Entire Sweden,Men,1970,41.0
1,Entire Sweden,Woman,1970,3392.0
2,Stockholms län,Men,1970,7.0
3,Stockholms län,Woman,1970,690.0
4,Uppsala län,Men,1970,2.0
...,...,...,...,...
2195,Jämtlands län,Woman,2019,119.0
2196,Västerbottens län,Men,2019,1.0
2197,Västerbottens län,Woman,2019,305.0
2198,Norrbottens län,Men,2019,0.0


In [10]:
# Cast to appropriate data type (all four columns in a single astype call)
stats_bc = stats_bc.astype(
    {
        "region": "category",
        "sex": "category",
        "year": "int",  # change to date later on
        "count_bc": "float",
    }
)

In [11]:
# Sort by 'region', 'sex', 'year' and renumber the index from 0
sort_keys = ['region', 'sex', 'year']
stats_bc = stats_bc.sort_values(by=sort_keys, ignore_index=True)

In [12]:
# Display the sorted data frame
stats_bc

Unnamed: 0,region,sex,year,count_bc
0,Blekinge län,Men,1970,3.0
1,Blekinge län,Men,1971,1.0
2,Blekinge län,Men,1972,2.0
3,Blekinge län,Men,1973,1.0
4,Blekinge län,Men,1974,4.0
...,...,...,...,...
2195,Östergötlands län,Woman,2015,454.0
2196,Östergötlands län,Woman,2016,410.0
2197,Östergötlands län,Woman,2017,410.0
2198,Östergötlands län,Woman,2018,405.0


In [13]:
# Save the prepared data frame
route0 = "../processed_data"

# Create the output directory if it is missing. exist_ok=True replaces the
# separate exists-then-mkdir check and is a no-op when the directory is
# already there; missing parent directories are created as well.
os.makedirs(route0, exist_ok=True)

# Build the target path once so the write and the read-back cannot diverge
output_file = f"{route0}/stats_bc.pkl"

print("saving file corresponding to stats_bc.pkl")
stats_bc.to_pickle(output_file)

# Read the file back so the cell output shows what was actually written
pd.read_pickle(output_file)

saving file corresponding to stats_bc.pkl


Unnamed: 0,region,sex,year,count_bc
0,Blekinge län,Men,1970,3.0
1,Blekinge län,Men,1971,1.0
2,Blekinge län,Men,1972,2.0
3,Blekinge län,Men,1973,1.0
4,Blekinge län,Men,1974,4.0
...,...,...,...,...
2195,Östergötlands län,Woman,2015,454.0
2196,Östergötlands län,Woman,2016,410.0
2197,Östergötlands län,Woman,2017,410.0
2198,Östergötlands län,Woman,2018,405.0
