# Preparation of Melanoma Statistics

### Statistikdatabasen_Malignant Melanoma 17_09_2021 14_13_18.xlsx
This notebook prepares the melanoma statistics data from `Statistikdatabasen_Malignant Melanoma 17_09_2021 14_13_18.xlsx`.

In [1]:
# Load required packages
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os

## Load data

In [2]:
# Load the raw Excel export into a data frame
raw_xlsx_path = "../../0_raw_data/public_data/Statistikdatabasen_Malignant Melanoma 17_09_2021 14_13_18.xlsx"
stats_me = pd.read_excel(raw_xlsx_path)

# Show the frame exactly as it was read (no clean-up applied yet)
stats_me

Unnamed: 0,"Number of new cancer cases, Age: 0-85+, Diagnos:190 Malignant Melanoma Of Skin, irrespective of tumour type",Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 48,Unnamed: 49,Unnamed: 50,Unnamed: 51
0,Region,Sex,1970.0,1971.0,1972.0,1973.0,1974.0,1975.0,1976.0,1977.0,...,2010.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0,2018.0,2019.0
1,Entire Sweden,Men,241.0,242.0,256.0,291.0,282.0,334.0,351.0,378.0,...,1516.0,1696.0,1738.0,1712.0,1952.0,2038.0,2231.0,2206.0,2374.0,2419.0
2,Entire Sweden,Woman,316.0,285.0,318.0,332.0,345.0,324.0,375.0,434.0,...,1386.0,1673.0,1683.0,1675.0,1876.0,1939.0,2068.0,1980.0,2171.0,2152.0
3,Stockholms län,Men,57.0,46.0,48.0,48.0,58.0,62.0,50.0,62.0,...,313.0,312.0,316.0,318.0,418.0,378.0,401.0,384.0,424.0,388.0
4,Stockholms län,Woman,63.0,41.0,50.0,65.0,57.0,64.0,61.0,72.0,...,270.0,331.0,308.0,322.0,348.0,323.0,374.0,345.0,353.0,317.0
5,Uppsala län,Men,6.0,3.0,4.0,8.0,7.0,6.0,10.0,13.0,...,40.0,46.0,67.0,61.0,50.0,64.0,63.0,78.0,82.0,74.0
6,Uppsala län,Woman,4.0,6.0,10.0,7.0,9.0,8.0,10.0,9.0,...,34.0,71.0,72.0,64.0,54.0,62.0,70.0,70.0,71.0,57.0
7,Södermanlands län,Men,7.0,9.0,7.0,13.0,5.0,6.0,10.0,14.0,...,55.0,58.0,40.0,40.0,35.0,52.0,64.0,64.0,52.0,54.0
8,Södermanlands län,Woman,5.0,5.0,11.0,7.0,9.0,8.0,8.0,9.0,...,46.0,37.0,32.0,27.0,45.0,46.0,51.0,48.0,41.0,44.0
9,Östergötlands län,Men,16.0,12.0,7.0,14.0,13.0,8.0,12.0,12.0,...,103.0,95.0,106.0,96.0,102.0,161.0,146.0,151.0,163.0,178.0


## Preparatory steps

In [3]:
# Replace header with first row of data frame.
# The export's real column labels ("Region", "Sex", then one label per year)
# sit in the first data row, so that row's values become the column labels.
# The row itself is still present in the frame after this assignment.
stats_me.columns = stats_me.iloc[0]

In [4]:
# Keep only the rows labelled 1 through 44 (label-based slice, both ends inclusive);
# this drops row 0, whose values were used as the header, and every row after label 44
stats_me = stats_me.loc[1:44]

In [5]:
# Rename all columns: lower-case the two identifier columns and turn the
# float year labels (1970.0 ... 2019.0) into plain strings ("1970" ... "2019")
column_names = {"Region": "region", "Sex": "sex"}
column_names.update({float(year): str(year) for year in range(1970, 2020)})
stats_me = stats_me.rename(columns = column_names)

In [6]:
# Cast to appropriate data type.
# 'region' and 'sex' become categoricals; every remaining column (the per-year
# count columns) is cast to float explicitly. The previous loop only picked
# columns whose dtype was already float, so it could never change a dtype and
# would have silently skipped any count column that had been read as object.
id_cols = ["region", "sex"]
dtype_map = {col: "category" for col in id_cols}
dtype_map.update({col: "float" for col in stats_me.columns if col not in id_cols})
# A single astype call returns a new frame instead of assigning column by column.
stats_me = stats_me.astype(dtype_map)

In [7]:
# Display the wide-format frame (one column per year) after the clean-up steps above
stats_me

Unnamed: 0,region,sex,1970,1971,1972,1973,1974,1975,1976,1977,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
1,Entire Sweden,Men,241.0,242.0,256.0,291.0,282.0,334.0,351.0,378.0,...,1516.0,1696.0,1738.0,1712.0,1952.0,2038.0,2231.0,2206.0,2374.0,2419.0
2,Entire Sweden,Woman,316.0,285.0,318.0,332.0,345.0,324.0,375.0,434.0,...,1386.0,1673.0,1683.0,1675.0,1876.0,1939.0,2068.0,1980.0,2171.0,2152.0
3,Stockholms län,Men,57.0,46.0,48.0,48.0,58.0,62.0,50.0,62.0,...,313.0,312.0,316.0,318.0,418.0,378.0,401.0,384.0,424.0,388.0
4,Stockholms län,Woman,63.0,41.0,50.0,65.0,57.0,64.0,61.0,72.0,...,270.0,331.0,308.0,322.0,348.0,323.0,374.0,345.0,353.0,317.0
5,Uppsala län,Men,6.0,3.0,4.0,8.0,7.0,6.0,10.0,13.0,...,40.0,46.0,67.0,61.0,50.0,64.0,63.0,78.0,82.0,74.0
6,Uppsala län,Woman,4.0,6.0,10.0,7.0,9.0,8.0,10.0,9.0,...,34.0,71.0,72.0,64.0,54.0,62.0,70.0,70.0,71.0,57.0
7,Södermanlands län,Men,7.0,9.0,7.0,13.0,5.0,6.0,10.0,14.0,...,55.0,58.0,40.0,40.0,35.0,52.0,64.0,64.0,52.0,54.0
8,Södermanlands län,Woman,5.0,5.0,11.0,7.0,9.0,8.0,8.0,9.0,...,46.0,37.0,32.0,27.0,45.0,46.0,51.0,48.0,41.0,44.0
9,Östergötlands län,Men,16.0,12.0,7.0,14.0,13.0,8.0,12.0,12.0,...,103.0,95.0,106.0,96.0,102.0,161.0,146.0,151.0,163.0,178.0
10,Östergötlands län,Woman,14.0,17.0,14.0,12.0,13.0,7.0,17.0,17.0,...,107.0,78.0,94.0,85.0,103.0,133.0,146.0,120.0,133.0,130.0


In [8]:
# Reshape from wide (one column per year) to long format: one row per
# region / sex / year combination, with the case count stored in 'count_me'
stats_me = stats_me.melt(
    id_vars = ['region', 'sex'],
    var_name = "year",
    value_name = "count_me",
)

In [9]:
# Display the long-format frame produced by the melt
stats_me

Unnamed: 0,region,sex,year,count_me
0,Entire Sweden,Men,1970,241.0
1,Entire Sweden,Woman,1970,316.0
2,Stockholms län,Men,1970,57.0
3,Stockholms län,Woman,1970,63.0
4,Uppsala län,Men,1970,6.0
...,...,...,...,...
2195,Jämtlands län,Woman,2019,17.0
2196,Västerbottens län,Men,2019,57.0
2197,Västerbottens län,Woman,2019,40.0
2198,Norrbottens län,Men,2019,27.0


In [10]:
# Set the dtype of every column of the long-format frame in one call
stats_me = stats_me.astype({
    "region": "category",
    "sex": "category",
    "year": "int",  # change to date later on
    "count_me": "float",
})

In [11]:
# Order the rows by region, then sex, then year, and renumber the index from 0
sort_keys = ['region', 'sex', 'year']
stats_me = stats_me.sort_values(by = sort_keys, ignore_index = True)

In [12]:
# Display the sorted long-format frame
stats_me

Unnamed: 0,region,sex,year,count_me
0,Blekinge län,Men,1970,3.0
1,Blekinge län,Men,1971,6.0
2,Blekinge län,Men,1972,6.0
3,Blekinge län,Men,1973,4.0
4,Blekinge län,Men,1974,5.0
...,...,...,...,...
2195,Östergötlands län,Woman,2015,133.0
2196,Östergötlands län,Woman,2016,146.0
2197,Östergötlands län,Woman,2017,120.0
2198,Östergötlands län,Woman,2018,133.0


In [13]:
# Save the prepared data frame
route0 = "../processed_data"

# Create the output directory if it does not exist yet. exist_ok=True replaces
# the separate exists-check followed by mkdir, so there is no gap between the
# check and the creation, and re-running the cell is a no-op for the directory.
os.makedirs(route0, exist_ok=True)

# Build the target path once so the write and the read-back cannot drift apart.
pickle_path = f"{route0}/stats_me.pkl"

print("saving file corresponding to stats_me.pkl")
stats_me.to_pickle(pickle_path)

# Read the file back and display it as the cell output, as a round-trip check.
pd.read_pickle(pickle_path)

saving file corresponding to stats_me.pkl


Unnamed: 0,region,sex,year,count_me
0,Blekinge län,Men,1970,3.0
1,Blekinge län,Men,1971,6.0
2,Blekinge län,Men,1972,6.0
3,Blekinge län,Men,1973,4.0
4,Blekinge län,Men,1974,5.0
...,...,...,...,...
2195,Östergötlands län,Woman,2015,133.0
2196,Östergötlands län,Woman,2016,146.0
2197,Östergötlands län,Woman,2017,120.0
2198,Östergötlands län,Woman,2018,133.0
