# Preparation of Patients

### Patients Melanoma + BC market Updated.xlsx
This notebook prepares the patients data from the sheet `Sheet3` in `Patients Melanoma + BC market Updated.xlsx`.

In [1]:
# Load required packages
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os

## Load data

In [2]:
# Load worksheet 'Sheet3' of the raw Excel workbook into a data frame
patients = pd.read_excel(
    "../../0_raw_data/novartis_data/Patients Melanoma + BC market Updated.xlsx",
    sheet_name = 'Sheet3',
)

# Show the raw data frame exactly as it was loaded
patients

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27
0,,Final Brand,Region,Territory,2019-09-01 00:00:00,2019-10-01 00:00:00,2019-11-01 00:00:00,2019-12-01 00:00:00,2020-01-01 00:00:00,2020-02-01 00:00:00,...,2020-11-01 00:00:00,2020-12-01 00:00:00,2021-01-01 00:00:00,2021-02-01 00:00:00,2021-03-01 00:00:00,2021-04-01 00:00:00,2021-05-01 00:00:00,2021-06-01 00:00:00,2021-07-01 00:00:00,2021-08-01 00:00:00
1,,TAF_MEK COMBO,SWEDEN,Blekinge ONCO,3.52941,7.35294,3.52941,3.23529,2.94118,2.94118,...,5,3.52941,2.35294,1.17647,2.35294,2.35294,2.35294,3.52941,4.98039,3.52941
2,,BRAFTOVI-MEKTOVI,SWEDEN,Blekinge ONCO,,,,,,,...,,,,1,2.5,1,2,2,2,1
3,,TAF_MEK COMBO,SWEDEN,Dalarna ONCO,3.52941,4.70588,5.88235,1.17647,2.35294,2.35294,...,2.35294,-1.17647,1.17647,1.17647,4.41176,3.23529,2.05882,3.23529,1.76471,3.23529
4,,BRAFTOVI-MEKTOVI,SWEDEN,Dalarna ONCO,2,1,3.5,2.5,5,3.5,...,4.5,6.5,8.5,8,7.5,5,6.5,8,6.5,5.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,,KISQALI,SWEDEN,Östergötland-Linköping,,,,,2.75229,2.75229,...,7.33945,7.33945,7.33945,8.25688,3.66972,2.75229,2.75229,8.25688,9.17431,10.0917
143,,VERZENIOS,SWEDEN,Östergötland-Linköping,,,,,,,...,0.917431,1.83486,0.921659,1.83486,0.917431,0.917431,1.83486,0.917431,0.917431,1.83486
144,,IBRANCE,SWEDEN,Östergötland-Norrköping,9.17431,6.42202,8.25688,6.42202,10.0917,9.17431,...,16.5138,16.5138,18.3486,14.6789,20.1835,21.1009,19.2661,18.3486,22.0183,18.3486
145,,KISQALI,SWEDEN,Östergötland-Norrköping,,,,,,,...,,1.83486,,0.917431,,,,,,


## Preparatory steps

In [3]:
# Drop first column ("Unnamed: 0"), which appears empty in the raw preview above.
# Rebinding (instead of `inplace = True`) leaves the frame object that the
# loading cell displayed untouched and keeps the step chainable.
patients = patients.drop(columns = ["Unnamed: 0"])

In [4]:
# Remove row with index 0: it holds the real column titles (brand, region,
# territory and the month stamps), not patient figures.
# Slice by position so that every remaining row is kept regardless of how many
# rows the sheet contains, instead of relying on a hard-coded last row label.
patients = patients.iloc[1:].reset_index(drop=True)

In [5]:
# Rename all columns: three identifier columns followed by one column per month.
id_names = {"Unnamed: 1": "final_brand", "Unnamed: 2": "region", "Unnamed: 3": "territory"}

# The monthly labels "2019-09-01" ... "2021-08-01" (month starts) belong to the
# columns "Unnamed: 4" ... "Unnamed: 27"; generating them avoids 24 hand-typed pairs.
months = pd.date_range(start = "2019-09-01", end = "2021-08-01", freq = "MS").strftime("%Y-%m-%d")
month_names = {f"Unnamed: {pos}": month for pos, month in enumerate(months, start = 4)}

patients = patients.rename(columns = {**id_names, **month_names})

In [6]:
# Drop irrelevant columns.
# Rebinding (instead of `inplace = True`) returns a new frame and does not
# mutate the object produced by the previous step.
patients = patients.drop(columns = ["region"])

In [7]:
# Cast to appropriate data type: the two identifier columns become categoricals,
# every other column (the monthly figures) becomes float.
id_cols = ["final_brand", "territory"]
month_cols = [col for col in patients.columns if col not in id_cols]

# A single astype call with a column -> dtype mapping replaces one cast per column
patients = patients.astype({
    **{col: 'category' for col in id_cols},
    **{col: 'float' for col in month_cols},
})

In [8]:
# Replace negative values by 0, replace NaNs by 0, round values to whole numbers.
# `where` keeps a value only where it is > 0 and writes 0 elsewhere; the
# comparison is False for NaN too, so missing values are zeroed in the same
# vectorised step and no separate fillna is required.
for col in patients.select_dtypes(include = ['float']).columns:
    monthly = patients[col]
    patients[col] = monthly.where(monthly > 0, 0).round(0)

In [9]:
# Inspect the wide data frame after dropping, renaming, casting and cleaning
patients

Unnamed: 0,final_brand,territory,2019-09-01,2019-10-01,2019-11-01,2019-12-01,2020-01-01,2020-02-01,2020-03-01,2020-04-01,...,2020-11-01,2020-12-01,2021-01-01,2021-02-01,2021-03-01,2021-04-01,2021-05-01,2021-06-01,2021-07-01,2021-08-01
0,TAF_MEK COMBO,Blekinge ONCO,4.0,7.0,4.0,3.0,3.0,3.0,5.0,2.0,...,5.0,4.0,2.0,1.0,2.0,2.0,2.0,4.0,5.0,4.0
1,BRAFTOVI-MEKTOVI,Blekinge ONCO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,2.0,1.0,2.0,2.0,2.0,1.0
2,TAF_MEK COMBO,Dalarna ONCO,4.0,5.0,6.0,1.0,2.0,2.0,6.0,1.0,...,2.0,0.0,1.0,1.0,4.0,3.0,2.0,3.0,2.0,3.0
3,BRAFTOVI-MEKTOVI,Dalarna ONCO,2.0,1.0,4.0,2.0,5.0,4.0,4.0,4.0,...,4.0,6.0,8.0,8.0,8.0,5.0,6.0,8.0,6.0,6.0
4,TAF_MEK COMBO,Gävleborg-Gävle ONCO,4.0,7.0,2.0,4.0,5.0,4.0,4.0,8.0,...,14.0,4.0,7.0,5.0,5.0,6.0,3.0,3.0,6.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,KISQALI,Östergötland-Linköping,0.0,0.0,0.0,0.0,3.0,3.0,2.0,4.0,...,7.0,7.0,7.0,8.0,4.0,3.0,3.0,8.0,9.0,10.0
142,VERZENIOS,Östergötland-Linköping,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0
143,IBRANCE,Östergötland-Norrköping,9.0,6.0,8.0,6.0,10.0,9.0,10.0,13.0,...,17.0,17.0,18.0,15.0,20.0,21.0,19.0,18.0,22.0,18.0
144,KISQALI,Östergötland-Norrköping,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,...,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Keep only the rows whose territory is neither 'SE-other' nor 'SE-other ONCO'
excluded = ['SE-other', 'SE-other ONCO']
keep_row = ~patients["territory"].isin(excluded)
patients = patients[keep_row].reset_index(drop=True)

In [11]:
# Inspect the data frame after the 'SE-other' territories were removed
patients

Unnamed: 0,final_brand,territory,2019-09-01,2019-10-01,2019-11-01,2019-12-01,2020-01-01,2020-02-01,2020-03-01,2020-04-01,...,2020-11-01,2020-12-01,2021-01-01,2021-02-01,2021-03-01,2021-04-01,2021-05-01,2021-06-01,2021-07-01,2021-08-01
0,TAF_MEK COMBO,Blekinge ONCO,4.0,7.0,4.0,3.0,3.0,3.0,5.0,2.0,...,5.0,4.0,2.0,1.0,2.0,2.0,2.0,4.0,5.0,4.0
1,BRAFTOVI-MEKTOVI,Blekinge ONCO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,2.0,1.0,2.0,2.0,2.0,1.0
2,TAF_MEK COMBO,Dalarna ONCO,4.0,5.0,6.0,1.0,2.0,2.0,6.0,1.0,...,2.0,0.0,1.0,1.0,4.0,3.0,2.0,3.0,2.0,3.0
3,BRAFTOVI-MEKTOVI,Dalarna ONCO,2.0,1.0,4.0,2.0,5.0,4.0,4.0,4.0,...,4.0,6.0,8.0,8.0,8.0,5.0,6.0,8.0,6.0,6.0
4,TAF_MEK COMBO,Gävleborg-Gävle ONCO,4.0,7.0,2.0,4.0,5.0,4.0,4.0,8.0,...,14.0,4.0,7.0,5.0,5.0,6.0,3.0,3.0,6.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,KISQALI,Östergötland-Linköping,0.0,0.0,0.0,0.0,3.0,3.0,2.0,4.0,...,7.0,7.0,7.0,8.0,4.0,3.0,3.0,8.0,9.0,10.0
137,VERZENIOS,Östergötland-Linköping,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0
138,IBRANCE,Östergötland-Norrköping,9.0,6.0,8.0,6.0,10.0,9.0,10.0,13.0,...,17.0,17.0,18.0,15.0,20.0,21.0,19.0,18.0,22.0,18.0
139,KISQALI,Östergötland-Norrköping,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,...,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


The brands are already grouped as follows:

a) Kisqali (Novartis), Verzenios, Ibrance for breast cancer and

b) Tafinlar+Mekinist (Novartis), Braftovi+Mektovi, Zelboraf+Cotellic for melanoma.

In [12]:
# Reshape from wide to long: the month columns are stacked into a "time" column
# and their values into a "patients" column
patients = patients.melt(
    id_vars = ['final_brand', 'territory'],
    var_name = "time",
    value_name = "patients",
)

In [13]:
# Cast to appropriate data type
patients["final_brand"] = patients["final_brand"].astype('category')
patients["territory"] = patients["territory"].astype('category')
# The month labels are hyphen-separated strings such as "2019-09-01", so the
# format has to use hyphens as well; a dotted format does not describe these
# strings and is rejected by pandas versions that match the format strictly.
patients["time"] = pd.to_datetime(patients["time"], format = '%Y-%m-%d')
patients["patients"] = patients["patients"].astype('float')

In [14]:
# Order the rows by brand, then territory, then time, and renumber the index from 0
sort_keys = ['final_brand', 'territory', 'time']
patients = patients.sort_values(by = sort_keys).reset_index(drop = True)

In [15]:
# Inspect the sorted long-format data frame
patients

Unnamed: 0,final_brand,territory,time,patients
0,BRAFTOVI-MEKTOVI,Blekinge ONCO,2019-09-01,0.0
1,BRAFTOVI-MEKTOVI,Blekinge ONCO,2019-10-01,0.0
2,BRAFTOVI-MEKTOVI,Blekinge ONCO,2019-11-01,0.0
3,BRAFTOVI-MEKTOVI,Blekinge ONCO,2019-12-01,0.0
4,BRAFTOVI-MEKTOVI,Blekinge ONCO,2020-01-01,0.0
...,...,...,...,...
3379,ZELBORAF_COTELLIC COMBO,Västra Götaland-Göteborg ONCO,2021-04-01,0.0
3380,ZELBORAF_COTELLIC COMBO,Västra Götaland-Göteborg ONCO,2021-05-01,0.0
3381,ZELBORAF_COTELLIC COMBO,Västra Götaland-Göteborg ONCO,2021-06-01,0.0
3382,ZELBORAF_COTELLIC COMBO,Västra Götaland-Göteborg ONCO,2021-07-01,0.0


In [16]:
# Save the prepared data frame
route0 = "../processed_data"

# Create the output folder when it is missing; `exist_ok` replaces the separate
# exists-then-create check, which could fail if the folder appeared in between
os.makedirs(route0, exist_ok = True)

print("saving file corresponding to patients.pkl")
patients.to_pickle(f"{route0}/patients.pkl")

# Read the file back to confirm the round trip.
# NOTE: unpickling can execute arbitrary code; only load pickles from trusted sources.
pd.read_pickle(f"{route0}/patients.pkl")

saving file corresponding to patients.pkl


Unnamed: 0,final_brand,territory,time,patients
0,BRAFTOVI-MEKTOVI,Blekinge ONCO,2019-09-01,0.0
1,BRAFTOVI-MEKTOVI,Blekinge ONCO,2019-10-01,0.0
2,BRAFTOVI-MEKTOVI,Blekinge ONCO,2019-11-01,0.0
3,BRAFTOVI-MEKTOVI,Blekinge ONCO,2019-12-01,0.0
4,BRAFTOVI-MEKTOVI,Blekinge ONCO,2020-01-01,0.0
...,...,...,...,...
3379,ZELBORAF_COTELLIC COMBO,Västra Götaland-Göteborg ONCO,2021-04-01,0.0
3380,ZELBORAF_COTELLIC COMBO,Västra Götaland-Göteborg ONCO,2021-05-01,0.0
3381,ZELBORAF_COTELLIC COMBO,Västra Götaland-Göteborg ONCO,2021-06-01,0.0
3382,ZELBORAF_COTELLIC COMBO,Västra Götaland-Göteborg ONCO,2021-07-01,0.0
