Imports

In [None]:
import re

import numpy as np
import pandas as pd

Loading the original data

In [None]:
# All three inputs are read relative to the current working directory; edit
# the file names below if the data lives in another folder.
data = pd.read_excel(io="opioids_data_original.xlsx")
data_sideeffects = pd.read_excel(io="sider_output.xlsx")
# Tab-separated file without a header row, so its columns get integer labels.
frequencies = pd.read_csv(filepath_or_buffer="meddra_freq.tsv", sep="\t", header=None)

Distribution of different outcome labels:

In [None]:
# Bar chart of how many rows carry each Outcome label.
data['Outcome'].value_counts().plot.bar()

In [None]:
# Relative frequency of each Outcome label, scaled to percentages.
data.Outcome.value_counts(normalize=True).mul(100)

Changing the 'Outcome' column values

In [None]:
# Short labels that replace the verbose outcome categories.
outcome_labels = {
    'Outcome niet ingevuld': 'Unknown',
    'Recovered/resolved': 'Recovered',
    'Not recovered/not resolved/ongoing': 'Ongoing',
    'Recovered/resolved with sequelae': 'Sequelae',
    'Recovering/resolving': 'Recovering',
}
data['Outcome'] = data['Outcome'].replace(outcome_labels)

# Same bar chart as before, now showing the shortened labels.
data['Outcome'].value_counts().plot.bar()

In [None]:
# Percentage breakdown of the Outcome labels.
data.Outcome.value_counts(normalize=True).mul(100)

Missing values

In [None]:
# Number of missing entries in each column.
data.isnull().sum()

Drop columns

In [None]:
# Columns removed from the working frame before any further cleaning.
columns_to_drop = [
    'Primary Source Description',
    'Status',
    'Category',
    'OutcomeCodeSystemVersion',
    'OutcomeText',
    'CultureID',
    'date_received',
    'summary',
    'narrative',
    'IsCurrent',
    'IsDefaultSOC',
]
data = data.drop(labels=columns_to_drop, axis=1)

Remove outliers

- Removal of instances with bodyweight == 0 
- Removal of instances with height == 0

In [None]:
# Keep only rows whose recorded weight and height are both non-zero.
# Missing values compare as "not equal to 0", so those rows are kept too.
weight_is_nonzero = data['BodyWeight'] != 0
height_is_nonzero = data['Height'] != 0
data = data[weight_is_nonzero & height_is_nonzero]

Normalising the `ATCText` and `ATCode` values

In [None]:
# Map the combination labels/codes onto the plain drug label/code so that both
# variants are treated as the same drug further on.
atc_text_map = {
    'TRAMADOL MET PARACETAMOL': 'TRAMADOL',
    'OXYCODON MET NALOXON': 'OXYCODON',
    'MORFINE, COMBINATIEPREPARATEN': 'MORFINE',
}
atc_code_map = {
    'N02AJ13': 'N02AX02',
    'N02AA51': 'N02AA01',
}

# Assign the result back to the column rather than calling
# `data[col].replace(..., inplace=True)`: that form mutates a temporary Series
# (chained assignment), which pandas warns about and which leaves `data`
# unchanged once Copy-on-Write is active.
data['ATCText'] = data['ATCText'].replace(atc_text_map).str.lower()
data['ATCode'] = data['ATCode'].replace(atc_code_map)

Strip punctuation, symbols and spaces from all text values (cells that consist only of such characters become empty strings)

In [None]:
# Characters removed from every text value. Each entry is a regular expression
# that matches exactly one character. Raw strings keep the backslashes without
# relying on invalid Python string escapes such as "\+", which newer Python
# versions warn about.
symbols1 = [
    "-", "_", r"\+", r"\?", "%", r"\*", r"\.", r"\,", r"\:", r"\;",
    r"\!", r"\@", r"\#", r"\$", r"\^", r"\&", r"\(", r"\)", r"\{", r"\}",
    r"\[", r"\]", r"\|", r"\/", r"\~", r"\`", r"\=", r"\<", r"\>", " ",
]

# Removing single characters one after another gives the same result as
# removing them all at once, so a single alternation replaces the 30 separate
# passes over both frames.
strip_pattern = "|".join(symbols1)

# NOTE(review): this strips spaces and "/" from *every* text column, including
# GenericDrugName, which a later cell splits on spaces and matches against
# tokens such as 'INJ/INFOPL'. Confirm that the cell order is intended.
data = data.replace(strip_pattern, "", regex=True)
data_sideeffects = data_sideeffects.replace(strip_pattern, "", regex=True)

Replace nan strings with NaN

In [None]:
# Spellings that stand for "no value" (duplicate entries removed).
symbols2 = ["NAN", "NaN", "None", "NaT", "NAT", "nat", "n/a", "N/A", "n.a.", "N.A."]

# Tokens are escaped so that "." is a literal dot rather than "any character".
# Lookarounds replace \b...\b because \b cannot match after a token that ends
# in punctuation, so a cell holding just "n.a." was never recognised.
placeholder_alternatives = "|".join(re.escape(token) for token in symbols2)

# The first branch covers cells made up only of whitespace. The original list
# carried " " for this, but wrapped in \b...\b it matched a space *between*
# two words and would have blanked any multi-word cell.
missing_pattern = rf"^\s+$|(?<!\w)(?:{placeholder_alternatives})(?!\w)"

# Because the replacement value is not a string, pandas replaces the WHOLE cell
# with NaN whenever the pattern is found anywhere inside it.
data = data.replace(missing_pattern, np.nan, regex=True)
data_sideeffects = data_sideeffects.replace(missing_pattern, np.nan, regex=True)

Merging

In [None]:
# Left join: every row of `data` is kept, and 'Side effect' / 'Frequency' are
# attached wherever all three key columns match a row of the side-effect table.
merge_keys = ['ATCode', 'ATCText', 'PTCode']
sideeffect_columns = merge_keys + ['Side effect', 'Frequency']
data = pd.merge(
    data,
    data_sideeffects[sideeffect_columns],
    how='left',
    on=merge_keys,
)

The 'reaction_impact' column presumably measures the impact the medicine has had on the patient. As shown before, there is no reaction_impact for fatalities. 
We can impute the missing values with the average `reaction_impact` of patients with similar features. 

To make this easier, we'll first create a column to bin the ages. We'll also create a BMI column and a weight group column.

In [None]:
# Age bands. pd.cut builds right-closed intervals, i.e. (18, 24], (24, 44], ...
# The band is stored as text rather than as a categorical.
age_bin_edges = [18, 24, 44, 64, 90]
data['age_group'] = pd.cut(x=data['age_year'], bins=age_bin_edges).astype(str)

# BMI = weight / height^2, with Height divided by 100 first
# (presumably cm -> m; confirm the units of the source columns).
height_scaled = data['Height'] / 100
data['BMI'] = data['BodyWeight'] / height_scaled**2

# Named BMI bands.
bmi_bin_edges = [0, 18.5, 25, 30, 50]
bmi_band_names = ['underweight', 'normal', 'overweight', 'obese']
data['weight_group'] = pd.cut(x=data['BMI'], bins=bmi_bin_edges, labels=bmi_band_names)

# Remove spaces from the case identifier and store PTCode as text.
case_ids = data['WorldwideUniqueCaseIdentification']
data['WorldwideUniqueCaseIdentification'] = case_ids.str.replace(" ", "")
data['PTCode'] = data['PTCode'].astype(str)

Imputing missing values

In [None]:
# Scale the frequency by 100 (presumably fraction -> percent; confirm).
frequency_scaled = data['Frequency'].mul(100)

# Exact zeros are first nudged to 0.0001 so that they stay distinguishable
# from the 0.0 written into rows that have no frequency at all.
data['Frequency'] = frequency_scaled.replace(0.0, 0.0001).fillna(0.0)

# True where the merge attached a side-effect entry to the row.
data['is_sideeffect'] = data['Side effect'].notnull()

In [None]:
# `reset_index` returns a new frame. Without the assignment the call had no
# effect and the rows kept the gaps left by the earlier filtering.
data = data.reset_index(drop=True)

# Show the column names of the cleaned frame.
data.columns

Derive two new columns, `Type` and `Dosage`, from the `GenericDrugName` column

In [None]:
# Every distinct space-separated token that occurs in GenericDrugName.
# NOTE(review): an earlier cell strips spaces and "/" from all text values, so
# by this point a name may be a single token and entries such as 'INJ/INFOPL'
# cannot match. Confirm the intended cell order.
gdn = list(data['GenericDrugName'].str.split(' ', expand=True).stack().unique())

# Tokens copied into the 'Type' column when they occur in a drug name.
# Renamed from `type`, which shadowed the builtin for the rest of the notebook.
type_tokens = ['CAPSULE', 'NEUSSPRAY', 'TABLET', 'PLEISTER', 'INJVLST', 'ZETPIL', 'DRANK', 'SPRAY', 'ZUIGTABLET', 'BRUISTABLET', 'INJ/INFOPL', 'INFVLST', 'DRUPPELS', 'SMELTTABLET', 'INJECTIE/INFUUS', 'DISPERTABLET', 'TAB', 'INJECTIEPOEDER']


def has_numbers(inputString):
    """Return True if ``inputString`` contains at least one decimal digit.

    Uses the ``re`` module, which must be imported in the notebook's import cell.
    """
    return bool(re.search(r'\d', inputString))


# Tokens containing a digit are the ones copied into the 'Dosage' column.
dosage = [token for token in gdn if has_numbers(token)]

# Sets give constant-time membership tests inside the per-row lambdas.
_type_lookup = set(type_tokens)
_dosage_lookup = set(dosage)

# Matching tokens are concatenated without a separator, so a name with several
# matches yields one fused string. Missing names become the text 'nan', which
# matches nothing and therefore yields ''.
data['Type'] = data['GenericDrugName'].apply(
    lambda name: ''.join(token for token in str(name).split() if token in _type_lookup)
)
data['Dosage'] = data['GenericDrugName'].apply(
    lambda name: ''.join(token for token in str(name).split() if token in _dosage_lookup)
)

In [None]:
# Renumber the rows 0..n-1 before exporting.
data = data.reset_index(
    drop=True,  # discard the old index instead of keeping it as a column
)

In [None]:
# Write the cleaned, merged frame to an Excel workbook in the working
# directory. The row index is written as well.
data.to_excel(excel_writer="opioid_datamerged.xlsx")