# Project for Analysis of Antidepressant Drugs' Adverse Events from the FDA Adverse Event Reporting System (FAERS), January 2019 to December 2023

## 2. CLEAN DATA

In [1]:
# load library
import pandas as pd
import numpy as np
import re

In [2]:
# CHECK DATA BEFORE CLEANING
# low_memory=False makes pandas infer each column's dtype from the whole file at once
# instead of chunk by chunk, which avoids mixed-dtype columns (and the DtypeWarning
# pandas emits for them) at the cost of more memory while reading.
data = pd.read_csv('FAERS_merged_data.csv', low_memory=False)

data.info()

  data = pd.read_csv('FAERS_merged_data.csv')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1904757 entries, 0 to 1904756
Data columns (total 54 columns):
 #   Column            Dtype  
---  ------            -----  
 0   primaryid         int64  
 1   caseid            int64  
 2   drug_seq          int64  
 3   role_cod          object 
 4   drugname          object 
 5   prod_ai           object 
 6   val_vbm           int64  
 7   route             object 
 8   dose_vbm          object 
 9   cum_dose_chr      float64
 10  cum_dose_unit     object 
 11  dechal            object 
 12  rechal            object 
 13  lot_num           object 
 14  exp_dt            float64
 15  nda_num           float64
 16  dose_amt          float64
 17  dose_unit         object 
 18  dose_form         object 
 19  dose_freq         object 
 20  indi_drug_seq     int64  
 21  indi_pt           object 
 22  dsg_drug_seq      float64
 23  start_dt          float64
 24  end_dt            float64
 25  dur               float64
 26  dur_cod       

In [3]:
# snapshot of data
data.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
primaryid,1904757.0,,,,391520321.016668,504412937.658664,39063042.0,159016213.0,184448831.0,218733853.0,2300387711.0
caseid,1904757.0,,,,17003711.165206,3591565.716194,3906304.0,15004798.0,17207571.0,19709683.0,23357481.0
drug_seq,1904757.0,,,,14.487911,23.461267,1.0,2.0,6.0,16.0,403.0
role_cod,1904757.0,4.0,C,669550.0,,,,,,,
drugname,1904757.0,6525.0,SERTRALINE,103479.0,,,,,,,
prod_ai,1891949.0,1016.0,SERTRALINE HYDROCHLORIDE,158070.0,,,,,,,
val_vbm,1904757.0,,,,1.005792,0.075887,1.0,1.0,1.0,1.0,2.0
route,1409233.0,34.0,Oral,745557.0,,,,,,,
dose_vbm,1326974.0,26031.0,UNK,276944.0,,,,,,,
cum_dose_chr,108093.0,,,,97449.501018,300018.504177,0.0,1080.0,10965.0,85078.75,8767200.0


In [4]:
# normalise text function
def normalize_text(text):
    """
    Normalise a single cell value for counting and comparison.

    - Missing value (None / NaN / NaT) -> '' (empty string)
    - Anything else -> converted to str, every character that is neither a
      word character nor whitespace is deleted, and the result is lower-cased.

    Punctuation is deleted rather than replaced by a space, so 'A/B' becomes
    'ab'. Whitespace is left untouched (no stripping or collapsing).
    """
    if pd.isnull(text):
        return ''

    # str() lets numeric or other non-string cells through instead of raising TypeError in re.sub
    return re.sub(r'[^\w\s]', '', str(text)).lower()

### 2.1 FILTER DATA BASED ON KEY FOCUS


#### 2.1.1. Filter role_cod

A single report can list several medications given to the same patient for any indication. To focus only on antidepressants and study their direct effects, we filter the data on role_cod and keep only the primary suspect drug.

Code for drug's reported role in event:
| Code | Meaning                |
|------|------------------------|
| PS   | Primary Suspect Drug   |
| SS   | Secondary Suspect Drug |
| C    | Concomitant            |
| I    | Interacting            |

In [5]:
# check if the drug is primary suspected or not -> if not, remove to focus only main suspect
data['role_cod'].value_counts()

C     669550
SS    663402
PS    337517
I     234288
Name: role_cod, dtype: int64

In [6]:
# keep only the rows where the drug is the primary suspect ('PS'),
# then spell the code out so the column is self-explanatory
is_primary_suspect = data['role_cod'] == 'PS'
data = data[is_primary_suspect]
data['role_cod'] = data['role_cod'].replace({'PS': 'primary suspect'})

data['role_cod'].value_counts()

primary suspect    337517
Name: role_cod, dtype: int64

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 337517 entries, 192 to 1904756
Data columns (total 54 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   primaryid         337517 non-null  int64  
 1   caseid            337517 non-null  int64  
 2   drug_seq          337517 non-null  int64  
 3   role_cod          337517 non-null  object 
 4   drugname          337517 non-null  object 
 5   prod_ai           337517 non-null  object 
 6   val_vbm           337517 non-null  int64  
 7   route             287348 non-null  object 
 8   dose_vbm          288750 non-null  object 
 9   cum_dose_chr      24663 non-null   float64
 10  cum_dose_unit     24643 non-null   object 
 11  dechal            278059 non-null  object 
 12  rechal            96368 non-null   object 
 13  lot_num           66701 non-null   object 
 14  exp_dt            7070 non-null    float64
 15  nda_num           312152 non-null  float64
 16  dose_amt         

#### 2.1.2 Filter event_dt & remove other date columns

We also focus only on adverse events that occurred from 01/2018 to 12/2023, so that trends before, during, and after Covid can be analysed => use event_dt for further analysis.

In [8]:
# check missing data of event_dt
missing_pattern = data[data['event_dt'].isnull()]

print(missing_pattern.describe(include='all'))

           primaryid        caseid       drug_seq         role_cod  \
count   1.394610e+05  1.394610e+05  139461.000000           139461   
unique           NaN           NaN            NaN                1   
top              NaN           NaN            NaN  primary suspect   
freq             NaN           NaN            NaN           139461   
mean    2.218843e+08  1.804951e+07       1.120593              NaN   
std     2.519622e+08  2.877664e+06       0.761686              NaN   
min     5.847994e+07  5.847994e+06       1.000000              NaN   
25%     1.599015e+08  1.589154e+07       1.000000              NaN   
50%     1.791568e+08  1.775410e+07       1.000000              NaN   
75%     2.063272e+08  2.034579e+07       1.000000              NaN   
max     2.188078e+09  2.335335e+07      30.000000              NaN   

          drugname                    prod_ai   val_vbm    route dose_vbm  \
count       139461                     139461  139461.0   116915   116682   
uniqu

In [9]:
# Check missing bias by drug
missing_drug_bias = missing_pattern['drugname'].value_counts()

print("Drug distribution in missing EVENT_DT:")
print(missing_drug_bias)


Drug distribution in missing EVENT_DT:
SERTRALINE                              10077
VENLAFAXINE                              7948
ESCITALOPRAM                             7100
FLUOXETINE                               6393
CITALOPRAM                               5903
                                        ...  
Fonksera                                    1
TRAMADOL TEVA                               1
RISPERIDONE 0.5mg                           1
ARIPIPRAZOLE TABLETS 2MG                    1
Fluoxetin-ratiopharm 20 mg Tabletten        1
Name: drugname, Length: 1129, dtype: int64


In [10]:
# Check missing bias by adverse events
missing_pt_bias = missing_pattern['pt'].value_counts()

print("Adverse events distribution in missing EVENT_DT:")
print(missing_pt_bias)

Adverse events distribution in missing EVENT_DT:
Drug ineffective              3832
Drug interaction              2871
Serotonin syndrome            2136
Off label use                 2121
Suicidal ideation             1970
                              ... 
Intervertebral disc injury       1
Adenoidal disorder               1
Psychotic behaviour              1
Meningitis listeria              1
Vomiting in pregnancy            1
Name: pt, Length: 3529, dtype: int64


In [11]:
# Three of the four most frequent terms among the rows with a missing event_dt are not
# adverse events themselves ('Serotonin syndrome', ranked 3rd, is a real AE and is kept)
# -> remove these 3 terms from the whole dataset, not only from the missing-event_dt rows
non_ae = ['Drug ineffective', 'Drug interaction', 'Off label use']

data = data[~data['pt'].isin(non_ae)]

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 322661 entries, 192 to 1904756
Data columns (total 54 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   primaryid         322661 non-null  int64  
 1   caseid            322661 non-null  int64  
 2   drug_seq          322661 non-null  int64  
 3   role_cod          322661 non-null  object 
 4   drugname          322661 non-null  object 
 5   prod_ai           322661 non-null  object 
 6   val_vbm           322661 non-null  int64  
 7   route             275481 non-null  object 
 8   dose_vbm          276433 non-null  object 
 9   cum_dose_chr      24030 non-null   float64
 10  cum_dose_unit     24010 non-null   object 
 11  dechal            266815 non-null  object 
 12  rechal            92664 non-null   object 
 13  lot_num           63603 non-null   object 
 14  exp_dt            6903 non-null    float64
 15  nda_num           297990 non-null  float64
 16  dose_amt         

In [12]:
import random
import numpy as np
import pandas as pd

# Function to handle partial dates
def complete_date(date):
    """
    Expand a numeric date (YYYY, YYYYMM or YYYYMMDD) into a 'YYYY-MM-DD' string.

    - Year only    -> 'YYYY-01-01' (first day of the year)
    - Year + Month -> 'YYYY-MM-01' (first day of the month)
    - Full date    -> 'YYYY-MM-DD'
    - Missing, non-numeric, or any other number of digits -> np.nan

    Every non-missing result has the same layout, so a following
    pd.to_datetime call sees one format for the whole column instead of a mix
    of 'YYYY' and 'YYYY-MM-DD' strings. The string is not checked for being a
    real calendar date (e.g. month 13); that is left to pd.to_datetime.
    """
    if pd.isnull(date):
        return np.nan
    try:
        digits = str(int(date))  # e.g. 20190515.0 -> '20190515'
    except (TypeError, ValueError, OverflowError):
        return np.nan  # free text, infinity, or another non-integer value
    if not digits.isdigit():  # rejects negative numbers ('-123')
        return np.nan
    if len(digits) == 4:  # Year only
        return f"{digits}-01-01"
    if len(digits) == 6:  # Year + Month
        return f"{digits[:4]}-{digits[4:6]}-01"
    if len(digits) == 8:  # Full date
        return f"{digits[:4]}-{digits[4:6]}-{digits[6:]}"
    return np.nan  # Handle invalid formats

# Parse every date column: expand partial dates to strings first,
# then convert to datetime (anything unparseable becomes NaT)
date_columns = ['exp_dt', 'start_dt', 'end_dt', 'event_dt', 'mfr_dt', 'init_fda_dt', 'fda_dt', 'rept_dt']
for col in date_columns:
    data[col] = pd.to_datetime(data[col].apply(complete_date), errors='coerce')

In [13]:
# Function to impute event_dt for each primaryid group
def impute_event_dt_for_primaryid(group):
    """
    Fill missing event_dt values inside one primaryid group.

    1. Row-wise fallback: a missing event_dt takes the first available value of
       mfr_dt, rept_dt, init_fda_dt (in that order) from the same row.
    2. Group-wise fallback: rows still missing take the first non-missing
       event_dt of the group, in row order.

    Existing event_dt values are never overwritten, so rows of one primaryid
    may still carry different dates. If no date exists anywhere in the group,
    event_dt stays missing.

    Parameters
    ----------
    group : DataFrame with columns event_dt, mfr_dt, rept_dt, init_fda_dt.

    Returns
    -------
    A new DataFrame with the filled event_dt column; `group` is not modified.
    """
    fallback = group['mfr_dt'].combine_first(group['rept_dt']).combine_first(group['init_fda_dt'])
    event_dt = group['event_dt'].fillna(fallback)

    known_dates = event_dt.dropna()
    if not known_dates.empty:
        event_dt = event_dt.fillna(known_dates.iloc[0])

    # assign() builds a new frame: a function passed to groupby.apply must not
    # mutate the group it receives
    return group.assign(event_dt=event_dt)

# Group by primaryid and fill missing event_dt within each group.
# group_keys=False keeps the original row index instead of adding primaryid as an index level.
data = data.groupby('primaryid', group_keys=False).apply(impute_event_dt_for_primaryid)

# Re-parse event_dt as datetime (unparseable values become NaT) so the column dtype is consistent
data['event_dt'] = pd.to_datetime(data['event_dt'], errors='coerce')

In [14]:
# Validate the remaining missing values
missing_event_dt = data['event_dt'].isnull().sum()
print(f"Remaining missing EVENT_DT after imputation: {missing_event_dt}")

Remaining missing EVENT_DT after imputation: 0


In [15]:
data['event_dt'].describe(include='all')

  data['event_dt'].describe(include='all')


count                  322661
unique                   3650
top       2003-01-01 00:00:00
freq                     9436
first     1959-09-04 00:00:00
last      2023-12-27 00:00:00
Name: event_dt, dtype: object

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 322661 entries, 192 to 1904756
Data columns (total 54 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   primaryid         322661 non-null  int64         
 1   caseid            322661 non-null  int64         
 2   drug_seq          322661 non-null  int64         
 3   role_cod          322661 non-null  object        
 4   drugname          322661 non-null  object        
 5   prod_ai           322661 non-null  object        
 6   val_vbm           322661 non-null  int64         
 7   route             275481 non-null  object        
 8   dose_vbm          276433 non-null  object        
 9   cum_dose_chr      24030 non-null   float64       
 10  cum_dose_unit     24010 non-null   object        
 11  dechal            266815 non-null  object        
 12  rechal            92664 non-null   object        
 13  lot_num           63603 non-null   object        
 14  e

In [17]:
# keep only events dated from 2018-01-01 through 2023-12-31 (both ends inclusive)
in_study_window = data['event_dt'].between('2018-01-01', '2023-12-31')
data = data[in_study_window]

In [18]:
# after filter, we only keep event_dt for further analysis, other dates we drop due to large missing values
drop_date_col = [col for col in data.columns if col.endswith('_dt') and col != 'event_dt']

data = data.drop(columns=drop_date_col)

print(drop_date_col)
# info() prints its report itself and returns None, so wrapping it in print() adds a stray "None" line
data.info()

['exp_dt', 'start_dt', 'end_dt', 'mfr_dt', 'init_fda_dt', 'fda_dt', 'rept_dt']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 248239 entries, 192 to 1904754
Data columns (total 47 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   primaryid         248239 non-null  int64         
 1   caseid            248239 non-null  int64         
 2   drug_seq          248239 non-null  int64         
 3   role_cod          248239 non-null  object        
 4   drugname          248239 non-null  object        
 5   prod_ai           248239 non-null  object        
 6   val_vbm           248239 non-null  int64         
 7   route             207758 non-null  object        
 8   dose_vbm          208795 non-null  object        
 9   cum_dose_chr      14812 non-null   float64       
 10  cum_dose_unit     14792 non-null   object        
 11  dechal            203244 non-null  object        
 12  rechal            70881 non-null

Remove columns from initial dataset: ['exp_dt', 'start_dt', 'end_dt', 'mfr_dt', 'init_fda_dt', 'fda_dt', 'rept_dt']

In [19]:
data.describe(include='all').T

  data.describe(include='all').T


Unnamed: 0,count,unique,top,freq,first,last,mean,std,min,25%,50%,75%,max
primaryid,248239.0,,,,NaT,NaT,220022549.950785,239742132.015691,58509932.0,163883434.5,181909251.0,206956311.0,2288369510.0
caseid,248239.0,,,,NaT,NaT,18351328.618489,2668815.57536,5850993.0,16351423.0,18108602.0,20491761.0,23357481.0
drug_seq,248239.0,,,,NaT,NaT,1.117911,0.91429,1.0,1.0,1.0,1.0,33.0
role_cod,248239.0,1.0,primary suspect,248239.0,NaT,NaT,,,,,,,
drugname,248239.0,2161.0,SERTRALINE,22691.0,NaT,NaT,,,,,,,
prod_ai,248239.0,357.0,SERTRALINE HYDROCHLORIDE,32726.0,NaT,NaT,,,,,,,
val_vbm,248239.0,,,,NaT,NaT,1.0,0.0,1.0,1.0,1.0,1.0,1.0
route,207758.0,23.0,Oral,119455.0,NaT,NaT,,,,,,,
dose_vbm,208795.0,10019.0,UNK,28793.0,NaT,NaT,,,,,,,
cum_dose_chr,14812.0,,,,NaT,NaT,15235.404346,75837.209386,0.0,75.0,370.5,2840.0,1398375.0


#### 2.1.3 Filter indi_pt

Because many reports list indications other than depression, we keep only the depression-related indications (including antidepressant therapy and depressive-type psychotic disorders) for our analysis.

In [20]:
# check any irrelevant indication
data['indi_pt'].value_counts()

depression                                                   186674
major depression                                              35209
antidepressant therapy                                         7395
depressed mood                                                 4575
mixed anxiety and depressive disorder                          4476
depressive symptom                                             2464
perinatal depression                                           1834
depression suicidal                                            1602
adjustment disorder with depressed mood                        1023
persistent depressive disorder                                 1008
schizoaffective disorder depressive type                        906
torsade de pointes                                              319
agitated depression                                             182
adjustment disorder with mixed anxiety and depressed mood       177
depressed level of consciousness                

In [21]:
# keep only the depression-related indications listed below;
# rows with any other indi_pt value are dropped
focused_indi = ['depression', 'major depression', 'antidepressant therapy', 'depressed mood',
                'mixed anxiety and depressive disorder', 'depressive symptom','perinatal depression', 'depression suicidal', 
                'adjustment disorder with depressed mood', 'persistent depressive disorder', 'schizoaffective disorder depressive type', 
                'agitated depression','adjustment disorder with mixed anxiety and depressed mood', 'depressive delusion',
                'post stroke depression', 'childhood depression', 'menopausal depression', 'antidepressant drug level',
                'antidepressant discontinuation syndrome', 'depression postoperative']
    
data = data[data['indi_pt'].isin(focused_indi)]

data['indi_pt'].value_counts()

depression                                                   186674
major depression                                              35209
antidepressant therapy                                         7395
depressed mood                                                 4575
mixed anxiety and depressive disorder                          4476
depressive symptom                                             2464
perinatal depression                                           1834
depression suicidal                                            1602
adjustment disorder with depressed mood                        1023
persistent depressive disorder                                 1008
schizoaffective disorder depressive type                        906
agitated depression                                             182
adjustment disorder with mixed anxiety and depressed mood       177
depressive delusion                                              73
post stroke depression                          

In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247733 entries, 192 to 1904754
Data columns (total 47 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   primaryid         247733 non-null  int64         
 1   caseid            247733 non-null  int64         
 2   drug_seq          247733 non-null  int64         
 3   role_cod          247733 non-null  object        
 4   drugname          247733 non-null  object        
 5   prod_ai           247733 non-null  object        
 6   val_vbm           247733 non-null  int64         
 7   route             207290 non-null  object        
 8   dose_vbm          208350 non-null  object        
 9   cum_dose_chr      14808 non-null   float64       
 10  cum_dose_unit     14788 non-null   object        
 11  dechal            202785 non-null  object        
 12  rechal            70582 non-null   object        
 13  lot_num           52539 non-null   object        
 14  n

### 2.2 CHECK DATA - DRUG

#### 2.2.1 Check drugname, val_vbm
There are various drug names on the market from different manufacturers. We focus only on the adverse events of the product active ingredient => remove the drugname and val_vbm columns.

In [23]:
# drop the columns 'drugname' and 'val_vbm'
data = data.drop(columns=['drugname', 'val_vbm'])

# info() prints its report itself and returns None, so wrapping it in print() adds a stray "None" line
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247733 entries, 192 to 1904754
Data columns (total 45 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   primaryid         247733 non-null  int64         
 1   caseid            247733 non-null  int64         
 2   drug_seq          247733 non-null  int64         
 3   role_cod          247733 non-null  object        
 4   prod_ai           247733 non-null  object        
 5   route             207290 non-null  object        
 6   dose_vbm          208350 non-null  object        
 7   cum_dose_chr      14808 non-null   float64       
 8   cum_dose_unit     14788 non-null   object        
 9   dechal            202785 non-null  object        
 10  rechal            70582 non-null   object        
 11  lot_num           52539 non-null   object        
 12  nda_num           226701 non-null  float64       
 13  dose_amt          166469 non-null  float64       
 14  d

Remove columns from initial dataset:
['exp_dt', 'start_dt', 'end_dt', 'mfr_dt', 'init_fda_dt', 'fda_dt', 'rept_dt', 'drugname', 'val_vbm']

#### 2.2.2 Check prod_ai

In [24]:
data['prod_ai'] = data['prod_ai'].apply(normalize_text)

In [25]:
# check prod_ai
prod_ai_check = data['prod_ai'].value_counts()
print(prod_ai_check)

# prod_ai_check_df = prod_ai_check.reset_index()
# prod_ai_check_df.columns = ['prod_ai', 'count']
# prod_ai_check_df.to_csv('prod_ai_check.csv', index=False) => export file for manually mapping

sertraline hydrochloride         32705
venlafaxine hydrochloride        27290
escitalopram oxalate             16083
fluoxetine hydrochloride         14337
esketamine                       12993
                                 ...  
amitriptylinechlordiazepoxide        1
fremanezumabvfrm                     1
perphenazine                         1
metoprolol tartrate                  1
propylhexedrine                      1
Name: prod_ai, Length: 330, dtype: int64


In [26]:
# drop the rows whose prod_ai is the placeholder 'unspecified ingredient'
# (compared after trimming surrounding whitespace and lower-casing)
data = data[data['prod_ai'].str.strip().str.lower() != 'unspecified ingredient']

data['prod_ai'].value_counts()

sertraline hydrochloride                               32705
venlafaxine hydrochloride                              27290
escitalopram oxalate                                   16083
fluoxetine hydrochloride                               14337
esketamine                                             12993
                                                       ...  
estradiol hemihydratenorethindrone acetaterelugolix        1
baclofen                                                   1
amitriptylinechlordiazepoxide                              1
perphenazine                                               1
propylhexedrine                                            1
Name: prod_ai, Length: 329, dtype: int64

In [27]:
# load prod_ai mapping file after manually reviewing and check prod_ai with WebMD website
prod_ai_map = pd.read_excel('prod_ai_map.xlsx')

prod_ai_map.head()

Unnamed: 0,prod_ai,prod_ai_cleaned,webmd_group,webmd_ad
0,sertraline hydrochloride,sertraline,selective serotonin reuptake inhibitors,antidepressants
1,venlafaxine hydrochloride,venlafaxine,serotonin and norepinephrine reuptake inhibitors,antidepressants
2,escitalopram oxalate,escitalopram,selective serotonin reuptake inhibitors,antidepressants
3,fluoxetine hydrochloride,fluoxetine,selective serotonin reuptake inhibitors,antidepressants
4,esketamine,esketamine,n-methyl d-aspartate antagonists,antidepressants


In [28]:
# merge prod_ai_cleaned, webmd_group, webmd_ad into data
# normalise the join key on both sides so that case / surrounding whitespace cannot break the match
data['prod_ai'] = data['prod_ai'].str.strip().str.lower()
prod_ai_map['prod_ai'] = prod_ai_map['prod_ai'].str.strip().str.lower()

mapping_columns = ['prod_ai', 'prod_ai_cleaned', 'webmd_group', 'webmd_ad']
prod_ai_map = prod_ai_map[mapping_columns]

# validate='many_to_one' raises a MergeError if a prod_ai occurs more than once in the
# mapping; without it, a duplicated key would silently duplicate rows of data in the left join
data = pd.merge(data, prod_ai_map, on='prod_ai', how='left', validate='many_to_one')

print(data.head())

   primaryid   caseid  drug_seq         role_cod                   prod_ai  \
0   74189024  7418902         1  primary suspect  sertraline hydrochloride   
1   74189024  7418902         1  primary suspect  sertraline hydrochloride   
2   74189024  7418902         1  primary suspect  sertraline hydrochloride   
3   74189024  7418902         1  primary suspect  sertraline hydrochloride   
4   74189024  7418902         1  primary suspect  sertraline hydrochloride   

     route          dose_vbm  cum_dose_chr cum_dose_unit dechal  ... occp_cod  \
0  Unknown  50 milligram, qd           NaN           NaN      Y  ...       MD   
1  Unknown  50 milligram, qd           NaN           NaN      Y  ...       MD   
2  Unknown  50 milligram, qd           NaN           NaN      Y  ...       MD   
3  Unknown  50 milligram, qd           NaN           NaN      Y  ...       MD   
4  Unknown  50 milligram, qd           NaN           NaN      Y  ...       MD   

  reporter_country  occr_country            

In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247721 entries, 0 to 247720
Data columns (total 48 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   primaryid         247721 non-null  int64         
 1   caseid            247721 non-null  int64         
 2   drug_seq          247721 non-null  int64         
 3   role_cod          247721 non-null  object        
 4   prod_ai           247721 non-null  object        
 5   route             207289 non-null  object        
 6   dose_vbm          208349 non-null  object        
 7   cum_dose_chr      14807 non-null   float64       
 8   cum_dose_unit     14787 non-null   object        
 9   dechal            202774 non-null  object        
 10  rechal            70582 non-null   object        
 11  lot_num           52531 non-null   object        
 12  nda_num           226700 non-null  float64       
 13  dose_amt          166463 non-null  float64       
 14  dose

In [30]:
# replace 'prod_ai' with 'prod_ai_cleaned'
data['prod_ai'] = data['prod_ai_cleaned']

# remove 'prod_ai_cleaned'
data = data.drop(columns=['prod_ai_cleaned'])

# double-check
# info() prints its report itself and returns None, so wrapping it in print() adds a stray "None" line
data.info()
print(data['prod_ai'].value_counts())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247721 entries, 0 to 247720
Data columns (total 47 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   primaryid         247721 non-null  int64         
 1   caseid            247721 non-null  int64         
 2   drug_seq          247721 non-null  int64         
 3   role_cod          247721 non-null  object        
 4   prod_ai           247721 non-null  object        
 5   route             207289 non-null  object        
 6   dose_vbm          208349 non-null  object        
 7   cum_dose_chr      14807 non-null   float64       
 8   cum_dose_unit     14787 non-null   object        
 9   dechal            202774 non-null  object        
 10  rechal            70582 non-null   object        
 11  lot_num           52531 non-null   object        
 12  nda_num           226700 non-null  float64       
 13  dose_amt          166463 non-null  float64       
 14  dose

Remove columns from initial dataset:
['exp_dt', 'start_dt', 'end_dt', 'mfr_dt', 'init_fda_dt', 'fda_dt', 'rept_dt', 'drugname', 'val_vbm', 'prod_ai']

#### 2.2.3 Check route, dose_vbm, cum_dose_chr, cum_dose_unit

In [31]:
data['route'] = data['route'].apply(normalize_text)

data['route'].value_counts()

oral                                   119271
unknown                                 74251
                                        40432
transplacental                           9995
intravenous not otherwise specified       811
nasal                                     757
intramuscular                             673
transdermal                               409
transmammary                              240
other                                     189
sublingual                                164
respiratory inhalation                    160
endocervical                              148
subcutaneous                               82
vaginal                                    42
intravenous drip                           31
buccal                                     29
parenteral                                  9
intrathoracic                               8
intrauterine                                7
rectal                                      6
ophthalmic                        

In [32]:
# route seems not relevant to adverse events => remove
data = data.drop(columns=['route'])

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247721 entries, 0 to 247720
Data columns (total 46 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   primaryid         247721 non-null  int64         
 1   caseid            247721 non-null  int64         
 2   drug_seq          247721 non-null  int64         
 3   role_cod          247721 non-null  object        
 4   prod_ai           247721 non-null  object        
 5   dose_vbm          208349 non-null  object        
 6   cum_dose_chr      14807 non-null   float64       
 7   cum_dose_unit     14787 non-null   object        
 8   dechal            202774 non-null  object        
 9   rechal            70582 non-null   object        
 10  lot_num           52531 non-null   object        
 11  nda_num           226700 non-null  float64       
 12  dose_amt          166463 non-null  float64       
 13  dose_unit         166462 non-null  object        
 14  dose

Remove columns from initial dataset:
['exp_dt', 'start_dt', 'end_dt', 'mfr_dt', 'init_fda_dt', 'fda_dt', 'rept_dt', 'drugname', 'val_vbm', 'prod_ai', 'route']

In [33]:
data['dose_vbm'] = data['dose_vbm'].apply(normalize_text)

data['dose_vbm'].value_counts()

                                                                       39601
unk                                                                    28792
          quantity1 tablets                                             2965
10 mg qd                                                                2954
20 mg qd                                                                2904
                                                                       ...  
most recent 27aug2019 med kit 287600 1 2                                   1
075 mg 1 day                                                               1
100 mg twice a day one in the morning and one in the late afternoon        1
med kit number100000935984 and 100000935985                                1
45120 mgdaily                                                              1
Name: dose_vbm, Length: 8493, dtype: int64

In [34]:
data['cum_dose_chr'].describe(include='all')

count    1.480700e+04
mean     1.523052e+04
std      7.584169e+04
min      0.000000e+00
25%      7.500000e+01
50%      3.728570e+02
75%      2.840000e+03
max      1.398375e+06
Name: cum_dose_chr, dtype: float64

In [35]:
data['cum_dose_unit'] = data['cum_dose_unit'].apply(normalize_text)

data['cum_dose_unit'].value_counts()

        232934
mg       13022
df        1271
gtt        176
g          152
ml         117
ug          27
gm          14
umol         4
ul           4
Name: cum_dose_unit, dtype: int64

In [36]:
# after checking data, recommend to not use these columns for further analysis
# because the author has not enough knowledge to clean the data based on drugs dose
data = data.drop(columns=['dose_vbm', 'cum_dose_chr', 'cum_dose_unit'])

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247721 entries, 0 to 247720
Data columns (total 43 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   primaryid         247721 non-null  int64         
 1   caseid            247721 non-null  int64         
 2   drug_seq          247721 non-null  int64         
 3   role_cod          247721 non-null  object        
 4   prod_ai           247721 non-null  object        
 5   dechal            202774 non-null  object        
 6   rechal            70582 non-null   object        
 7   lot_num           52531 non-null   object        
 8   nda_num           226700 non-null  float64       
 9   dose_amt          166463 non-null  float64       
 10  dose_unit         166462 non-null  object        
 11  dose_form         113880 non-null  object        
 12  dose_freq         94137 non-null   object        
 13  indi_drug_seq     247721 non-null  int64         
 14  indi

Remove columns from initial dataset:

['exp_dt', 'start_dt', 'end_dt', 'mfr_dt', 'init_fda_dt', 'fda_dt', 'rept_dt', 'drugname', 'val_vbm', 'prod_ai', 'route',
'dose_vbm', 'cum_dose_chr', 'cum_dose_unit']

#### 2.2.4 Check dechal, rechal

In [37]:
data['dechal'] = data['dechal'].apply(normalize_text)

data['dechal'].value_counts()

u    77402
y    73070
     44947
d    34335
n    17967
Name: dechal, dtype: int64

In [38]:
# normalize the rechal values with normalize_text (helper defined earlier in the notebook),
# then count how often each value occurs
data['rechal'] = data['rechal'].apply(normalize_text)

data['rechal'].value_counts()

     177139
u     52713
d     11539
y      4604
n      1726
Name: rechal, dtype: int64

In [39]:
# Both columns are dominated by blank, 'u' and 'd' values (see the counts above),
# so they carry too little usable information => remove them.
data = data.drop(['dechal', 'rechal'], axis='columns')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247721 entries, 0 to 247720
Data columns (total 41 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   primaryid         247721 non-null  int64         
 1   caseid            247721 non-null  int64         
 2   drug_seq          247721 non-null  int64         
 3   role_cod          247721 non-null  object        
 4   prod_ai           247721 non-null  object        
 5   lot_num           52531 non-null   object        
 6   nda_num           226700 non-null  float64       
 7   dose_amt          166463 non-null  float64       
 8   dose_unit         166462 non-null  object        
 9   dose_form         113880 non-null  object        
 10  dose_freq         94137 non-null   object        
 11  indi_drug_seq     247721 non-null  int64         
 12  indi_pt           247721 non-null  object        
 13  dsg_drug_seq      126086 non-null  float64       
 14  dur 

Remove columns from initial dataset:

['exp_dt', 'start_dt', 'end_dt', 'mfr_dt', 'init_fda_dt', 'fda_dt', 'rept_dt', 'drugname', 'val_vbm', 'prod_ai', 'route',
'dose_vbm', 'cum_dose_chr', 'cum_dose_unit', 'dechal', 'rechal']

#### 2.2.5 Check lot_num and nda_num

In [40]:
# lot_num and nda_num do not contribute to the analysis => remove both
data = data.drop(['lot_num', 'nda_num'], axis='columns')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247721 entries, 0 to 247720
Data columns (total 39 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   primaryid         247721 non-null  int64         
 1   caseid            247721 non-null  int64         
 2   drug_seq          247721 non-null  int64         
 3   role_cod          247721 non-null  object        
 4   prod_ai           247721 non-null  object        
 5   dose_amt          166463 non-null  float64       
 6   dose_unit         166462 non-null  object        
 7   dose_form         113880 non-null  object        
 8   dose_freq         94137 non-null   object        
 9   indi_drug_seq     247721 non-null  int64         
 10  indi_pt           247721 non-null  object        
 11  dsg_drug_seq      126086 non-null  float64       
 12  dur               23306 non-null   float64       
 13  dur_cod           23302 non-null   object        
 14  case

Remove columns from initial dataset:

['exp_dt', 'start_dt', 'end_dt', 'mfr_dt', 'init_fda_dt', 'fda_dt', 'rept_dt', 'drugname', 'val_vbm', 'prod_ai', 'route',
'dose_vbm', 'cum_dose_chr', 'cum_dose_unit', 'dechal', 'rechal', 'lot_num', 'nda_num']

#### 2.2.6 Check dose_amt, dose_unit, dose_form, dose_freq

In [41]:
# summary statistics of dose_amt (the `include` argument is ignored for a Series, so it is omitted)
data['dose_amt'].describe()

count    166463.000000
mean        124.609179
std         596.399349
min           0.000000
25%          10.000000
50%          37.500000
75%          90.000000
max       14100.000000
Name: dose_amt, dtype: float64

In [42]:
# normalize the dose_unit values with normalize_text (helper defined earlier in the notebook),
# then count how often each unit occurs
# NOTE(review): values such as 'mgkg' / 'mgm2' in the counts suggest normalize_text strips
# punctuation, so compound units lose their separator - confirm before relying on them
data['dose_unit'] = data['dose_unit'].apply(normalize_text)

data['dose_unit'].value_counts()

mg      154731
         81259
df        8735
g         1154
gm         761
gtt        277
ml         256
mgm2       156
mgkg       142
ug         105
ng          50
iu          31
kiu         30
mgmg        15
ugkg         9
umol         4
ul           4
tot          1
mgml         1
Name: dose_unit, dtype: int64

In [43]:
# normalize the dose_form values with normalize_text (helper defined earlier in the notebook),
# then count how often each form occurs
data['dose_form'] = data['dose_form'].apply(normalize_text)

data['dose_form'].value_counts()

                                      133862
tablet                                 52377
nasal spray                            12487
filmcoated tablet                       9168
prolongedrelease tablet                 8743
                                       ...  
tablet cr                                  1
powder for injection                       1
intrauterine delivery system               1
controlledrelease tablet  extended         1
concentrate for oral solution              1
Name: dose_form, Length: 126, dtype: int64

In [44]:
# normalize the dose_freq values with normalize_text (helper defined earlier in the notebook),
# then count how often each frequency code occurs
data['dose_freq'] = data['dose_freq'].apply(normalize_text)

data['dose_freq'].value_counts()

         153584
qd        80881
bid        5779
hs         2213
biw         950
tid         948
wk          917
qow         448
qid         402
q12h        361
qod         241
1x          204
month       157
999         146
q8h         112
q6h          99
q4h          74
hr           70
prn          64
qw           41
qm           10
ud            7
tiw           7
cycle         3
min           2
q3w           1
Name: dose_freq, dtype: int64

In [45]:
# Decision after inspecting the dose columns above: leave them out of the analysis,
# because cleaning drug dose information needs domain knowledge the author lacks.
data = data.drop(
    columns=[
        'dose_amt',
        'dose_unit',
        'dose_form',
        'dose_freq',
    ]
)

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247721 entries, 0 to 247720
Data columns (total 35 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   primaryid         247721 non-null  int64         
 1   caseid            247721 non-null  int64         
 2   drug_seq          247721 non-null  int64         
 3   role_cod          247721 non-null  object        
 4   prod_ai           247721 non-null  object        
 5   indi_drug_seq     247721 non-null  int64         
 6   indi_pt           247721 non-null  object        
 7   dsg_drug_seq      126086 non-null  float64       
 8   dur               23306 non-null   float64       
 9   dur_cod           23302 non-null   object        
 10  caseversion       247721 non-null  int64         
 11  i_f_code          247721 non-null  object        
 12  event_dt          247721 non-null  datetime64[ns]
 13  rept_cod          247721 non-null  object        
 14  auth

Remove columns from initial dataset:

DRUG = ['exp_dt', 'start_dt', 'end_dt', 'mfr_dt', 'init_fda_dt', 'fda_dt', 'rept_dt', 'drugname', 'val_vbm', 'prod_ai', 'route',
'dose_vbm', 'cum_dose_chr', 'cum_dose_unit', 'dechal', 'rechal', 'lot_num', 'nda_num', 'dose_amt', 'dose_unit', 'dose_form', 'dose_freq']

#### 2.2.7 Check drug_seq

In [46]:
# drug_seq was only needed as a merge key; it has no analytical value => remove
data = data.drop('drug_seq', axis='columns')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247721 entries, 0 to 247720
Data columns (total 34 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   primaryid         247721 non-null  int64         
 1   caseid            247721 non-null  int64         
 2   role_cod          247721 non-null  object        
 3   prod_ai           247721 non-null  object        
 4   indi_drug_seq     247721 non-null  int64         
 5   indi_pt           247721 non-null  object        
 6   dsg_drug_seq      126086 non-null  float64       
 7   dur               23306 non-null   float64       
 8   dur_cod           23302 non-null   object        
 9   caseversion       247721 non-null  int64         
 10  i_f_code          247721 non-null  object        
 11  event_dt          247721 non-null  datetime64[ns]
 12  rept_cod          247721 non-null  object        
 13  auth_num          55123 non-null   object        
 14  mfr_

### 2.3 CHECK DATA - INDI

#### Check indi_drug_seq

In [47]:
# indi_drug_seq was only needed as a merge key; it has no analytical value => remove
data = data.drop('indi_drug_seq', axis='columns')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247721 entries, 0 to 247720
Data columns (total 33 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   primaryid         247721 non-null  int64         
 1   caseid            247721 non-null  int64         
 2   role_cod          247721 non-null  object        
 3   prod_ai           247721 non-null  object        
 4   indi_pt           247721 non-null  object        
 5   dsg_drug_seq      126086 non-null  float64       
 6   dur               23306 non-null   float64       
 7   dur_cod           23302 non-null   object        
 8   caseversion       247721 non-null  int64         
 9   i_f_code          247721 non-null  object        
 10  event_dt          247721 non-null  datetime64[ns]
 11  rept_cod          247721 non-null  object        
 12  auth_num          55123 non-null   object        
 13  mfr_num           228455 non-null  object        
 14  mfr_

Remove columns from initial dataset:

DRUG = ['drug_seq', 'drugname', 'prod_ai' - *keep 'prod_ai_cleaned'*, 'val_vbm', 'route', 'dose_vbm', 'cum_dose_chr', 'cum_dose_unit',
'dechal', 'rechal', 'lot_num', 'exp_dt', 'nda_num', 'dose_amt', 'dose_unit', 'dose_form', 'dose_freq']

INDI = ['indi_drug_seq']

'start_dt', 'end_dt', 'mfr_dt', 'init_fda_dt', 'fda_dt', 'rept_dt',

### 2.4 CHECK DATA - THER

#### 2.4.1 Check dsg_drug_seq

In [48]:
# dsg_drug_seq was only needed as a merge key; it has no analytical value => remove
data = data.drop('dsg_drug_seq', axis='columns')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247721 entries, 0 to 247720
Data columns (total 32 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   primaryid         247721 non-null  int64         
 1   caseid            247721 non-null  int64         
 2   role_cod          247721 non-null  object        
 3   prod_ai           247721 non-null  object        
 4   indi_pt           247721 non-null  object        
 5   dur               23306 non-null   float64       
 6   dur_cod           23302 non-null   object        
 7   caseversion       247721 non-null  int64         
 8   i_f_code          247721 non-null  object        
 9   event_dt          247721 non-null  datetime64[ns]
 10  rept_cod          247721 non-null  object        
 11  auth_num          55123 non-null   object        
 12  mfr_num           228455 non-null  object        
 13  mfr_sndr          247721 non-null  object        
 14  lit_

Remove columns from initial dataset:

DRUG = ['drug_seq', 'drugname', 'prod_ai' - *keep 'prod_ai_cleaned'*, 'val_vbm', 'route', 'dose_vbm', 'cum_dose_chr', 'cum_dose_unit',
'dechal', 'rechal', 'lot_num', 'exp_dt', 'nda_num', 'dose_amt', 'dose_unit', 'dose_form', 'dose_freq']

INDI = ['indi_drug_seq']

THER = ['dsg_drug_seq', 'start_dt', 'end_dt']

'mfr_dt', 'init_fda_dt', 'fda_dt', 'rept_dt',

#### 2.4.2 Check dur, dur_cod

In [49]:
# summary statistics of dur (the `include` argument is ignored for a Series, so it is omitted)
data['dur'].describe()

count    23306.000000
mean       134.431511
std        511.206258
min          0.000000
25%          3.000000
50%         15.000000
75%        129.750000
max      13850.000000
Name: dur, dtype: float64

In [50]:
# normalize the dur_cod values with normalize_text (helper defined earlier in the notebook),
# then count how often each duration unit occurs
data['dur_cod'] = data['dur_cod'].apply(normalize_text)

data['dur_cod'].value_counts()

       224419
day     16136
yr       3161
mon      2599
wk       1229
hr        106
min        71
Name: dur_cod, dtype: int64

In [51]:
# dur and dur_cod would be useful for further analysis, but about 90% of the rows
# have no value for them => suggest removing both
data = data.drop(['dur', 'dur_cod'], axis='columns')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247721 entries, 0 to 247720
Data columns (total 30 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   primaryid         247721 non-null  int64         
 1   caseid            247721 non-null  int64         
 2   role_cod          247721 non-null  object        
 3   prod_ai           247721 non-null  object        
 4   indi_pt           247721 non-null  object        
 5   caseversion       247721 non-null  int64         
 6   i_f_code          247721 non-null  object        
 7   event_dt          247721 non-null  datetime64[ns]
 8   rept_cod          247721 non-null  object        
 9   auth_num          55123 non-null   object        
 10  mfr_num           228455 non-null  object        
 11  mfr_sndr          247721 non-null  object        
 12  lit_ref           39442 non-null   object        
 13  age               191783 non-null  float64       
 14  age_

Remove columns from initial dataset:

DRUG = ['drug_seq', 'drugname', 'prod_ai' - *keep 'prod_ai_cleaned'*, 'val_vbm', 'route', 'dose_vbm', 'cum_dose_chr', 'cum_dose_unit',
'dechal', 'rechal', 'lot_num', 'exp_dt', 'nda_num', 'dose_amt', 'dose_unit', 'dose_form', 'dose_freq']

INDI = ['indi_drug_seq']

THER = ['dsg_drug_seq', 'start_dt', 'end_dt', 'dur', 'dur_cod']

'mfr_dt', 'init_fda_dt', 'fda_dt', 'rept_dt',

### 2.5 CHECK DATA - DEMO

#### 2.5.1 Check caseversion

In [52]:
# summary statistics of caseversion (the `include` argument is ignored for a Series, so it is omitted)
data['caseversion'].describe()

count    247721.000000
mean          2.120737
std           3.000837
min           1.000000
25%           1.000000
50%           1.000000
75%           2.000000
max          49.000000
Name: caseversion, dtype: float64

In [53]:
# put the three identifier columns side by side to compare them
check_col = data.loc[:, ['primaryid', 'caseid', 'caseversion']]

check_col.head(20)

Unnamed: 0,primaryid,caseid,caseversion
0,74189024,7418902,4
1,74189024,7418902,4
2,74189024,7418902,4
3,74189024,7418902,4
4,74189024,7418902,4
5,74189024,7418902,4
6,74189024,7418902,4
7,74189024,7418902,4
8,109174139,10917413,9
9,109174139,10917413,9


In [54]:
# caseid and caseversion are both reflected in primaryid and were only used for merging
# => remove these 2 columns
data = data.drop(['caseid', 'caseversion'], axis='columns')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247721 entries, 0 to 247720
Data columns (total 28 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   primaryid         247721 non-null  int64         
 1   role_cod          247721 non-null  object        
 2   prod_ai           247721 non-null  object        
 3   indi_pt           247721 non-null  object        
 4   i_f_code          247721 non-null  object        
 5   event_dt          247721 non-null  datetime64[ns]
 6   rept_cod          247721 non-null  object        
 7   auth_num          55123 non-null   object        
 8   mfr_num           228455 non-null  object        
 9   mfr_sndr          247721 non-null  object        
 10  lit_ref           39442 non-null   object        
 11  age               191783 non-null  float64       
 12  age_cod           191859 non-null  object        
 13  age_grp           35539 non-null   object        
 14  sex 

Remove columns from initial dataset:

DRUG = ['drug_seq', 'drugname', 'prod_ai' - *keep 'prod_ai_cleaned'*, 'val_vbm', 'route', 'dose_vbm', 'cum_dose_chr', 'cum_dose_unit',
'dechal', 'rechal', 'lot_num', 'exp_dt', 'nda_num', 'dose_amt', 'dose_unit', 'dose_form', 'dose_freq']

INDI = ['indi_drug_seq']

THER = ['dsg_drug_seq', 'start_dt', 'end_dt', 'dur', 'dur_cod']

DEMO = ['caseid', 'caseversion', 'mfr_dt', 'init_fda_dt', 'fda_dt']

'rept_dt',

#### 2.5.2 Check i_f_code

In [55]:
# normalize the i_f_code values with normalize_text (helper defined earlier in the notebook),
# then count how often each code occurs
data['i_f_code'] = data['i_f_code'].apply(normalize_text)

data['i_f_code'].value_counts()

i    156431
f     91290
Name: i_f_code, dtype: int64

In [56]:
# i_f_code is not helpful for the further analysis => remove
data = data.drop('i_f_code', axis='columns')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247721 entries, 0 to 247720
Data columns (total 27 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   primaryid         247721 non-null  int64         
 1   role_cod          247721 non-null  object        
 2   prod_ai           247721 non-null  object        
 3   indi_pt           247721 non-null  object        
 4   event_dt          247721 non-null  datetime64[ns]
 5   rept_cod          247721 non-null  object        
 6   auth_num          55123 non-null   object        
 7   mfr_num           228455 non-null  object        
 8   mfr_sndr          247721 non-null  object        
 9   lit_ref           39442 non-null   object        
 10  age               191783 non-null  float64       
 11  age_cod           191859 non-null  object        
 12  age_grp           35539 non-null   object        
 13  sex               226104 non-null  object        
 14  e_su

#### 2.5.3 Check rept_cod, auth_num, mfr_num, mfr_sndr, lit_ref

In [57]:
# normalize the rept_cod values with normalize_text (helper defined earlier in the notebook),
# then count how often each report code occurs
data['rept_cod'] = data['rept_cod'].apply(normalize_text)

data['rept_cod'].value_counts()

exp      198924
per       29507
dir       19266
5day         18
30day         6
Name: rept_cod, dtype: int64

In [58]:
# normalize the auth_num values with normalize_text (helper defined earlier in the notebook),
# then count how often each value occurs
data['auth_num'] = data['auth_num'].apply(normalize_text)

data['auth_num'].value_counts()

                                    192598
gbmhraeyc 00245177                     705
frafssapsgr20191138                    632
gbmhraadr 28224885                     288
plmerck healthcare kgaa9284737         273
                                     ...  
frafssapsma20221761                      1
gbmhramed2022041412441384204h0fz         1
frafssapsly20222044                      1
frafssapspb20191053                      1
frafssapsrn20180313                      1
Name: auth_num, Length: 7173, dtype: int64

In [59]:
# normalize the mfr_num values with normalize_text (helper defined earlier in the notebook),
# then count how often each value occurs
data['mfr_num'] = data['mfr_num'].apply(normalize_text)

data['mfr_num'].value_counts()

                                        19266
frglaxosmithklinefr2019133161             632
usglaxosmithklineus2020gsk063158          426
uspfizer inc2021308628                    414
plmylanlabs2020m1048017                   328
                                        ...  
usallergan1943746us                         1
usallergan1948292us                         1
usallergan1948657us                         1
usallergan1948906us                         1
gblupin pharmaceuticals inc201905354        1
Name: mfr_num, Length: 43247, dtype: int64

In [60]:
# normalize the mfr_sndr values with normalize_text (helper defined earlier in the notebook),
# then count how often each sender occurs
data['mfr_sndr'] = data['mfr_sndr'].apply(normalize_text)

data['mfr_sndr'].value_counts()

aurobindo                   31659
pfizer                      22649
mylan                       20716
fdactu                      19266
teva                        13826
                            ...  
woodward pharma services        1
aralez pharmaceuticals          1
ge healthcare                   1
nuvo pharmaceuticals            1
bayer                           1
Name: mfr_sndr, Length: 165, dtype: int64

In [61]:
# report-type, authority/manufacturer number, sender and literature-reference columns
# are not used in the further analysis => remove these columns
# NOTE(review): the previous rationale here ("focus on active ingredients instead of drugname")
# does not match the columns being dropped - confirm the intended reason
data = data.drop(columns=['rept_cod', 'auth_num', 'mfr_num', 'mfr_sndr', 'lit_ref'])

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247721 entries, 0 to 247720
Data columns (total 22 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   primaryid         247721 non-null  int64         
 1   role_cod          247721 non-null  object        
 2   prod_ai           247721 non-null  object        
 3   indi_pt           247721 non-null  object        
 4   event_dt          247721 non-null  datetime64[ns]
 5   age               191783 non-null  float64       
 6   age_cod           191859 non-null  object        
 7   age_grp           35539 non-null   object        
 8   sex               226104 non-null  object        
 9   e_sub             247721 non-null  object        
 10  wt                94308 non-null   float64       
 11  wt_cod            94308 non-null   object        
 12  to_mfr            19167 non-null   object        
 13  occp_cod          245904 non-null  object        
 14  repo

Remove columns from initial dataset:

DRUG = ['drug_seq', 'drugname', 'prod_ai' - *keep 'prod_ai_cleaned'*, 'val_vbm', 'route', 'dose_vbm', 'cum_dose_chr', 'cum_dose_unit',
'dechal', 'rechal', 'lot_num', 'exp_dt', 'nda_num', 'dose_amt', 'dose_unit', 'dose_form', 'dose_freq']

INDI = ['indi_drug_seq']

THER = ['dsg_drug_seq', 'start_dt', 'end_dt', 'dur', 'dur_cod']

DEMO = ['caseid', 'caseversion', 'i_f_code', 'mfr_dt', 'init_fda_dt', 'fda_dt', 'rept_cod', 'auth_num', 'mfr_num', 'mfr_sndr', 'lit_ref']

'rept_dt',

#### 2.5.4 Check age, age_cod, age_grp

In [62]:
# summary statistics of the raw age values (the `include` argument is ignored for a Series, so it is omitted)
data['age'].describe()

count    191783.000000
mean         62.683799
std         564.056113
min           0.000000
25%          30.000000
50%          48.000000
75%          63.000000
max       32056.000000
Name: age, dtype: float64

In [63]:
# normalize the age_cod values with normalize_text (helper defined earlier in the notebook),
# then count how often each age unit occurs
data['age_cod'] = data['age_cod'].apply(normalize_text)

data['age_cod'].value_counts()

yr     183690
        55862
dy       4778
dec      2548
wk        427
mon       331
hr         85
Name: age_cod, dtype: int64

In [64]:
# multipliers that convert an age expressed in each age_cod unit into years
conversion_factors = {
    'yr': 1,
    'mon': 1 / 12,
    'wk': 1 / 52,
    'dy': 1 / 365,
    'hr': 1 / 8760,
    'dec': 10,
}

# age in years = age * multiplier of its unit;
# an age_cod that is not a key of the table maps to NaN, so age_cleaned is NaN for that row
data['age_cleaned'] = data['age'].mul(data['age_cod'].map(conversion_factors))

print(data.head(20))

    primaryid         role_cod        prod_ai             indi_pt   event_dt  \
0    74189024  primary suspect     sertraline  depressive symptom 2023-10-26   
1    74189024  primary suspect     sertraline  depressive symptom 2023-10-26   
2    74189024  primary suspect     sertraline  depressive symptom 2023-10-26   
3    74189024  primary suspect     sertraline  depressive symptom 2023-10-26   
4    74189024  primary suspect     sertraline  depressive symptom 2023-10-26   
5    74189024  primary suspect     sertraline  depressive symptom 2023-10-26   
6    74189024  primary suspect     sertraline  depressive symptom 2023-10-26   
7    74189024  primary suspect     sertraline  depressive symptom 2023-10-26   
8   109174139  primary suspect  oxcarbazepine          depression 2023-10-16   
9   109174139  primary suspect  oxcarbazepine          depression 2023-10-16   
10  109174139  primary suspect  oxcarbazepine          depression 2023-10-16   
11  109174139  primary suspect  oxcarbaz

In [65]:
# rows that report an age number but whose age_cod is blank (unit unknown)
na_age_cod = data.loc[data['age'].notna() & data['age_cod'].eq('')]

print(na_age_cod['age'].unique())

[]


In [66]:
# summary statistics of the age converted to years (the `include` argument is ignored for a Series, so it is omitted)
data['age_cleaned'].describe()

count    191783.000000
mean         47.389630
std          20.949671
min           0.000000
25%          31.000000
50%          48.000000
75%          63.000000
max         420.000000
Name: age_cleaned, dtype: float64

In [67]:
# Keep only patients with a usable age for further analysis: 12 <= age_cleaned < 123 (years).
# - Rows without an age are NaN in age_cleaned; NaN fails both comparisons, so the
#   same mask removes them and no separate dropna() step is needed.
# - age_cleaned is a float column, so it can never contain '' and needs no replace('', NaN).
# - .copy() makes the filtered result an independent frame, so the column assignments
#   in later cells are not made on a slice (avoids SettingWithCopyWarning).
data = data[(data['age_cleaned'] >= 12) & (data['age_cleaned'] < 123)].copy()

data['age_cleaned'].describe()

count    185754.000000
mean         48.900512
std          19.467537
min          12.000000
25%          33.000000
50%          49.000000
75%          63.000000
max         103.000000
Name: age_cleaned, dtype: float64

In [68]:
# normalize the reported age_grp values with normalize_text (helper defined earlier in the notebook),
# then count how often each group code occurs
data['age_grp'] = data['age_grp'].apply(normalize_text)

data['age_grp'].value_counts()

     159970
a     20274
e      4450
t      1056
c         4
Name: age_grp, dtype: int64

In [69]:
# recalculate age_grp based on FDA definition
def set_age_group(age):
    """Map an age in years to the age-group label used in this notebook.

    Returns 'adolescent' for 12 <= age < 21, 'adult' for 21 <= age < 65,
    'elderly' for age >= 65, and None for anything else (ages below 12, or NaN,
    which fails every comparison).
    """
    # test the thresholds from the highest down, so each check only needs a lower bound
    if age >= 65:
        return 'elderly'
    if age >= 21:
        return 'adult'
    if age >= 12:
        return 'adolescent'
    return None

# overwrite the reported age_grp with the group derived from age_cleaned by set_age_group
data['age_grp'] = data['age_cleaned'].apply(set_age_group)

# number of rows in each derived age group
print(data['age_grp'].value_counts())

adult         128763
elderly        43835
adolescent     13156
Name: age_grp, dtype: int64


In [70]:
# replace 'age' with 'age_cleaned' (age expressed in years)
data['age'] = data['age_cleaned']

# remove 'age_cod' & 'age_cleaned', which are no longer needed
data = data.drop(columns=['age_cod', 'age_cleaned'])

# double-check
# DataFrame.info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" line to the output
data.info()
# `include` is ignored for a Series, so describe() is called without it
print(data['age'].describe())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 185754 entries, 0 to 247720
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   primaryid         185754 non-null  int64         
 1   role_cod          185754 non-null  object        
 2   prod_ai           185754 non-null  object        
 3   indi_pt           185754 non-null  object        
 4   event_dt          185754 non-null  datetime64[ns]
 5   age               185754 non-null  float64       
 6   age_grp           185754 non-null  object        
 7   sex               184445 non-null  object        
 8   e_sub             185754 non-null  object        
 9   wt                79125 non-null   float64       
 10  wt_cod            79125 non-null   object        
 11  to_mfr            18222 non-null   object        
 12  occp_cod          184176 non-null  object        
 13  reporter_country  185754 non-null  object        
 14  occr

Remove columns from initial dataset:

DRUG = ['drug_seq', 'drugname', 'prod_ai' - *keep 'prod_ai_cleaned'*, 'val_vbm', 'route', 'dose_vbm', 'cum_dose_chr', 'cum_dose_unit',
'dechal', 'rechal', 'lot_num', 'exp_dt', 'nda_num', 'dose_amt', 'dose_unit', 'dose_form', 'dose_freq']

INDI = ['indi_drug_seq']

THER = ['dsg_drug_seq', 'start_dt', 'end_dt', 'dur', 'dur_cod']

DEMO = ['caseid', 'caseversion', 'i_f_code', 'mfr_dt', 'init_fda_dt', 'fda_dt', 'rept_cod', 'auth_num', 'mfr_num', 'mfr_sndr', 'lit_ref', 'age_cod']

'rept_dt',

#### 2.5.5 Check sex, e_sub

In [71]:
# count how often each raw sex code occurs
data['sex'].value_counts()

F      125078
M       59328
UNK        29
T           6
P           4
Name: sex, dtype: int64

In [72]:
# the 'sex' column is called 'gender' from here on
data = data.rename(columns={'sex': 'gender'})

# readable labels for the raw codes; every code other than F / M is collapsed into 'unknown'
gender_dic = {
    'F': 'female',
    'M': 'male',
    **dict.fromkeys(('UNK', 'T', 'P'), 'unknown'),
}

data['gender'] = data['gender'].replace(gender_dic)

print(data['gender'].value_counts())

female     125078
male        59328
unknown        39
Name: gender, dtype: int64


In [73]:
# Keep only reports whose gender is known: remove 'unknown', blank and missing values.
# Blank strings are turned into NaN first, then a single mask selects the rows to keep.
# The filtered frame is copied so that no later assignment is made on a slice
# (avoids SettingWithCopyWarning).
data['gender'] = data['gender'].replace('', np.nan)
data = data[data['gender'].notna() & (data['gender'] != 'unknown')].copy()

print(data['gender'].value_counts())
# DataFrame.info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" line to the output
data.info()

female    125078
male       59328
Name: gender, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 184406 entries, 0 to 247720
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   primaryid         184406 non-null  int64         
 1   role_cod          184406 non-null  object        
 2   prod_ai           184406 non-null  object        
 3   indi_pt           184406 non-null  object        
 4   event_dt          184406 non-null  datetime64[ns]
 5   age               184406 non-null  float64       
 6   age_grp           184406 non-null  object        
 7   gender            184406 non-null  object        
 8   e_sub             184406 non-null  object        
 9   wt                78708 non-null   float64       
 10  wt_cod            78708 non-null   object        
 11  to_mfr            17959 non-null   object        
 12  occp_cod          182838 non-null  object        
 13

In [74]:
# count how often each e_sub value occurs
data['e_sub'].value_counts()

Y    166385
N     18021
Name: e_sub, dtype: int64

In [75]:
# e_sub is not useful for the analysis => remove
data = data.drop('e_sub', axis='columns')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 184406 entries, 0 to 247720
Data columns (total 20 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   primaryid         184406 non-null  int64         
 1   role_cod          184406 non-null  object        
 2   prod_ai           184406 non-null  object        
 3   indi_pt           184406 non-null  object        
 4   event_dt          184406 non-null  datetime64[ns]
 5   age               184406 non-null  float64       
 6   age_grp           184406 non-null  object        
 7   gender            184406 non-null  object        
 8   wt                78708 non-null   float64       
 9   wt_cod            78708 non-null   object        
 10  to_mfr            17959 non-null   object        
 11  occp_cod          182838 non-null  object        
 12  reporter_country  184406 non-null  object        
 13  occr_country      175665 non-null  object        
 14  pt  

Remove columns from initial dataset:

* DRUG = ['drug_seq', 'drugname', 'prod_ai' - *keep 'prod_ai_cleaned'*, 'val_vbm', 'route', 'dose_vbm', 'cum_dose_chr', 'cum_dose_unit',
'dechal', 'rechal', 'lot_num', 'exp_dt', 'nda_num', 'dose_amt', 'dose_unit', 'dose_form', 'dose_freq']

* INDI = ['indi_drug_seq']

* THER = ['dsg_drug_seq', 'start_dt', 'end_dt', 'dur', 'dur_cod']

* DEMO = ['caseid', 'caseversion', 'i_f_code', 'mfr_dt', 'init_fda_dt', 'fda_dt', 'rept_cod', 'auth_num', 'mfr_num', 'mfr_sndr', 'lit_ref', 'age_cod', 'e_sub']

* 'rept_dt',


Non-AEs filtered out for event_dt: the top 3 AEs in the missing-value bias check are not actual AEs -> remove these 3 symptoms from the data:
* non_ae = ['Drug ineffective', 'Drug interaction', 'Off label use']

#### 2.5.6 Check wt, wt_cod, to_mfr, occp_cod, reporter_country, occr_country

In [76]:
# normalize the reporter_country codes with normalize_text (helper defined earlier in the notebook),
# then count how often each country code occurs
data['reporter_country'] = data['reporter_country'].apply(normalize_text)

data['reporter_country'].value_counts()

us    60150
gb    27350
ca    17154
fr    15209
de     8814
      ...  
aw        2
kz        1
um        1
bb        1
ss        1
Name: reporter_country, Length: 89, dtype: int64

In [77]:
# normalize the occr_country codes with normalize_text (helper defined earlier in the notebook)
data['occr_country'] = data['occr_country'].apply(normalize_text)

# show the frequency of each country code and the list of distinct codes
print(data['occr_country'].value_counts())
print(data['occr_country'].unique())

us    54056
gb    26611
ca    16487
fr    16104
de     9566
      ...  
jm        2
um        1
bb        1
sd        1
kz        1
Name: occr_country, Length: 90, dtype: int64
['pt' 'es' 'us' 'dk' 'de' 'fr' 'ca' 'pl' 'tr' 'jp' 'se' 'gb' '' 'in' 'il'
 'it' 'hr' 'be' 'cn' 'si' 'nl' 'fi' 'au' 'ch' 'bg' 'br' 'za' 'hk' 'rs'
 'ru' 'tw' 'pe' 'ar' 'sa' 'pr' 'pa' 'ae' 'ua' 'mx' 'ma' 'at' 'th' 'gr'
 'lu' 'tn' 'ec' 'ie' 'no' 'kr' 'sd' 'eg' 'co' 'hu' 'cr' 'lt' 'cz' 'ro'
 'ir' 'mc' 'sk' 'mt' 'is' 'ph' 'my' 'id' 'sv' 'pk' 'nz' 'cl' 'np' 'lk'
 'bb' 'ee' 'kz' 'kw' 'al' 'cy' 'sg' 'ps' 'lb' 'jm' 'ke' 'um' 'jo' 'ug'
 'dm' 'zw' 'qa' 'ng' 'aw']


In [78]:
# a blank occurrence-country code counts as missing ...
data['occr_country'] = data['occr_country'].replace({'': np.nan})

# ... and rows without an occurrence country are removed
data = data.dropna(axis='index', subset=['occr_country'])

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175665 entries, 0 to 247720
Data columns (total 20 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   primaryid         175665 non-null  int64         
 1   role_cod          175665 non-null  object        
 2   prod_ai           175665 non-null  object        
 3   indi_pt           175665 non-null  object        
 4   event_dt          175665 non-null  datetime64[ns]
 5   age               175665 non-null  float64       
 6   age_grp           175665 non-null  object        
 7   gender            175665 non-null  object        
 8   wt                72681 non-null   float64       
 9   wt_cod            72681 non-null   object        
 10  to_mfr            11676 non-null   object        
 11  occp_cod          174749 non-null  object        
 12  reporter_country  175665 non-null  object        
 13  occr_country      175665 non-null  object        
 14  pt  

In [79]:
# frequency of each occurrence-country code in the remaining rows
country_check = data['occr_country'].value_counts()
print(country_check)

# optional (disabled): export the counts to a CSV file for manual checking
# country_check_df = country_check.reset_index()
# country_check_df.columns = ['occr_country', 'count']
# country_check_df.to_csv('country_check_df.csv', index=False)

us    54056
gb    26611
ca    16487
fr    16104
de     9566
      ...  
jm        2
um        1
bb        1
sd        1
kz        1
Name: occr_country, Length: 89, dtype: int64


In [80]:
# Readable names for the 20 country codes that are reported individually.
# Every other code is grouped under 'others' below.
country_map = {
    'us': 'united states',
    'gb': 'united kingdom',
    'ca': 'canada',
    'fr': 'france',
    'de': 'germany',
    'it': 'italy',
    'es': 'spain',
    'pl': 'poland',
    'pt': 'portugal',
    'jp': 'japan',
    'se': 'sweden',
    'cn': 'china',
    'nl': 'netherlands',
    'gr': 'greece',
    'br': 'brazil',
    'be': 'belgium',
    'au': 'australia',
    'cz': 'czechia',
    'ch': 'switzerland',
    'tr': 'turkey',
}

# Codes missing from country_map come back from .map() as NaN and are labelled 'others'.
mapped_names = data['occr_country'].map(country_map)
data['event_country'] = mapped_names.fillna('others')

# Side-by-side view of code and mapped name for a quick sanity check.
print(data[['occr_country', 'event_country']].head(20))

   occr_country event_country
0            pt      portugal
1            pt      portugal
2            pt      portugal
3            pt      portugal
4            pt      portugal
5            pt      portugal
6            pt      portugal
7            pt      portugal
8            es         spain
9            es         spain
10           es         spain
11           es         spain
12           es         spain
13           es         spain
14           es         spain
15           es         spain
16           es         spain
17           es         spain
18           es         spain
19           es         spain


In [81]:
# Compare the raw country tally with the mapped 'event_country' distribution.
# DataFrame.info() prints its report itself and returns None, so it is called
# bare; wrapping it in print() would add a stray "None" line to the output.
data.info()
print(country_check)
print(data['event_country'].value_counts())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175665 entries, 0 to 247720
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   primaryid         175665 non-null  int64         
 1   role_cod          175665 non-null  object        
 2   prod_ai           175665 non-null  object        
 3   indi_pt           175665 non-null  object        
 4   event_dt          175665 non-null  datetime64[ns]
 5   age               175665 non-null  float64       
 6   age_grp           175665 non-null  object        
 7   gender            175665 non-null  object        
 8   wt                72681 non-null   float64       
 9   wt_cod            72681 non-null   object        
 10  to_mfr            11676 non-null   object        
 11  occp_cod          174749 non-null  object        
 12  reporter_country  175665 non-null  object        
 13  occr_country      175665 non-null  object        
 14  pt  

In [82]:
# Overwrite the raw country code with its readable name ...
data['occr_country'] = data['event_country']

# ... then drop the helper column together with 'reporter_country'.
data = data.drop(columns=['reporter_country', 'event_country'])

# Double-check. info() prints on its own and returns None, so it is not wrapped
# in print() (that would emit an extra "None").
data.info()
print(data['occr_country'].value_counts())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175665 entries, 0 to 247720
Data columns (total 19 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   primaryid     175665 non-null  int64         
 1   role_cod      175665 non-null  object        
 2   prod_ai       175665 non-null  object        
 3   indi_pt       175665 non-null  object        
 4   event_dt      175665 non-null  datetime64[ns]
 5   age           175665 non-null  float64       
 6   age_grp       175665 non-null  object        
 7   gender        175665 non-null  object        
 8   wt            72681 non-null   float64       
 9   wt_cod        72681 non-null   object        
 10  to_mfr        11676 non-null   object        
 11  occp_cod      174749 non-null  object        
 12  occr_country  175665 non-null  object        
 13  pt            175665 non-null  object        
 14  drug_rec_act  693 non-null     object        
 15  outc_cod      157

In [83]:
# Run the notebook's shared text cleaner over 'occp_cod' and tabulate the result.
occp_normalized = data['occp_cod'].apply(normalize_text)
data['occp_cod'] = occp_normalized

occp_normalized.value_counts()

cn    64774
md    49420
hp    33314
ot    14504
ph    12523
        916
lw      214
Name: occp_cod, dtype: int64

In [84]:
# Columns that are not necessary for the further analysis.
unused_demo_cols = ['wt', 'wt_cod', 'to_mfr', 'occp_cod']
data = data.drop(columns=unused_demo_cols)

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175665 entries, 0 to 247720
Data columns (total 15 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   primaryid     175665 non-null  int64         
 1   role_cod      175665 non-null  object        
 2   prod_ai       175665 non-null  object        
 3   indi_pt       175665 non-null  object        
 4   event_dt      175665 non-null  datetime64[ns]
 5   age           175665 non-null  float64       
 6   age_grp       175665 non-null  object        
 7   gender        175665 non-null  object        
 8   occr_country  175665 non-null  object        
 9   pt            175665 non-null  object        
 10  drug_rec_act  693 non-null     object        
 11  outc_cod      157746 non-null  object        
 12  rpsr_cod      11550 non-null   object        
 13  webmd_group   175665 non-null  object        
 14  webmd_ad      175665 non-null  object        
dtypes: datetime64[ns]

Remove columns from initial dataset:

* DRUG = ['drug_seq', 'drugname', 'prod_ai' - *keep 'prod_ai_cleaned'*, 'val_vbm', 'route', 'dose_vbm', 'cum_dose_chr', 'cum_dose_unit',
'dechal', 'rechal', 'lot_num', 'exp_dt', 'nda_num', 'dose_amt', 'dose_unit', 'dose_form', 'dose_freq']

* INDI = ['indi_drug_seq']

* THER = ['dsg_drug_seq', 'start_dt', 'end_dt', 'dur', 'dur_cod']

* DEMO = ['caseid', 'caseversion', 'i_f_code', 'mfr_dt', 'init_fda_dt', 'fda_dt', 'rept_cod', 'auth_num', 'mfr_num', 'mfr_sndr', 'lit_ref',
'age_cod', 'e_sub', 'wt', 'wt_cod', 'rept_dt', 'to_mfr', 'occp_cod', 'reporter_country']

* 


Non-AEs filtered out for event_dt: the top 3 terms in the missing-data bias check are not adverse events -> remove these 3 terms from the data:
* non_ae = ['Drug ineffective', 'Drug interaction', 'Off label use']

### 2.6 CHECK DATA - REAC & OUTC

#### 2.6.1 Check outc_cod

In [85]:
# Run the notebook's shared text cleaner over 'outc_cod' and tabulate the result.
outc_normalized = data['outc_cod'].apply(normalize_text)
data['outc_cod'] = outc_normalized

outc_normalized.value_counts()

ot    79156
ho    49006
      17919
lt    13750
ds    10507
de     4809
ri      313
ca      205
Name: outc_cod, dtype: int64

In [86]:
# Readable labels for the patient-outcome codes.
outcome_map = {'ot': 'others',
               'ho': 'hospitalisation',
               'lt': 'life threatening',
               'ds': 'disability',
               'de': 'death',
               'ri': 'required intervention',
               'ca': 'congenital anomaly'}

# Map outc_cod to its label. Any value that is not a key of outcome_map
# (including blank strings and NaN) comes back from .map() as NaN and is then
# labelled 'others', so no separate blank -> NaN replacement is needed.
# NOTE(review): this puts rows with no recorded outcome into the same bucket as
# the explicit 'ot' code -- confirm that merging the two is intended.
data['outc_cod'] = data['outc_cod'].map(outcome_map).fillna('others')

print(data['outc_cod'].value_counts())
# info() prints its report itself and returns None; print(data.info()) would
# append a stray "None" line.
data.info()

others                   97075
hospitalisation          49006
life threatening         13750
disability               10507
death                     4809
required intervention      313
congenital anomaly         205
Name: outc_cod, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 175665 entries, 0 to 247720
Data columns (total 15 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   primaryid     175665 non-null  int64         
 1   role_cod      175665 non-null  object        
 2   prod_ai       175665 non-null  object        
 3   indi_pt       175665 non-null  object        
 4   event_dt      175665 non-null  datetime64[ns]
 5   age           175665 non-null  float64       
 6   age_grp       175665 non-null  object        
 7   gender        175665 non-null  object        
 8   occr_country  175665 non-null  object        
 9   pt            175665 non-null  object        
 10  drug_rec_act  693 non-null

#### 2.6.3 Check rpsr_cod, drug_rec_act

In [87]:
# Run the notebook's shared text cleaner over 'rpsr_cod' and tabulate the result.
rpsr_normalized = data['rpsr_cod'].apply(normalize_text)
data['rpsr_cod'] = rpsr_normalized

rpsr_normalized.value_counts()

       164115
csm      9647
hp       1574
fgn       323
uf          6
Name: rpsr_cod, dtype: int64

In [88]:
# Run the notebook's shared text cleaner over 'drug_rec_act' and tabulate the result.
drug_rec_act_normalized = data['drug_rec_act'].apply(normalize_text)
data['drug_rec_act'] = drug_rec_act_normalized

drug_rec_act_normalized.value_counts()

                                    174972
toxicity to various agents              19
serotonin syndrome                      19
depressed level of consciousness        15
condition aggravated                    13
                                     ...  
pupillary deformity                      1
bladder dilatation                       1
urinary retention                        1
dyspnoea                                 1
pneumonia aspiration                     1
Name: drug_rec_act, Length: 285, dtype: int64

In [89]:
# Columns that are not necessary for the further analysis.
# This dataset focuses on the 1st adverse event, hence drug_rec_act is dropped as well.
unused_event_cols = ['rpsr_cod', 'drug_rec_act']
data = data.drop(columns=unused_event_cols)

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175665 entries, 0 to 247720
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   primaryid     175665 non-null  int64         
 1   role_cod      175665 non-null  object        
 2   prod_ai       175665 non-null  object        
 3   indi_pt       175665 non-null  object        
 4   event_dt      175665 non-null  datetime64[ns]
 5   age           175665 non-null  float64       
 6   age_grp       175665 non-null  object        
 7   gender        175665 non-null  object        
 8   occr_country  175665 non-null  object        
 9   pt            175665 non-null  object        
 10  outc_cod      175665 non-null  object        
 11  webmd_group   175665 non-null  object        
 12  webmd_ad      175665 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(10)
memory usage: 18.8+ MB


Remove columns from initial dataset:

* DRUG = ['drug_seq', 'drugname', 'prod_ai' - *keep 'prod_ai_cleaned'*, 'val_vbm', 'route', 'dose_vbm', 'cum_dose_chr', 'cum_dose_unit',
'dechal', 'rechal', 'lot_num', 'exp_dt', 'nda_num', 'dose_amt', 'dose_unit', 'dose_form', 'dose_freq']

* INDI = ['indi_drug_seq']

* THER = ['dsg_drug_seq', 'start_dt', 'end_dt', 'dur', 'dur_cod']

* DEMO = ['caseid', 'caseversion', 'i_f_code', 'mfr_dt', 'init_fda_dt', 'fda_dt', 'rept_cod', 'auth_num', 'mfr_num', 'mfr_sndr', 'lit_ref',
'age_cod', 'e_sub', 'wt', 'wt_cod', 'rept_dt', 'to_mfr', 'occp_cod', 'reporter_country']

* REAC = ['drug_rec_act']

* RPSR = ['rpsr_cod']


Non-AEs filtered out for event_dt: the top 3 terms in the missing-data bias check are not adverse events -> remove these 3 terms from the data:
* non_ae = ['Drug ineffective', 'Drug interaction', 'Off label use']

### 2.7 CHECK DATA BEFORE ADVERSE EVENTS

In [90]:
# Keep one row per (primaryid, pt) pair: the first occurrence is retained and
# later rows with the same pair are dropped, even if their other columns differ.
# NOTE(review): 'pt' is only passed through normalize_text in section 2.8, after
# this step -- confirm that pt values are already consistent at this point.
dedup_keys = ['primaryid', 'pt']
data = data.drop_duplicates(subset=dedup_keys, keep='first')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 126101 entries, 0 to 247720
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   primaryid     126101 non-null  int64         
 1   role_cod      126101 non-null  object        
 2   prod_ai       126101 non-null  object        
 3   indi_pt       126101 non-null  object        
 4   event_dt      126101 non-null  datetime64[ns]
 5   age           126101 non-null  float64       
 6   age_grp       126101 non-null  object        
 7   gender        126101 non-null  object        
 8   occr_country  126101 non-null  object        
 9   pt            126101 non-null  object        
 10  outc_cod      126101 non-null  object        
 11  webmd_group   126101 non-null  object        
 12  webmd_ad      126101 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(10)
memory usage: 13.5+ MB


### 2.8 CHECK ADVERSE EVENTS

In [91]:
# Run the notebook's shared text cleaner over the adverse-event terms and tabulate them.
pt_normalized = data['pt'].apply(normalize_text)
data['pt'] = pt_normalized

pt_normalized.value_counts()

nausea                               2183
suicidal ideation                    2072
anxiety                              1886
dizziness                            1798
depression                           1763
                                     ... 
joint instability                       1
staphylococcal bacteraemia              1
coronary arterial stent insertion       1
perforated ulcer                        1
anaemia folate deficiency               1
Name: pt, Length: 3854, dtype: int64

In [92]:
# Row count per adverse-event term, most frequent first.
pt_check = data['pt'].value_counts(sort=True, ascending=False)
print(pt_check)

# Optional export of the tally for manual checking (disabled):
# pt_check_df = pt_check.reset_index()
# pt_check_df.columns = ['pt', 'count']
# pt_check_df.to_csv('pt_check_df.csv', index=False)

nausea                               2183
suicidal ideation                    2072
anxiety                              1886
dizziness                            1798
depression                           1763
                                     ... 
joint instability                       1
staphylococcal bacteraemia              1
coronary arterial stent insertion       1
perforated ulcer                        1
anaemia folate deficiency               1
Name: pt, Length: 3854, dtype: int64


In [93]:
# Terms in 'pt' that are not adverse events (product-quality, device, medication-error,
# off-label-use and similar entries); rows carrying them are filtered out.
# The list was compiled after reviewing the 1000 highest-frequency pt values plus
# 116 further pt values picked via repetitive keywords (~91% per the original note).
# => 174 non-AE terms.
#
# 'intention tremor' was removed from this list: it is a clinical sign (a tremor),
# not a product/usage issue, and was presumably picked up by the "intention(al)" keyword.
# NOTE(review): a few remaining entries also describe clinical conditions rather than
# product/usage problems ('abortion missed', 'retained products of conception',
# 'pregnancy', the covid19 terms) -- confirm that excluding them is deliberate.

non_ae = [
    'drug ineffective', 'drug interaction', 'off label use', 'product use in unapproved indication',
    'product substitution issue', 'overdose', 'intentional overdose', 'intentional product misuse',
    'product dose omission issue', 'wrong technique in product usage process', 'product use issue',
    'intentional product use issue', 'medication error', 'underdose', 'incorrect dose administered',
    'prescribed overdose', 'product quality issue', 'product dose omission',
    'drug ineffective for unapproved indication', 'covid19', 'product prescribing error',
    'inappropriate schedule of product administration', 'contraindicated product administered',
    'product dispensing error', 'accidental overdose', 'product odour abnormal', 'device malfunction',
    'no adverse event', 'labelled drugdrug interaction medication error', 'product administration error',
    'product residue present', 'device issue', 'product complaint', 'extra dose administered',
    'expired product administered', 'product availability issue', 'drug effective for unapproved indication',
    'product formulation issue', 'intentional dose omission', 'product taste abnormal',
    'product physical issue', 'drug dose omission', 'unevaluable event', 'drug screen false positive',
    'poor quality product administered', 'prescribed underdose', 'product dose omission in error',
    'incorrect route of product administration', 'product administered to patient of inappropriate age',
    'drug titration error', 'economic problem', 'product prescribing issue',
    'drug monitoring procedure incorrectly performed', 'product use complaint', 'pregnancy',
    'prescription drug used without a prescription', 'therapeutic product ineffective',
    'product solubility abnormal', 'therapeutic product effect delayed', 'drug dispensed to wrong patient',
    'product label issue', 'circumstance or information capable of leading to medication error',
    'suspected counterfeit product', 'wrong product administered', 'wrong dose', 'device defective',
    'manufacturing issue', 'inappropriate schedule of drug administration', 'product administration interrupted',
    'contraindication to medical treatment', 'suspected product quality issue',
    'manufacturing materials issue', 'product packaging quantity issue', 'product colour issue',
    'drug monitoring procedure not performed', 'accidental exposure to product',
    'incorrect product administration duration', 'drug administered to patient of inappropriate age',
    'drug dose titration not performed', 'product administered at inappropriate site', 'wrong strength',
    'product preparation error', 'inappropriate schedule of product discontinuation',
    'product storage error', 'product adhesion issue', 'duplicate therapy error', 'suspected covid19',
    'product supply issue', 'device breakage', 'wrong patient received product', 'product counterfeit',
    'retained products of conception', 'drug administration error', 'incorrect dose administered by device',
    'device dislocation', 'intercepted product selection error', 'device dispensing error',
    'incorrect dosage administered', 'intentional underdose', 'postacute covid19 syndrome',
    'product quality control issue', 'product measured potency issue', 'product lot number issue',
    'product size issue', 'syringe issue', 'product coating issue', 'product container issue',
    'abortion missed', 'counterfeit product administered', 'suspected product tampering',
    'product used for unknown indication', 'wrong dosage formulation', 'product dosage form issue',
    'product appearance confusion', 'product packaging issue', 'product shape issue', 'product commingling',
    'device adhesion issue', 'device difficult to use', 'product dispensing issue',
    'contraindicated product prescribed', 'product contamination physical', 'incorrect route of drug administration',
    'product physical consistency issue', 'product contamination', 'device leakage', 'product substitution',
    'drug prescribing error', 'recalled product administered', 'breakthrough covid19', 'covid19 pneumonia',
    'product after taste', 'device use issue', 'manufacturing production issue', 'wrong drug administered',
    'product distribution issue', 'drug dispensing error', 'product leakage', 'product communication issue',
    'accidental underdose', 'manufacturing process control procedure issue',
    'documented hypersensitivity to administered product', 'suspected product contamination',
    'discontinued product administered', 'recalled product', 'product compounding quality issue',
    'incorrect dose administered by product', 'physical product label issue', 'poor quality drug administered',
    'product monitoring error', 'therapeutic product effect prolonged', 'adulterated product',
    'intercepted medication error', 'intercepted product preparation error',
    'wrong technique in device usage process', 'device mechanical issue', 'product blister packaging issue',
    'medical device discomfort', 'wrong schedule', 'product tampering', 'labelled drugdisease interaction medication error',
    'product identification number issue', 'product confusion', 'drug therapy',
    'product contamination with body fluid', 'device failure', 'product container seal issue',
    'product selection error', 'incorrect product dosage form administered', 'product preparation issue',
    'incorrect drug administration rate', 'device programming error', 'product label confusion',
    'device infusion issue'
]


In [94]:
# Keep only the rows whose adverse-event term is not on the non_ae exclusion list.
data = data[~data['pt'].isin(non_ae)]

# info() prints its report itself and returns None, so it is called bare;
# print(data.info()) would add a stray "None" line to the output.
data.info()

# Last expression of the cell, so the preview is shown with the rich table display.
data.head(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118577 entries, 0 to 247720
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   primaryid     118577 non-null  int64         
 1   role_cod      118577 non-null  object        
 2   prod_ai       118577 non-null  object        
 3   indi_pt       118577 non-null  object        
 4   event_dt      118577 non-null  datetime64[ns]
 5   age           118577 non-null  float64       
 6   age_grp       118577 non-null  object        
 7   gender        118577 non-null  object        
 8   occr_country  118577 non-null  object        
 9   pt            118577 non-null  object        
 10  outc_cod      118577 non-null  object        
 11  webmd_group   118577 non-null  object        
 12  webmd_ad      118577 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(10)
memory usage: 12.7+ MB
None
    primaryid         role_cod      

### 2.9 FINAL DATA DETAILS

In [103]:
# Number of distinct primaryid values left in the cleaned data.
data['primaryid'].nunique(dropna=True)

35652

In [105]:
# Rename webmd_group & webmd_ad to the names used in the further analysis.
# (Nothing is written to disk in this cell.)
data = data.rename(columns={'webmd_group': 'prod_ai_group',
                            'webmd_ad': 'drug_cate'})

# info() prints its report itself and returns None, so it is called bare;
# print(data.info()) would add a stray "None" line to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118577 entries, 0 to 247720
Data columns (total 13 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   primaryid      118577 non-null  int64         
 1   role_cod       118577 non-null  object        
 2   prod_ai        118577 non-null  object        
 3   indi_pt        118577 non-null  object        
 4   event_dt       118577 non-null  datetime64[ns]
 5   age            118577 non-null  float64       
 6   age_grp        118577 non-null  object        
 7   gender         118577 non-null  object        
 8   occr_country   118577 non-null  object        
 9   pt             118577 non-null  object        
 10  outc_cod       118577 non-null  object        
 11  prod_ai_group  118577 non-null  object        
 12  drug_cate      118577 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(10)
memory usage: 12.7+ MB
None


In [106]:
# Summary statistics for every column of the final frame, one row per column.
# NOTE(review): the stored output lists event_dt starting at 2018-01-01, which is
# earlier than the January 2019 start named in the notebook title -- confirm the
# date filter applied upstream.
data.describe(include='all').transpose()

  data.describe(include='all').T


Unnamed: 0,count,unique,top,freq,first,last,mean,std,min,25%,50%,75%,max
primaryid,118577.0,,,,NaT,NaT,214149509.769719,226362961.436257,58509932.0,163496367.0,179871751.0,203117394.0,2288369510.0
role_cod,118577.0,1.0,primary suspect,118577.0,NaT,NaT,,,,,,,
prod_ai,118577.0,173.0,sertraline,17714.0,NaT,NaT,,,,,,,
indi_pt,118577.0,19.0,depression,91358.0,NaT,NaT,,,,,,,
event_dt,118577.0,2162.0,2019-01-01 00:00:00,1886.0,2018-01-01,2023-12-27,,,,,,,
age,118577.0,,,,NaT,NaT,49.645322,19.416225,12.0,34.0,50.0,64.0,103.0
age_grp,118577.0,3.0,adult,81583.0,NaT,NaT,,,,,,,
gender,118577.0,2.0,female,81612.0,NaT,NaT,,,,,,,
occr_country,118577.0,21.0,united states,39709.0,NaT,NaT,,,,,,,
pt,118577.0,3682.0,nausea,2183.0,NaT,NaT,,,,,,,


Columns are removed from initial dataset:

* DRUG = ['drug_seq', 'drugname', 'prod_ai' - *keep 'prod_ai_cleaned'*, 'val_vbm', 'route', 'dose_vbm', 'cum_dose_chr', 'cum_dose_unit',
'dechal', 'rechal', 'lot_num', 'exp_dt', 'nda_num', 'dose_amt', 'dose_unit', 'dose_form', 'dose_freq']

* INDI = ['indi_drug_seq']

* THER = ['dsg_drug_seq', 'start_dt', 'end_dt', 'dur', 'dur_cod']

* DEMO = ['caseid', 'caseversion', 'i_f_code', 'mfr_dt', 'init_fda_dt', 'fda_dt', 'rept_cod', 'auth_num', 'mfr_num', 'mfr_sndr', 'lit_ref',
'age_cod', 'e_sub', 'wt', 'wt_cod', 'rept_dt', 'to_mfr', 'occp_cod', 'reporter_country']

* REAC = ['drug_rec_act']

* RPSR = ['rpsr_cod']

Columns added to the final dataset after verifying the product active ingredients against WebMD: 'prod_ai_group', 'drug_cate'.

Non-AEs filtered out: refer to section 2.8.

In [107]:
# NOTE(review): the export below is commented out, so running the notebook as-is
# does not write FAERS_cleaned_data.csv -- re-enable it if a later step reads
# that file (TODO confirm).
# # save the cleaned data for AEs mapping & further analysis
# step2_data = data.copy()
# step2_data.to_csv('FAERS_cleaned_data.csv', index=False)

In [108]:
# NOTE(review): this whole cell is commented out, yet a table is stored as its
# output -- that output comes from an earlier run and is not regenerated when
# the notebook is re-executed. Re-enable the cell or clear the stale output.
#
# What the disabled code does: joins the unique indi_pt values of each primaryid
# into one string, counts how often each joined string occurs, and writes the
# counts as a LaTeX table to 'Focused_Indication.tex'.

# # Group the data by 'primaryid' and aggregate the indications
# aggregated_data = data.groupby('primaryid')['indi_pt'].apply(lambda x: ', '.join(x.unique())).reset_index()

# # Count the frequency of each unique indication across all primaryids
# value_counts = aggregated_data['indi_pt'].value_counts()

# # Create a formatted DataFrame for LaTeX table
# formatted_table = pd.DataFrame({
#     'Focused Indication': value_counts.index,
#     'Frequency': value_counts.values
# })

# # Specify the column format for the LaTeX table
# column_format = 'l@{}r'

# # Generate the LaTeX table
# latex_table = formatted_table.to_latex(
#     index=False,
#     header=True,
#     column_format=column_format,
#     escape=False
# )

# # Save the LaTeX table to a file
# with open('Focused_Indication.tex', 'w') as f:
#     f.write(latex_table)

# # Optionally, print the first few rows of the formatted table
# print(formatted_table.head())


                      Focused Indication  Frequency
0                             depression      27086
1                       major depression       4927
2  mixed anxiety and depressive disorder        916
3                 antidepressant therapy        745
4                         depressed mood        600


  latex_table = formatted_table.to_latex(


> Refer to fda_step3_map for the next part: mapping adverse events to MedDRA categories.