# Project: Analysis of Antidepressant Drugs’ Adverse Events Using the FDA Adverse Event Reporting System (FAERS), January 2019 to December 2023

## 3. MAPPING ADVERSE EVENTS WITH MEDDRA 27.1 

In [1]:
# load library
import pandas as pd
import numpy as np
import re

In [2]:
# normalise text function
def normalize_text(text):
    """Return ``text`` lower-cased with every punctuation character removed.

    Parameters
    ----------
    text : scalar
        Value to normalise. Missing values (``None``/``NaN``) yield ``''``.
        Non-string scalars are converted with ``str()`` first.

    Returns
    -------
    str
        The input with every character that is neither a word character
        (``\\w``) nor whitespace (``\\s``) deleted, then lower-cased.
        Punctuation is deleted, not replaced by a space, so ``'a/b'``
        becomes ``'ab'``.
    """
    if pd.isnull(text):
        return ''

    # Object-dtype columns may hold non-string scalars (e.g. numbers);
    # coerce so re.sub does not raise TypeError on them.
    return re.sub(r'[^\w\s]', '', str(text)).lower()

### 3.1 Create data of MedDRA

In [3]:
# import pandas as pd

# # Load the file (example: smq_content.asc) with potential formatting errors
# smq_content = pd.read_csv('data/MedDRA_27_1_English/MedAscii/smq_content.asc', sep='$', encoding='latin-1', header=None)

# # Detect columns with all NaN values
# smq_content_nan = smq_content.isnull().all()

# # Drop columns with all NaN values
# smq_content_cleaned = smq_content.loc[:, ~smq_content_nan]

# # Check the number of columns after cleaning
# print(f"Original columns: {smq_content.shape[1]}, Cleaned columns: {smq_content_cleaned.shape[1]}")

# # Assign appropriate column names
# smq_content_cleaned.columns = ['smq_code', 'term_code', 'term_level', 'term_scope', 'term_category', 'term_weight',
#                                'term_status', 'term_addition_version', 'term_last_modified_version']

# # drop unused cols and check the cleaned data
# smq_content_cleaned = smq_content_cleaned.drop(columns=['term_weight', 'term_addition_version', 'term_last_modified_version'])
# print(smq_content_cleaned.head())

In [4]:
# smq_content_cleaned.info()

In [5]:
# import pandas as pd

# # Load the file (example: smq_list.asc) with potential formatting errors
# smq_list = pd.read_csv('data/MedDRA_27_1_English/MedAscii/smq_list.asc', sep='$', encoding='latin-1', header=None)

# # Detect columns with all NaN values
# smq_list_nan = smq_list.isnull().all()

# # Drop columns with all NaN values
# smq_list_cleaned = smq_list.loc[:, ~smq_list_nan]

# # Check the number of columns after cleaning
# print(f"Original columns: {smq_list.shape[1]}, Cleaned columns: {smq_list_cleaned.shape[1]}")

# # Assign appropriate column names
# smq_list_cleaned.columns = ['smq_code', 'smq_name', 'smq_level', 'smq_description', 'smq_source', 'smq_note',
#                             'MedDRA_version', 'status', 'smq_algorithm']

# # drop unused cols and check the cleaned data
# smq_list_cleaned = smq_list_cleaned.drop(columns=['MedDRA_version', 'status', 'smq_algorithm'])
# print(smq_list_cleaned.head())

In [6]:
# smq_list_cleaned.info()

In [7]:
# Load the MedDRA hierarchy file (mdhier.asc): '$'-delimited, latin-1 encoded, no header row.
# pandas is already imported in the first cell, so it is not re-imported here.
mdhier = pd.read_csv('data/MedDRA_27_1_English/MedAscii/mdhier.asc', sep='$', encoding='latin-1', header=None)

# Drop columns with all NaN values
mdhier_cleaned = mdhier.dropna(axis=1, how='all')

# Check the number of columns after cleaning
print(f"Original columns: {mdhier.shape[1]}, Cleaned columns: {mdhier_cleaned.shape[1]}")

# Assign appropriate column names.
# Pandas raises ValueError here if the number of remaining columns is not 11.
# 'soc_abbrev' previously carried a trailing space, which would make
# mdhier_cleaned['soc_abbrev'] raise KeyError.
mdhier_cleaned.columns = ['pt_code', 'hlt_code', 'hlgt_code', 'soc_code',
                          'pt_name', 'hlt_name', 'hlgt_name', 'soc_name', 'soc_abbrev',
                          'pt_soc_code', 'primary_soc_fg']

# Verify the cleaned data
print(mdhier_cleaned.head())

Original columns: 13, Cleaned columns: 11
    pt_code  hlt_code  hlgt_code  soc_code                         pt_name  \
0  10002043  10002042   10002086  10005329       Anaemia folate deficiency   
1  10002080  10002042   10002086  10005329  Anaemia vitamin B12 deficiency   
2  10002081  10002042   10002086  10005329   Anaemia vitamin B6 deficiency   
3  10022972  10002042   10002086  10005329         Iron deficiency anaemia   
4  10034695  10002042   10002086  10005329              Pernicious anaemia   

               hlt_name                                     hlgt_name  \
0  Anaemia deficiencies  Anaemias nonhaemolytic and marrow depression   
1  Anaemia deficiencies  Anaemias nonhaemolytic and marrow depression   
2  Anaemia deficiencies  Anaemias nonhaemolytic and marrow depression   
3  Anaemia deficiencies  Anaemias nonhaemolytic and marrow depression   
4  Anaemia deficiencies  Anaemias nonhaemolytic and marrow depression   

                               soc_name soc_abbrev

In [8]:
# info() writes its report itself and returns None, which print() then echoes
print(mdhier_cleaned.info())

# Distinct values per column, then the split of the primary-SOC flag
for report in (mdhier_cleaned.nunique(), mdhier_cleaned['primary_soc_fg'].value_counts()):
    print(report)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41363 entries, 0 to 41362
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   pt_code         41363 non-null  int64 
 1   hlt_code        41363 non-null  int64 
 2   hlgt_code       41363 non-null  int64 
 3   soc_code        41363 non-null  int64 
 4   pt_name         41363 non-null  object
 5   hlt_name        41363 non-null  object
 6   hlgt_name       41363 non-null  object
 7   soc_name        41363 non-null  object
 8   soc_abbrev      41363 non-null  object
 9   pt_soc_code     41363 non-null  int64 
 10  primary_soc_fg  41363 non-null  object
dtypes: int64(5), object(6)
memory usage: 3.5+ MB
None
pt_code           26641
hlt_code           1738
hlgt_code           337
soc_code             27
pt_name           26641
hlt_name           1738
hlgt_name           337
soc_name             27
soc_abbrev           27
pt_soc_code          27
primary_soc_fg        2


In [9]:
# Keep only the rows flagged as the PT's primary SOC (primary_soc_fg == 'Y').
# The flag is compared case-insensitively because a later cell lower-cases every
# text column; a case-sensitive test would return an empty frame if this cell
# were re-run afterwards.
# .copy() makes the result an independent frame, so the column assignments in
# the following cell do not trigger SettingWithCopyWarning.
mdhier_cleaned = mdhier_cleaned[mdhier_cleaned['primary_soc_fg'].str.upper() == 'Y'].copy()

print(mdhier_cleaned.info())
print(mdhier_cleaned.nunique())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26641 entries, 0 to 41362
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   pt_code         26641 non-null  int64 
 1   hlt_code        26641 non-null  int64 
 2   hlgt_code       26641 non-null  int64 
 3   soc_code        26641 non-null  int64 
 4   pt_name         26641 non-null  object
 5   hlt_name        26641 non-null  object
 6   hlgt_name       26641 non-null  object
 7   soc_name        26641 non-null  object
 8   soc_abbrev      26641 non-null  object
 9   pt_soc_code     26641 non-null  int64 
 10  primary_soc_fg  26641 non-null  object
dtypes: int64(5), object(6)
memory usage: 2.4+ MB
None
pt_code           26641
hlt_code           1564
hlgt_code           326
soc_code             27
pt_name           26641
hlt_name           1564
hlgt_name           326
soc_name             27
soc_abbrev           27
pt_soc_code          27
primary_soc_fg        1


In [10]:
# Collect the text (object-dtype) columns first, then normalise each one
text_columns = [column for column in mdhier_cleaned.columns
                if mdhier_cleaned[column].dtype == 'object']
for column in text_columns:
    mdhier_cleaned[column] = mdhier_cleaned[column].apply(normalize_text)

# Preview the first rows after normalisation
mdhier_cleaned.head(10)

Unnamed: 0,pt_code,hlt_code,hlgt_code,soc_code,pt_name,hlt_name,hlgt_name,soc_name,soc_abbrev,pt_soc_code,primary_soc_fg
0,10002043,10002042,10002086,10005329,anaemia folate deficiency,anaemia deficiencies,anaemias nonhaemolytic and marrow depression,blood and lymphatic system disorders,blood,10005329,y
1,10002080,10002042,10002086,10005329,anaemia vitamin b12 deficiency,anaemia deficiencies,anaemias nonhaemolytic and marrow depression,blood and lymphatic system disorders,blood,10005329,y
2,10002081,10002042,10002086,10005329,anaemia vitamin b6 deficiency,anaemia deficiencies,anaemias nonhaemolytic and marrow depression,blood and lymphatic system disorders,blood,10005329,y
3,10022972,10002042,10002086,10005329,iron deficiency anaemia,anaemia deficiencies,anaemias nonhaemolytic and marrow depression,blood and lymphatic system disorders,blood,10005329,y
4,10034695,10002042,10002086,10005329,pernicious anaemia,anaemia deficiencies,anaemias nonhaemolytic and marrow depression,blood and lymphatic system disorders,blood,10005329,y
5,10037006,10002042,10002086,10005329,protein deficiency anaemia,anaemia deficiencies,anaemias nonhaemolytic and marrow depression,blood and lymphatic system disorders,blood,10005329,y
7,10061101,10002042,10002086,10005329,deficiency anaemia,anaemia deficiencies,anaemias nonhaemolytic and marrow depression,blood and lymphatic system disorders,blood,10005329,y
8,10066468,10002042,10002086,10005329,anaemia of pregnancy,anaemia deficiencies,anaemias nonhaemolytic and marrow depression,blood and lymphatic system disorders,blood,10005329,y
12,10084860,10002042,10002086,10005329,hypotransferrinaemia,anaemia deficiencies,anaemias nonhaemolytic and marrow depression,blood and lymphatic system disorders,blood,10005329,y
13,10086662,10002042,10002086,10005329,scorbutic anaemia,anaemia deficiencies,anaemias nonhaemolytic and marrow depression,blood and lymphatic system disorders,blood,10005329,y


In [11]:
# Summary statistics for every column of mdhier_cleaned (include='all' covers numeric and text columns)
mdhier_cleaned.describe(include='all')

Unnamed: 0,pt_code,hlt_code,hlgt_code,soc_code,pt_name,hlt_name,hlgt_name,soc_name,soc_abbrev,pt_soc_code,primary_soc_fg
count,26641.0,26641.0,26641.0,26641.0,26641,26641,26641,26641,26641,26641.0,26641
unique,,,,,26641,1564,326,27,27,,1
top,,,,,anaemia folate deficiency,virus identification and serology,haematology investigations incl blood groups,investigations,inv,,y
freq,,,,,1,313,762,6407,6407,,26641
mean,10054280.0,10031870.0,10028690.0,10026260.0,,,,,,10026260.0,
std,25362.27,20442.66,19050.55,10718.36,,,,,,10718.36,
min,10000000.0,10000030.0,10000070.0,10005330.0,,,,,,10005330.0,
25%,10036940.0,10017540.0,10017530.0,10021880.0,,,,,,10021880.0,
50%,10060090.0,10028640.0,10025320.0,10022890.0,,,,,,10022890.0,
75%,10074720.0,10043710.0,10038670.0,10029200.0,,,,,,10029200.0,


In [12]:
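# Save the normalised hierarchy for mapping; section 3.2 reads it back as 'meddra_map.csv'.
# Uncomment and run this once if that file does not exist yet.

# mdhier_cleaned.to_csv('meddra_map.csv', index=False) # => export file for manual checking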

### 3.2 Map data with our current data

In [13]:
# Reload the MedDRA lookup table from the CSV export (see the commented-out save step at the end of section 3.1)
meddra = pd.read_csv('meddra_map.csv')

# Column names, dtypes and non-null counts
meddra.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26641 entries, 0 to 26640
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   pt_code         26641 non-null  int64 
 1   hlt_code        26641 non-null  int64 
 2   hlgt_code       26641 non-null  int64 
 3   soc_code        26641 non-null  int64 
 4   pt_name         26641 non-null  object
 5   hlt_name        26641 non-null  object
 6   hlgt_name       26641 non-null  object
 7   soc_name        26641 non-null  object
 8   soc_abbrev      26641 non-null  object
 9   pt_soc_code     26641 non-null  int64 
 10  primary_soc_fg  26641 non-null  object
dtypes: int64(5), object(6)
memory usage: 2.2+ MB


In [14]:
# Load the cleaned FAERS records — presumably the output of an earlier step of this project (TODO confirm)
cleaned_data = pd.read_csv('FAERS_cleaned_data.csv')

# Column names, dtypes and non-null counts
cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118577 entries, 0 to 118576
Data columns (total 13 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   primaryid      118577 non-null  int64  
 1   role_cod       118577 non-null  object 
 2   prod_ai        118577 non-null  object 
 3   indi_pt        118577 non-null  object 
 4   event_dt       118577 non-null  object 
 5   age            118577 non-null  float64
 6   age_grp        118577 non-null  object 
 7   gender         118577 non-null  object 
 8   occr_country   118577 non-null  object 
 9   pt             118577 non-null  object 
 10  outc_cod       118577 non-null  object 
 11  prod_ai_group  118577 non-null  object 
 12  drug_cate      118577 non-null  object 
dtypes: float64(1), int64(1), object(11)
memory usage: 11.8+ MB


In [15]:
# Normalize the columns for consistency
cleaned_data['pt'] = cleaned_data['pt'].str.strip().str.lower()
meddra['pt_name'] = meddra['pt_name'].str.strip().str.lower()

# Create a mapping dictionary from meddra: normalised PT name -> SOC name
meddra_map = meddra.set_index('pt_name')['soc_name'].to_dict()


def map_pt(pt):
    """Look up a single preferred term in ``meddra_map``.

    Returns ``pd.Series([pt, soc_name])``; ``soc_name`` is ``None`` when the
    term has no exact match. Kept for one-off lookups: the column-wide mapping
    below is vectorised, because building one Series per row is very slow on a
    frame of this size.
    """
    return pd.Series([pt, meddra_map.get(pt)])


# Exact-match lookup for the whole column in one pass.
# 'pt_name' is the matched term, which for an exact match is 'pt' itself;
# 'soc_name' is NaN where the term is not a key of meddra_map.
cleaned_data['pt_name'] = cleaned_data['pt']
cleaned_data['soc_name'] = cleaned_data['pt'].map(meddra_map)

# Check unmatched rows
unmatched_rows = cleaned_data[cleaned_data['soc_name'].isna()]
print(f"Unmatched Rows: {len(unmatched_rows)}")
print(unmatched_rows[['pt']].head())

# Verify the updated DataFrame
print(cleaned_data.info())
print(cleaned_data.head())

Unmatched Rows: 490
                                          pt
154             nonalcoholic steatohepatitis
3243   gastrointestinal bacterial overgrowth
8670                        gastric disorder
9735                        gastric disorder
10507                       gastric disorder
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118577 entries, 0 to 118576
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   primaryid      118577 non-null  int64  
 1   role_cod       118577 non-null  object 
 2   prod_ai        118577 non-null  object 
 3   indi_pt        118577 non-null  object 
 4   event_dt       118577 non-null  object 
 5   age            118577 non-null  float64
 6   age_grp        118577 non-null  object 
 7   gender         118577 non-null  object 
 8   occr_country   118577 non-null  object 
 9   pt             118577 non-null  object 
 10  outc_cod       118577 non-null  object 
 11  prod_ai_g

In [16]:
# The mapped SOC is used as the adverse-event category in the analysis, so call it 'ae_cate'
cleaned_data = cleaned_data.rename({'soc_name': 'ae_cate'}, axis='columns')

print(cleaned_data.info())

# Rows per adverse-event category
cleaned_data['ae_cate'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118577 entries, 0 to 118576
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   primaryid      118577 non-null  int64  
 1   role_cod       118577 non-null  object 
 2   prod_ai        118577 non-null  object 
 3   indi_pt        118577 non-null  object 
 4   event_dt       118577 non-null  object 
 5   age            118577 non-null  float64
 6   age_grp        118577 non-null  object 
 7   gender         118577 non-null  object 
 8   occr_country   118577 non-null  object 
 9   pt             118577 non-null  object 
 10  outc_cod       118577 non-null  object 
 11  prod_ai_group  118577 non-null  object 
 12  drug_cate      118577 non-null  object 
 13  pt_name        118577 non-null  object 
 14  ae_cate        118087 non-null  object 
dtypes: float64(1), int64(1), object(13)
memory usage: 13.6+ MB
None


psychiatric disorders                                               27940
nervous system disorders                                            21110
general disorders and administration site conditions                13463
gastrointestinal disorders                                          10041
investigations                                                       6283
skin and subcutaneous tissue disorders                               4221
injury poisoning and procedural complications                        3840
cardiac disorders                                                    3639
musculoskeletal and connective tissue disorders                      3562
metabolism and nutrition disorders                                   3447
respiratory thoracic and mediastinal disorders                       3261
eye disorders                                                        2527
vascular disorders                                                   2350
hepatobiliary disorders               

In [17]:
# Count how often each unmatched PT occurs; these terms are then mapped by hand
unmatched_pts = unmatched_rows['pt'].value_counts()
print(unmatched_pts)

# Disabled export of the counts, used as the template for the manual mapping file:
# unmatched_pts_df = unmatched_pts.reset_index()
# unmatched_pts_df.columns = ['pt', 'count']
# unmatched_pts_df.to_csv('unmatched_pts_manual_map.csv', index=False) # => export file for manually mapping

drug effect incomplete                               57
gastric disorder                                     44
menorrhagia                                          41
congestive cardiomyopathy                            37
drug effect decreased                                33
rash generalised                                     33
pruritus generalised                                 25
temporomandibular joint syndrome                     24
metrorrhagia                                         18
immune thrombocytopenic purpura                      15
nonalcoholic steatohepatitis                         13
thrombophlebitis superficial                         13
pupils unequal                                       12
electrocardiogram pq interval prolonged              10
attention deficithyperactivity disorder               8
fluid overload                                        7
medication residue present                            6
mucous membrane disorder                        

In [18]:
# Load the PT -> ae_cate assignments for the unmatched terms
# (presumably filled in by hand from the exported counts — TODO confirm)
unmatched_pts_map = pd.read_excel('unmatched_pts_manual_map.xlsx')

unmatched_pts_map.head()

Unnamed: 0,pt,ae_cate,count
0,drug effect incomplete,general disorders and administration site cond...,57
1,gastric disorder,gastrointestinal disorders,44
2,menorrhagia,reproductive system and breast disorders,41
3,congestive cardiomyopathy,cardiac disorders,37
4,drug effect decreased,general disorders and administration site cond...,33


In [19]:
# Ensure both columns are normalized for matching
cleaned_data['pt'] = cleaned_data['pt'].str.strip().str.lower()
unmatched_pts_map['pt'] = unmatched_pts_map['pt'].str.strip().str.lower()

# Create a mapping dictionary from unmatched_pts_map: PT -> manually assigned category
manual_map_dict = dict(zip(unmatched_pts_map['pt'], unmatched_pts_map['ae_cate']))

# Fill only the rows whose 'ae_cate' is still missing. fillna() leaves existing
# values untouched, and a PT that is absent from manual_map_dict maps to NaN,
# so it stays missing. This gives the same result as a row-by-row apply without
# the per-row Python overhead.
cleaned_data['ae_cate'] = cleaned_data['ae_cate'].fillna(cleaned_data['pt'].map(manual_map_dict))

# Number of rows still without a category (0 means every PT is mapped)
print(cleaned_data['ae_cate'].isna().sum())
print(cleaned_data.head())

0
   primaryid         role_cod        prod_ai             indi_pt    event_dt  \
0   74189024  primary suspect     sertraline  depressive symptom  2023-10-26   
1   74189024  primary suspect     sertraline  depressive symptom  2023-10-26   
2   74189024  primary suspect     sertraline  depressive symptom  2023-10-26   
3   74189024  primary suspect     sertraline  depressive symptom  2023-10-26   
4  109174139  primary suspect  oxcarbazepine          depression  2023-10-16   

    age age_grp  gender occr_country                   pt         outc_cod  \
0  29.0   adult  female     portugal    movement disorder  hospitalisation   
1  29.0   adult  female     portugal               tremor  hospitalisation   
2  29.0   adult  female     portugal  conversion disorder  hospitalisation   
3  29.0   adult  female     portugal          torticollis  hospitalisation   
4  47.0   adult  female        spain        adverse event  hospitalisation   

                             prod_ai_group      

In [20]:
# Re-check the rows per category after the manual-mapping fill of missing 'ae_cate' values
cleaned_data['ae_cate'].value_counts()

psychiatric disorders                                               27952
nervous system disorders                                            21113
general disorders and administration site conditions                13557
gastrointestinal disorders                                          10097
investigations                                                       6316
skin and subcutaneous tissue disorders                               4288
injury poisoning and procedural complications                        3842
cardiac disorders                                                    3678
musculoskeletal and connective tissue disorders                      3592
metabolism and nutrition disorders                                   3448
respiratory thoracic and mediastinal disorders                       3263
eye disorders                                                        2540
vascular disorders                                                   2363
hepatobiliary disorders               

In [21]:
# Categories that are folded into a single 'others' bucket
cate_others = [
    'surgical and medical procedures',
    'social circumstances',
    'endocrine disorders',
    'pregnancy puerperium and perinatal conditions',
    'neoplasms benign malignant and unspecified incl cysts and polyps',
    'immune system disorders',
    'congenital familial and genetic disorders',
    'product issues',
]

# Every listed category gets the same replacement label
ae_cate_dic = dict.fromkeys(cate_others, 'others')

# Relabel the listed categories; all other values of 'ae_cate' are left as they are
cleaned_data['ae_cate'] = cleaned_data['ae_cate'].replace(ae_cate_dic)

# Show the counts per category after regrouping
print(cleaned_data['ae_cate'].value_counts())

psychiatric disorders                                   27952
nervous system disorders                                21113
general disorders and administration site conditions    13557
gastrointestinal disorders                              10097
investigations                                           6316
skin and subcutaneous tissue disorders                   4288
injury poisoning and procedural complications            3842
others                                                   3731
cardiac disorders                                        3678
musculoskeletal and connective tissue disorders          3592
metabolism and nutrition disorders                       3448
respiratory thoracic and mediastinal disorders           3263
eye disorders                                            2540
vascular disorders                                       2363
hepatobiliary disorders                                  1662
reproductive system and breast disorders                 1635
renal an

In [22]:
# 'pt' was matched exactly against MedDRA 'pt_name', so 'pt_name' only duplicates 'pt' — remove it
cleaned_data = cleaned_data.drop('pt_name', axis=1)

print(cleaned_data.info())
# Transposed preview so every column of the first 20 rows is visible
print(cleaned_data.head(20).T)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118577 entries, 0 to 118576
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   primaryid      118577 non-null  int64  
 1   role_cod       118577 non-null  object 
 2   prod_ai        118577 non-null  object 
 3   indi_pt        118577 non-null  object 
 4   event_dt       118577 non-null  object 
 5   age            118577 non-null  float64
 6   age_grp        118577 non-null  object 
 7   gender         118577 non-null  object 
 8   occr_country   118577 non-null  object 
 9   pt             118577 non-null  object 
 10  outc_cod       118577 non-null  object 
 11  prod_ai_group  118577 non-null  object 
 12  drug_cate      118577 non-null  object 
 13  ae_cate        118577 non-null  object 
dtypes: float64(1), int64(1), object(12)
memory usage: 12.7+ MB
None
                                                    0   \
primaryid                                 

In [24]:
# # # save the cleaned data for AEs mapping & further analysis
# step3_data = cleaned_data.copy()
# step3_data.to_csv('FAERS_mapped_data.csv', index=False)

> Refer to `fda_step4_eda` for the next part of the analysis.