In [21]:
import pandas as pd
import numpy as np

In [22]:
df = pd.read_csv("/content/sample_data/sample_train_eval.csv")
df.head()

Unnamed: 0,primaryid,age,fda_dt_parsed,is_severe_outcome,drug_count,indication_count,all_reaction_pts,reaction_count,is_ineffective,is_failure,...,occp_cod_LW,occp_cod_MD,occp_cod_OT,occp_cod_PH,occp_cod_UNK,reporter_country_COUNTRY NOT SPECIFIED,reporter_country_GB,reporter_country_JP,reporter_country_OTHER,reporter_country_US
0,119705751,39.0,2016-01-28,0,2,2,fatigue feeling abnormal migraine nasopharyngi...,8,0,0,...,False,False,False,False,False,True,False,False,False,False
1,116426401,73.0,2015-10-20,0,4,2,choking foreign body product coating issue pro...,10,0,0,...,False,False,False,False,False,False,False,False,False,True
2,132217582,35.0,2017-02-11,1,10,8,anaemia multiple sclerosis relapse urinary tra...,6,0,1,...,False,False,False,False,False,False,False,False,False,True
3,108811261,27.0,2015-02-25,1,8,2,acute psychosis amnesia cardiac failure acute ...,28,0,1,...,False,False,False,True,False,False,False,False,False,True
4,121040361,60.0,2016-02-23,1,2,2,gastrointestinal haemorrhage gastrointestinal ...,2,0,1,...,False,False,False,True,False,False,False,False,True,False


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 701665 entries, 0 to 701664
Data columns (total 24 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   primaryid                               701665 non-null  int64  
 1   age                                     701665 non-null  float64
 2   fda_dt_parsed                           701665 non-null  object 
 3   is_severe_outcome                       701665 non-null  int64  
 4   drug_count                              701665 non-null  int64  
 5   indication_count                        701665 non-null  int64  
 6   all_reaction_pts                        701665 non-null  object 
 7   reaction_count                          701665 non-null  int64  
 8   is_ineffective                          701665 non-null  int64  
 9   is_failure                              701665 non-null  int64  
 10  rept_cod_EXP                            7016

In [24]:
df.isna().sum()

Unnamed: 0,0
primaryid,0
age,0
fda_dt_parsed,0
is_severe_outcome,0
drug_count,0
indication_count,0
all_reaction_pts,0
reaction_count,0
is_ineffective,0
is_failure,0


### Importing the required Modules

In [25]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity

### NLP-Based Cluster Naming

In [26]:
def assign_cluster_name_centroid(top_terms, vectorizer):

    phenotype_docs = {
        "Cardiac": "cardiac heart myocardial infarction chest pain arrhythmia cardiogenic shock heart failure",
        "Respiratory": "dyspnoea respiratory pneumonia cough asthma bronchospasm hypoxia respiratory failure",
        "Infection": "infection sepsis fever pneumonia abscess viral bacterial septicemia influenza",
        "Hepatic": "hepatic liver jaundice bilirubin hepatotoxicity cirrhosis hepatic failure",
        "Gastrointestinal": "nausea vomiting diarrhoea abdominal pain gastritis stomach gi bleed",
        "Neurological": "seizure headache dizziness convulsion neuropathy stroke tremor neuro",
        "Immune/Hypersensitivity": "anaphylactic hypersensitivity allergic rash urticaria swelling angioedema",
        "Renal": "renal kidney nephropathy acute kidney injury renal failure dialysis",
        "Inefficacy": "drug ineffective condition worsened treatment failure no therapeutic response"
    }

    phenotype_vectors = {
        name: vectorizer.transform([doc]).toarray()
        for name, doc in phenotype_docs.items()
    }

    cluster_names = {}

    for cid, terms in top_terms.items():
        cluster_doc = " ".join(terms)
        cluster_vec = vectorizer.transform([cluster_doc]).toarray()

        scores = {
            name: cosine_similarity(cluster_vec, vec)[0][0]
            for name, vec in phenotype_vectors.items()
        }

        best_name = max(scores, key=scores.get)
        cluster_names[cid] = best_name

    return cluster_names


### Filtering Failure Reports, TF-IDF, SVD

In [27]:
# 1) Filtering Failure Reports
df_fail = df[df["is_failure"] == 1].reset_index(drop=True)
print("Total failure reports:", df_fail.shape[0])

# 2) TF-IDF
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words="english",
    lowercase=True,
    token_pattern=r"[A-Za-z]+"
)

X_tfidf = tfidf.fit_transform(df_fail["all_reaction_pts"].astype(str))
print("TF-IDF Shape:", X_tfidf.shape)

# 3) SVD
svd = TruncatedSVD(n_components=50, random_state=42)
X_svd = svd.fit_transform(X_tfidf)
print("SVD Shape:", X_svd.shape)

Total failure reports: 249048
TF-IDF Shape: (249048, 5000)
SVD Shape: (249048, 50)


### Silhoutte Scores

In [28]:
k_values = [3, 4, 5]
silhouette_results = {}

print("\nCalculating Silhouette Scores...\n")

for k in k_values:
    km = KMeans(n_clusters=k, random_state=42, n_init=20)
    labels = km.fit_predict(X_svd)
    sil = silhouette_score(X_svd, labels)
    silhouette_results[k] = sil
    print(f"k = {k} → Silhouette Score = {sil:.4f}")

best_k = max(silhouette_results, key=silhouette_results.get)

print(f"\nBest k = {best_k} (Highest Silhouette Score)")


Calculating Silhouette Scores...

k = 3 → Silhouette Score = 0.3796
k = 4 → Silhouette Score = 0.2805
k = 5 → Silhouette Score = 0.3581

Best k = 3 (Highest Silhouette Score)


### Final Kmeans

In [29]:
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=20)
clusters = kmeans.fit_predict(X_svd)

df_fail["Failure_Phenotype"] = clusters

### Top Terms per Cluster

In [30]:
def get_top_terms(model, vectorizer, n=15):
    terms = np.array(vectorizer.get_feature_names_out())
    centroids = model.cluster_centers_.argsort()[:, ::-1]
    return {
        i: terms[centroids[i, :n]].tolist()
        for i in range(model.n_clusters)
    }

kmeans_tfidf = KMeans(n_clusters=best_k, random_state=42, n_init=20)
kmeans_tfidf.fit(X_tfidf)

top_terms = get_top_terms(kmeans_tfidf, tfidf)

### Assigning Phenotype Names (Centroid Similarity)

In [31]:
cluster_names = assign_cluster_name_centroid(top_terms, tfidf)
df_fail["Failure_Phenotype_Name"] = df_fail["Failure_Phenotype"].map(cluster_names)

### Merging Back

In [32]:
df_final = df.merge(
    df_fail[["primaryid", "Failure_Phenotype", "Failure_Phenotype_Name"]],
    on="primaryid",
    how="left"
)

### Finally Exporting to CSV

In [33]:
df_final.to_csv("faers_failure_phenotypes.csv", index=False)

print("\n Saved: faers_failure_phenotypes.csv")



 Saved: faers_failure_phenotypes.csv


In [34]:
df_final.head()

Unnamed: 0,primaryid,age,fda_dt_parsed,is_severe_outcome,drug_count,indication_count,all_reaction_pts,reaction_count,is_ineffective,is_failure,...,occp_cod_OT,occp_cod_PH,occp_cod_UNK,reporter_country_COUNTRY NOT SPECIFIED,reporter_country_GB,reporter_country_JP,reporter_country_OTHER,reporter_country_US,Failure_Phenotype,Failure_Phenotype_Name
0,119705751,39.0,2016-01-28,0,2,2,fatigue feeling abnormal migraine nasopharyngi...,8,0,0,...,False,False,False,True,False,False,False,False,,
1,116426401,73.0,2015-10-20,0,4,2,choking foreign body product coating issue pro...,10,0,0,...,False,False,False,False,False,False,False,True,,
2,132217582,35.0,2017-02-11,1,10,8,anaemia multiple sclerosis relapse urinary tra...,6,0,1,...,False,False,False,False,False,False,False,True,1.0,Neurological
3,108811261,27.0,2015-02-25,1,8,2,acute psychosis amnesia cardiac failure acute ...,28,0,1,...,False,True,False,False,False,False,False,True,1.0,Neurological
4,121040361,60.0,2016-02-23,1,2,2,gastrointestinal haemorrhage gastrointestinal ...,2,0,1,...,False,True,False,False,False,False,True,False,1.0,Neurological


In [35]:
df_final.isna().sum()

Unnamed: 0,0
primaryid,0
age,0
fda_dt_parsed,0
is_severe_outcome,0
drug_count,0
indication_count,0
all_reaction_pts,0
reaction_count,0
is_ineffective,0
is_failure,0


In [36]:
df.shape

(701665, 24)

*   Total Rows: 701665
*   Non-failure Rows: 452617

*   Failure Rows: 701665 - 452617 = 249048

Here, Clustering is only done for failure rows. Non-failure rows do not get a phenotype because, they have no failure mode to classify therefore, they should not have a failure cluster label.

In [37]:
df_final[df_final["is_failure"] == 1]["Failure_Phenotype_Name"].value_counts()

Unnamed: 0_level_0,count
Failure_Phenotype_Name,Unnamed: 1_level_1
Neurological,209188
Respiratory,20240
Cardiac,19620
