In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stopword list (NLTK reports "already up-to-date" when the
# package is present locally).
nltk.download("stopwords")

# === STEP 1: Load Data ===
# Columns are addressed by position throughout this notebook: column 1 holds
# the abstract text and column 0 is what later cells print as the title.
df = pd.read_excel("DATASET.xlsx")

# Remove rows without an abstract from `df` itself (not only from the derived
# list) and renumber the remaining rows. Later steps identify a document by
# its position in `abstracts`, so `df` must contain exactly one row per
# abstract, in the same order; otherwise column assignment fails on a length
# mismatch and positional look-ups return the wrong row.
df = df[df.iloc[:, 1].notna()].reset_index(drop=True)
abstracts = df.iloc[:, 1].astype(str).tolist()

# === STEP 2: Define Stopwords ===
# Corpus-specific words that are excluded in addition to NLTK's English list.
custom_stopwords = set((
    "jia", "juvenile", "idiopathic", "arthritis", "disease",
    "patients", "study", "studies", "children", "also",
    "treatment", "systemic", "il", "clinical", "sjia",
))

# Final filter set: NLTK English stopwords plus the custom terms above.
stop_words = set(stopwords.words("english")).union(custom_stopwords)

# === STEP 3: Preprocess Text ===
def preprocess(text):
    """Tokenise ``text`` and drop stopwords.

    The text is lower-cased, then every run of two or more ASCII letters that
    is delimited by word boundaries is taken as a token. Tokens found in the
    module-level ``stop_words`` set are discarded.

    Args:
        text: The raw string to tokenise.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    lowered = text.lower()
    kept = []
    for match in re.finditer(r'\b[a-zA-Z]{2,}\b', lowered):
        word = match.group(0)
        if word in stop_words:
            continue
        kept.append(word)
    return kept

# One token list per abstract, in the same order as `abstracts`.
processed_abstracts = list(map(preprocess, abstracts))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\britt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
from gensim import corpora
from gensim.models import LdaModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# === STEP 4: Dictionary and Corpus ===
# Build the token dictionary from the tokenised abstracts, then convert each
# abstract to its bag-of-words representation.
dictionary = corpora.Dictionary(processed_abstracts)
corpus = list(map(dictionary.doc2bow, processed_abstracts))

# === STEP 5: Train LDA Model ===
# random_state is fixed so that training starts from the same seed each run.
lda_model = LdaModel(
    corpus=corpus,
    num_topics=15,          # You can tune this later
    id2word=dictionary,
    random_state=42,
    passes=10,
)

# === STEP 6: Visualize Topics ===
# The display call must remain the last expression of the cell so that the
# notebook renders its return value.
pyLDAvis.enable_notebook()
lda_display = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(lda_display)


In [12]:
# === STEP 7: Assign LDA Topic to Each Abstract ===
# The column assignments below are positional, so the corpus must contain
# exactly one document per DataFrame row. Fail early with a clear message
# instead of a generic pandas length error.
if len(corpus) != len(df):
    raise ValueError(
        f"corpus has {len(corpus)} documents but df has {len(df)} rows; "
        "rows without an abstract must be removed from df before topics are assigned"
    )

# Build the keyword label (five highest-weighted words) once per topic rather
# than once per abstract; the label depends only on the topic id.
topic_keywords = {
    topic_id: ", ".join(word for word, _ in lda_model.show_topic(topic_id, topn=5))
    for topic_id in range(lda_model.num_topics)
}

abstract_topics = []

for row in corpus:
    # minimum_probability=0.0 asks gensim not to filter out low-weight topics,
    # so there is always at least one candidate and max() cannot see an empty
    # list. With the model's default threshold the result can be empty when
    # no single topic clears it. max() returns the first maximal pair, which
    # is the same pair a stable descending sort would put first.
    topic_id, prob = max(
        lda_model.get_document_topics(row, minimum_probability=0.0),
        key=lambda pair: pair[1],
    )
    abstract_topics.append((topic_id, prob, topic_keywords[topic_id]))

# Store the dominant topic, its probability and its keyword label per abstract.
df["LDA_Topic"] = [t[0] for t in abstract_topics]
df["LDA_Prob"] = [t[1] for t in abstract_topics]
df["LDA_Keywords"] = [t[2] for t in abstract_topics]

print("✅ Assigned dominant LDA topic to all abstracts.")


✅ Assigned dominant LDA topic to all abstracts.


In [13]:
# === STEP 8: Create LDA Topic Distribution Vectors ===
# One dense vector per abstract, with exactly `lda_model.num_topics` entries,
# where entry k is the probability of topic k for that abstract.
lda_topic_vectors = []

for doc_bow in corpus:
    topic_distribution = lda_model.get_document_topics(doc_bow, minimum_probability=0.0)
    # get_document_topics returns a sparse list of (topic_id, probability)
    # pairs and may leave out topics with a vanishingly small probability.
    # Writing each probability into its own slot of a zero-filled vector keeps
    # every vector the same length and keeps entry k tied to topic k.
    topic_vector = [0.0] * lda_model.num_topics
    for topic_id, prob in topic_distribution:
        topic_vector[topic_id] = prob
    lda_topic_vectors.append(topic_vector)

print("✅ Created LDA topic distribution vectors for all abstracts.")


✅ Created LDA topic distribution vectors for all abstracts.


In [14]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# === STEP 9: Compute Cosine Similarity Matrix ===
# Stack the per-abstract topic vectors into a 2-D array (one row per abstract)
# and compare every row with every other row.
lda_matrix = np.asarray(lda_topic_vectors)
similarity_matrix = cosine_similarity(lda_matrix)

def recommend_lda(doc_index, top_n=5):
    """Return the abstracts most similar to the one at ``doc_index``.

    Similarities are read from the module-level ``similarity_matrix``; the
    descriptive fields come from the module-level ``df``.

    Args:
        doc_index: Row position of the query abstract. Negative values count
            from the end, as in ordinary Python indexing.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of ``(index, similarity, first-column value, LDA_Keywords)``
        tuples ordered by descending similarity. The query abstract itself is
        never included.

    Raises:
        IndexError: If ``doc_index`` does not address a row of the matrix.
    """
    n_docs = similarity_matrix.shape[0]
    if not -n_docs <= doc_index < n_docs:
        raise IndexError(f"doc_index {doc_index} is out of range for {n_docs} documents")

    # Convert a negative position to its non-negative equivalent. argsort only
    # yields non-negative positions, so without this step a negative
    # doc_index would never match and the query would be recommended to
    # itself.
    query = doc_index % n_docs

    sims = similarity_matrix[query]
    ranked = sims.argsort()[::-1]  # most similar first
    top_indices = [i for i in ranked if i != query][:top_n]

    keyword_column = df["LDA_Keywords"]
    return [(i, sims[i], df.iloc[i, 0], keyword_column.iloc[i]) for i in top_indices]

# === TEST ===
# Print the five closest matches for the first abstract.
query_id = 0
results = recommend_lda(query_id, top_n=5)

for i, score, title, keywords in results:
    print(
        f"\n📄 {i}: Similarity = {score:.2f}",
        f"→ Title: {title}",
        f"→ Keywords: {keywords}",
        sep="\n",
    )



📄 2: Similarity = 0.90
→ Title: Barut K, Adrovic A, Şahin S, Kasapçopur Ö. Juvenile Idiopathic Arthritis. Balkan Med J. 2017 Apr 5;34(2):90-101. doi: 10.4274/balkanmedj.2017.0111. PMID: 28418334; PMCID: PMC5394305.
→ Keywords: inflammatory, chronic, years, joint, term

📄 184: Similarity = 0.77
→ Title: Quartier P, Prieur AM. Arthrites juvéniles idiopathiques. II. Traitement et pronostic [Juvenile idiopathic arthritis. II. Treatment and prognosis]. Rev Prat. 2007 Jun 30;57(12):1289-93. French. PMID: 17717939.
→ Keywords: uveitis, non, polyarticular, joint, anti

📄 99: Similarity = 0.77
→ Title: Thomson W, Donn R. Juvenile idiopathic arthritis genetics - what's new? What's next? Arthritis Res. 2002;4(5):302-6. doi: 10.1186/ar591. Epub 2002 Aug 5. PMID: 12223104; PMCID: PMC128941.
→ Keywords: uveitis, non, polyarticular, joint, anti

📄 162: Similarity = 0.77
→ Title: Neto A, Costa M, Branco JC, Mourão AF. Benign transient hyperphosphatasemia in Juvenile Idiopathic Arthritis: a case repor

In [15]:
# Print the topic vectors of the first five abstracts (fewer if the list is
# shorter) for a quick visual check.
for i in range(min(5, len(lda_topic_vectors))):
    vec = lda_topic_vectors[i]
    print(f"Doc {i}: {vec}")


Doc 0: [0.00077534316, 0.0007753435, 0.00077534403, 0.5374763, 0.0007753431, 0.0007753431, 0.0007753434, 0.45244426, 0.0007753433, 0.00077534263, 0.00077534263, 0.00077534356, 0.0007753441, 0.0007753428, 0.0007753451]
Doc 1: [0.000823221, 0.0008232213, 0.0008232222, 0.00082322175, 0.000823221, 0.00082322204, 0.00082322146, 0.11214664, 0.0008232221, 0.0008232203, 0.0008232205, 0.0008232213, 0.3129045, 0.56507015, 0.0008232211]
Doc 2: [0.00030180413, 0.12723285, 0.06973789, 0.28083852, 0.00030180402, 0.0003018041, 0.00030180402, 0.36888725, 0.00030180407, 0.00030180384, 0.00030180384, 0.027838582, 0.12274865, 0.0003018041, 0.0003018043]
Doc 3: [0.00077544706, 0.6519866, 0.00077544694, 0.00077544706, 0.0007754462, 0.00077544636, 0.00077544636, 0.00077544653, 0.00077544624, 0.00077544583, 0.000775446, 0.0007754462, 0.30201522, 0.036692817, 0.00077544636]
Doc 4: [0.0011497164, 0.645445, 0.33960873, 0.0011497171, 0.0011497149, 0.0011497166, 0.0011497155, 0.001149717, 0.0011497161, 0.00114971

In [16]:
# Print the five closest matches for the first abstract.
query_id = 0
results = recommend_lda(query_id, top_n=5)

for i, score, title, keywords in results:
    print(
        f"\n📄 {i}: Similarity = {score:.2f}",
        f"→ Title: {title}",
        f"→ Keywords: {keywords}",
        sep="\n",
    )



📄 2: Similarity = 0.90
→ Title: Barut K, Adrovic A, Şahin S, Kasapçopur Ö. Juvenile Idiopathic Arthritis. Balkan Med J. 2017 Apr 5;34(2):90-101. doi: 10.4274/balkanmedj.2017.0111. PMID: 28418334; PMCID: PMC5394305.
→ Keywords: inflammatory, chronic, years, joint, term

📄 184: Similarity = 0.77
→ Title: Quartier P, Prieur AM. Arthrites juvéniles idiopathiques. II. Traitement et pronostic [Juvenile idiopathic arthritis. II. Treatment and prognosis]. Rev Prat. 2007 Jun 30;57(12):1289-93. French. PMID: 17717939.
→ Keywords: uveitis, non, polyarticular, joint, anti

📄 99: Similarity = 0.77
→ Title: Thomson W, Donn R. Juvenile idiopathic arthritis genetics - what's new? What's next? Arthritis Res. 2002;4(5):302-6. doi: 10.1186/ar591. Epub 2002 Aug 5. PMID: 12223104; PMCID: PMC128941.
→ Keywords: uveitis, non, polyarticular, joint, anti

📄 162: Similarity = 0.77
→ Title: Neto A, Costa M, Branco JC, Mourão AF. Benign transient hyperphosphatasemia in Juvenile Idiopathic Arthritis: a case repor

In [17]:
# Rebuild the dense topic vectors: one per abstract, with exactly
# `lda_model.num_topics` entries, where entry k is the probability of topic k.
lda_topic_vectors = []

for doc_bow in corpus:
    topic_distribution = lda_model.get_document_topics(doc_bow, minimum_probability=0.0)
    # get_document_topics returns a sparse list of (topic_id, probability)
    # pairs and may leave out topics with a vanishingly small probability.
    # Writing each probability into its own slot of a zero-filled vector keeps
    # every vector the same length and keeps entry k tied to topic k.
    topic_vector = [0.0] * lda_model.num_topics
    for topic_id, prob in topic_distribution:
        topic_vector[topic_id] = prob
    lda_topic_vectors.append(topic_vector)

print("✅ Rebuilt LDA topic vectors.")


✅ Rebuilt LDA topic vectors.


In [18]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Stack the per-abstract topic vectors into a 2-D array (one row per abstract)
# and compare every row with every other row.
lda_matrix = np.asarray(lda_topic_vectors)
similarity_matrix = cosine_similarity(lda_matrix)

def recommend_lda(doc_index, top_n=5):
    """Return the abstracts most similar to the one at ``doc_index``.

    NOTE(review): this notebook defines ``recommend_lda`` in an earlier cell
    as well; this later definition is the one in effect once the cell runs.

    Similarities are read from the module-level ``similarity_matrix``; the
    descriptive fields come from the module-level ``df``.

    Args:
        doc_index: Row position of the query abstract. Negative values count
            from the end, as in ordinary Python indexing.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of ``(index, similarity, first-column value, LDA_Keywords)``
        tuples ordered by descending similarity. The query abstract itself is
        never included.

    Raises:
        IndexError: If ``doc_index`` does not address a row of the matrix.
    """
    n_docs = similarity_matrix.shape[0]
    if not -n_docs <= doc_index < n_docs:
        raise IndexError(f"doc_index {doc_index} is out of range for {n_docs} documents")

    # Convert a negative position to its non-negative equivalent. argsort only
    # yields non-negative positions, so without this step a negative
    # doc_index would never match and the query would be recommended to
    # itself.
    query = doc_index % n_docs

    sims = similarity_matrix[query]
    ranked = sims.argsort()[::-1]  # most similar first
    top_indices = [i for i in ranked if i != query][:top_n]

    keyword_column = df["LDA_Keywords"]
    return [(i, sims[i], df.iloc[i, 0], keyword_column.iloc[i]) for i in top_indices]


In [19]:
# Print the five closest matches for the first abstract.
query_id = 0
results = recommend_lda(query_id, top_n=5)

for i, score, title, keywords in results:
    print(
        f"\n📄 {i}: Similarity = {score:.2f}",
        f"→ Title: {title}",
        f"→ Keywords: {keywords}",
        sep="\n",
    )



📄 2: Similarity = 0.90
→ Title: Barut K, Adrovic A, Şahin S, Kasapçopur Ö. Juvenile Idiopathic Arthritis. Balkan Med J. 2017 Apr 5;34(2):90-101. doi: 10.4274/balkanmedj.2017.0111. PMID: 28418334; PMCID: PMC5394305.
→ Keywords: inflammatory, chronic, years, joint, term

📄 184: Similarity = 0.77
→ Title: Quartier P, Prieur AM. Arthrites juvéniles idiopathiques. II. Traitement et pronostic [Juvenile idiopathic arthritis. II. Treatment and prognosis]. Rev Prat. 2007 Jun 30;57(12):1289-93. French. PMID: 17717939.
→ Keywords: uveitis, non, polyarticular, joint, anti

📄 99: Similarity = 0.77
→ Title: Thomson W, Donn R. Juvenile idiopathic arthritis genetics - what's new? What's next? Arthritis Res. 2002;4(5):302-6. doi: 10.1186/ar591. Epub 2002 Aug 5. PMID: 12223104; PMCID: PMC128941.
→ Keywords: uveitis, non, polyarticular, joint, anti

📄 162: Similarity = 0.77
→ Title: Neto A, Costa M, Branco JC, Mourão AF. Benign transient hyperphosphatasemia in Juvenile Idiopathic Arthritis: a case repor