In [None]:
import pandas as pd
import re
import string

# Extract Impression attribute
def extract_impression(text):
    """Return everything that follows the first "impression:" label.

    The label is matched case-insensitively. Because DOTALL is set, the
    capture runs to the end of the string (across line breaks); the result is
    stripped of leading/trailing whitespace.

    Returns None when ``text`` is missing (``pd.isna``) or when the label is
    not present.
    """
    if pd.isna(text):
        return None
    found = re.search(r'impression:\s*(.*)', text, flags=re.IGNORECASE | re.DOTALL)
    if found is None:
        return None
    return found.group(1).strip()

# Remove punctuations and digits
def clean_text(text):
    """Lowercase ``text`` and replace each digit or punctuation mark with a space.

    Every removed character is replaced by exactly one space, so the length of
    the string is preserved and words never get glued together.

    Returns None when ``text`` is missing (``pd.isna``).
    """
    if pd.isna(text):
        return None
    # string.punctuation contains ']', '\\', '^' and '-', which are special
    # inside a character class. Unescaped, its backslash escapes the following
    # ']' and is itself never matched, so backslashes survived cleaning.
    # re.escape makes every punctuation character literal.
    pattern = r'[\d' + re.escape(string.punctuation) + r']'
    return re.sub(pattern, ' ', text.lower())

# Read the procedure reports; the path literal below is a single blank space
# and has to be replaced with the real CSV location before running.
df = pd.read_csv(" ", sep=',')

# Pull the impression section out of each report, then normalise it
impression_raw = df['procedure_report'].apply(extract_impression)
df = df.assign(
    impression_extracted=impression_raw,
    impression_cleaned=impression_raw.apply(clean_text),
)

# Reports without an impression section yield None above; discard them
df = df.dropna(subset=['impression_cleaned'])

# Quick look at the first few extracted / cleaned pairs
print(df[['impression_extracted', 'impression_cleaned']].head())

In [None]:
#Perform Negation Detection
import scispacy
import spacy
import pandas as pd
from negspacy.negation import Negex
from negspacy.termsets import termset

# Load the SciSpaCy model used for entity detection
nlp = spacy.load("en_core_sci_sm")

# Start from negspacy's "en_clinical" termset and add two extra cues
extra_negation_patterns = {
    "preceding_negations": ["unable"],
    "following_negations": ["was negative"],
}
ts = termset("en_clinical")
ts.add_patterns(extra_negation_patterns)

# Add the negex component only when the pipeline does not contain it yet
negex_present = "negex" in nlp.pipe_names
if not negex_present:
    nlp.add_pipe("negex", config={"neg_termset": ts.get_patterns()})

# Apply negation-aware processing
def process_impression(text):
    """Turn an impression into a space-separated string of entity tokens.

    The text is lowercased and passed through the module-level ``nlp``
    pipeline. Each entity in ``doc.ents`` becomes one token: its words are
    joined with underscores, and entities flagged by negex (``ent._.negex``)
    are prefixed with ``no_``.

    Returns None when ``text`` is missing (``pd.isna``) or when the pipeline
    finds no entities.
    """
    if pd.isna(text):
        return None
    doc = nlp(text.lower())

    phrases = []
    for ent in doc.ents:
        # Join on ANY whitespace run, not just single spaces: double spaces
        # (left where punctuation/digits were blanked out), tabs or newlines
        # inside an entity would otherwise produce "__" variants of the same
        # term or let a later .split() break the entity apart.
        ent_text = "_".join(ent.text.split())
        if not ent_text:
            continue
        phrases.append(f"no_{ent_text}" if ent._.negex else ent_text)
    return " ".join(phrases) if phrases else None

# Build the negation-aware entity string for every cleaned impression
negex_tokens = df["impression_cleaned"].apply(process_impression)
df["impression_negex"] = negex_tokens

# Show the cleaned text beside its negation-tagged form
print(df[["impression_cleaned", "impression_negex"]])

In [None]:
#Remove stopwords and generic words
import pandas as pd
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stopword corpus is available
nltk.download('stopwords')

# Generic report vocabulary to drop in addition to NLTK's English stopwords
generic_report_words = [
    'personalname', 'alphanumericid', 'examination', 'preliminary', 'impression',
    'chest', 'view', 'change', 'final', 'available', 'evidence', 'see', 'right',
    'stable', 'date', 'compare', 'interpretation', 'portable', 'lung', 'leave',
    'report', 'process', 'review', 'resident', 'radiologist', 'attend',
    'finding', 'findings', 'electronic', 'and', 'sign',
]
stop_words = set(stopwords.words('english')).union(generic_report_words)

# Function to remove stopwords
def remove_stopwords_custom(text):
    """Drop every token of ``text`` that appears in the module-level ``stop_words``.

    The text is lowercased and split on whitespace; the surviving tokens are
    re-joined with single spaces. Missing input (``pd.isna``) yields an empty
    string rather than None.
    """
    if pd.isna(text):
        return ''
    kept = []
    for token in text.lower().split():
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

# Overwrite 'impression_cleaned' with the stopword-filtered negex tokens
df['impression_cleaned'] = df['impression_negex'].apply(remove_stopwords_custom)

# Keep only rows that still have content, then renumber the index from 0
has_content = df['impression_cleaned'].str.strip() != ''
df = df[has_content].reset_index(drop=True)

# Show the negex tokens beside the final cleaned text
print(df[['impression_negex', 'impression_cleaned']])

In [None]:
import gensim
from gensim import corpora, models

# Split each cleaned impression into its whitespace-separated tokens
tokenized_data = [text.split() for text in df['impression_cleaned']]

# Discard documents that ended up with no tokens
filtered_list = list(filter(None, tokenized_data))

# Map every distinct token to an integer id
dictionary = corpora.Dictionary(filtered_list)

# Bag-of-words: one list of (token_id, count) pairs per document
bow_corpus = list(map(dictionary.doc2bow, filtered_list))

# Fit TF-IDF weights on the BoW corpus ...
tfidf_model = models.TfidfModel(bow_corpus)

# ... and wrap the corpus so documents come out TF-IDF weighted
corpus_tfidf = tfidf_model[bow_corpus]

In [None]:
# Find optimal no. of topics using Coherence score
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1):
    """Train one LDA model per topic count and score each with c_v coherence.

    Parameters
    ----------
    dictionary : id-to-word mapping given to both the LDA and coherence models.
    corpus : corpus the LDA models are trained on.
    texts : tokenised documents the coherence model evaluates against.
    limit : exclusive upper bound of the topic-count range.
    start : first topic count to try (default 2).
    step : increment between topic counts (default 1).

    Returns
    -------
    (model_list, coherence_values) : two lists in the order of
    ``range(start, limit, step)``.
    """
    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        # Train and score on the arguments that were passed in; the previous
        # version silently read the notebook globals corpus_tfidf and
        # filtered_list, so `corpus` and `texts` had no effect.
        model = gensim.models.LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, random_state=0)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values
# Topic counts to try: range(start, limit, step). Defined before the sweep so
# the call and the x-axis are guaranteed to use the same bounds.
start = 2; limit = 40; step = 1
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus_tfidf, texts=filtered_list, start=start, limit=limit, step=step)
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The labels argument must be a sequence of strings. ("coherence_values") is
# just a parenthesised string, which legend() iterates character by character,
# so the line ended up labelled "c".
plt.legend(["coherence_values"], loc='best')
plt.show()

In [None]:
# One line per candidate topic count with its coherence rounded to 4 places
for num_topics, coherence in zip(x, coherence_values):
    print("Num Topics =", num_topics, " has Coherence Value of", round(coherence, 4))

In [None]:
# Train the final model with the chosen number of topics (5 here, picked from
# the coherence sweep) and list the top words of every topic.
lda_model = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=5,
    id2word=dictionary,
    random_state=0,
)
for topic_id, topic_terms in lda_model.print_topics(-1):
    print('Topic: {} Word: {}'.format(topic_id, topic_terms))

In [None]:
# Find dominant topic in each sentence of the text
def format_topics_sentences(ldamodel=lda_model, corpus=bow_corpus, texts=filtered_list):
    """Build a table with the dominant topic of every document.

    For each document in ``corpus`` the topic with the highest weight in
    ``ldamodel[corpus]`` is selected, together with that weight (rounded to
    4 places) and the topic's keywords from ``ldamodel.show_topic``.

    Note that the default arguments are bound when this ``def`` is executed,
    so they refer to the objects that existed at that moment.

    Returns
    -------
    DataFrame with columns 'Dominant_Topic', 'Perc_Contribution',
    'Topic_Keywords' and 'Text'. A document for which the model returns no
    topics gets NaN/None in the three topic columns.
    """
    sent_topics_list = []

    for row in ldamodel[corpus]:
        if row:
            # Highest-weighted topic first
            ranked = sorted(row, key=lambda pair: pair[1], reverse=True)
            topic_num, prop_topic = ranked[0]
            wp = ldamodel.show_topic(topic_num)
            topic_keywords = ", ".join([word for word, prop in wp])
            sent_topics_list.append([int(topic_num), round(prop_topic, 4), topic_keywords])
        else:
            # Keep a placeholder row. Skipping the document here would shift
            # every following topic row up by one, and the positional concat
            # below would then pair topics with the wrong texts.
            sent_topics_list.append([float("nan"), float("nan"), None])

    sent_topics_df = pd.DataFrame(sent_topics_list, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Attach the original tokens; rows line up by position with the corpus
    sent_topics_df = pd.concat([sent_topics_df, pd.Series(texts, name='Text')], axis=1)

    return sent_topics_df

# Dominant topic for every document.
# NOTE(review): lda_model is trained on corpus_tfidf, but topics are inferred
# from bow_corpus here — confirm the mismatch is intended.
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda_model,
    corpus=bow_corpus,
    texts=filtered_list,
)

# Promote the positional index to an explicit document-number column
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Text',
]
df_dominant_topic.head(10)

In [None]:
# How many documents have each topic as their dominant topic
topic_counts = df_dominant_topic['Dominant_Topic'].value_counts()

# The same counts expressed as a share of all documents, in percent
topic_percentages = topic_counts / len(df_dominant_topic) * 100

# One row per topic, largest share first
topic_distribution = pd.DataFrame({
    'Topic': topic_counts.index,
    'Num_Documents': topic_counts.values,
    'Percentage': topic_percentages.values.round(2)
}).sort_values(by='Percentage', ascending=False)

print(topic_distribution)

In [None]:
# Linguistic analysis using chi-square analysis
import numpy as np
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from gensim import corpora, models

num_docs = len(corpus_tfidf)
num_terms = len(dictionary)

# Densify the sparse TF-IDF corpus into a (documents x terms) array
dense_matrix = np.zeros((num_docs, num_terms))
for doc_idx, sparse_doc in enumerate(corpus_tfidf):
    for term_id, tfidf_value in sparse_doc:
        dense_matrix[doc_idx, term_id] = tfidf_value

# Use each document's dominant topic as the class label for feature selection
labels = df_dominant_topic['Dominant_Topic'].values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)

# Keep the 20 terms with the highest chi-square score against the topic label
chi2_selector = SelectKBest(score_func=chi2, k=20)
X_chi2 = chi2_selector.fit_transform(dense_matrix, y_encoded)

# Translate the selected column indices back into dictionary terms
selected_term_ids = chi2_selector.get_support(indices=True)
top_features = [dictionary[term_id] for term_id in selected_term_ids]

print("Top 20 chi-square selected features:")
print(top_features)

# Presence/absence of every selected term in every document
binary_matrix = (dense_matrix[:, selected_term_ids] > 0).astype(int)

# Per selected term: number of documents containing it, split by encoded topic
num_encoded_topics = len(label_encoder.classes_)
topic_feature_counts = {}
for col, feature in enumerate(top_features):
    docs_with_feature = binary_matrix[:, col] == 1
    topic_feature_counts[feature] = np.bincount(
        y_encoded[docs_with_feature], minlength=num_encoded_topics
    ).tolist()

# Add the per-term counts together to get one total per topic
topic_total_counts = np.zeros(num_encoded_topics)
for per_topic_counts in topic_feature_counts.values():
    topic_total_counts += per_topic_counts

# Topic with the largest total, mapped back to its original topic id
best_topic_index = int(np.argmax(topic_total_counts))
best_topic = label_encoder.inverse_transform([best_topic_index])[0]

print(f"\nTopic most associated with comorbidity-related features: {best_topic}")
print(f"Total relevant feature occurrences in that topic: {int(topic_total_counts[best_topic_index])}")

In [None]:
# Chi-square score of every selected term
chi2_scores = chi2_selector.scores_[selected_term_ids]

# Row labels of the form "term (χ²(dof) = score)", dof = number of classes - 1
dof = len(label_encoder.classes_) - 1
formatted_features = [
    f"{dictionary[term_id]} (χ²({dof}) = {score:.1f})"
    for term_id, score in zip(selected_term_ids, chi2_scores)
]

# Name the topics after the ORIGINAL topic ids held in label_encoder.classes_.
# Position k in each count list belongs to classes_[k]; numbering with
# range(len(classes_)) mislabels topics whenever some topic id never occurs
# as a dominant topic.
topic_labels = [f'Topic_{int(topic)}' for topic in label_encoder.classes_]
feature_topic_df = pd.DataFrame(topic_feature_counts, index=topic_labels).T

# Rename index to include chi-square values
feature_topic_df.index = formatted_features

# Reorder the columns by topic number. Assigning a sorted list to `.columns`
# would only relabel the columns without moving their data, so select the
# columns in sorted order instead.
ordered_columns = sorted(feature_topic_df.columns, key=lambda x: int(x.split("_")[1]))
feature_topic_df = feature_topic_df[ordered_columns]

# Display
print("Number of clinical notes containing each top feature per topic:\n")
print(feature_topic_df)
