<a href="https://colab.research.google.com/github/agneeshrc/NON_BC-NLP/blob/main/NLP_for_NON_BC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install all necessary libraries, including spaCy
!pip install -q pandas openpyxl spacy scikit-learn

# Download the small English model for spaCy
!python -m spacy download en_core_web_sm

# Import all libraries
import io
import re

import pandas as pd
import spacy
from google.colab import files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

print("✅ Setup Complete: All libraries and the spaCy model are ready.")

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m99.4 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
✅ Setup Complete: All libraries and the spaCy model are ready.


In [None]:
# --- CONFIGURATION ---
# Column names expected in the uploaded workbook, and how many keywords to
# list for each class in the final report.
TEXT_COLUMN = 'Diagnosis_Cleaned'
LABEL_COLUMN = 'Cancer No'
TOP_KEYWORD_COUNT = 15


def spacy_preprocess_text(text):
    """Normalize one diagnosis string for TF-IDF vectorization.

    The text is lower-cased, every character other than a-z and whitespace is
    deleted, and the remainder is run through the global spaCy pipeline `nlp`.
    Stop-word tokens are dropped and the remaining tokens are replaced by
    their lemmas.

    Args:
        text: The raw cell value. Anything that is not a `str` is treated as
            having no text.

    Returns:
        The space-joined lemmas, or "" for non-string input.
    """
    if not isinstance(text, str):
        return ""
    # Characters are deleted (not replaced by a space), so an abbreviation such
    # as "b/l" collapses into the single token "bl" instead of being split.
    text = re.sub(r'[^a-z\s]', '', text.lower())

    # Process the text with spaCy
    doc = nlp(text)

    # Lemmatize and remove stop words in one step
    processed_tokens = [token.lemma_ for token in doc if not token.is_stop]
    return ' '.join(processed_tokens)


try:
    # --- LOAD SPACY MODEL ---
    # Load the English model downloaded in the setup cell
    nlp = spacy.load("en_core_web_sm")
    # Default spaCy stop words. Not used below (filtering relies on
    # `token.is_stop`); the name is kept so existing references keep working.
    stop_words = nlp.Defaults.stop_words

    # --- UPLOAD THE FILE ---
    print("Please upload your Excel file.")
    uploaded = files.upload()
    if not uploaded:
        # Without this guard a cancelled upload surfaces as a StopIteration
        # with an empty message.
        raise ValueError("No file was uploaded. Re-run this cell and choose an Excel file.")
    file_name = next(iter(uploaded))
    print(f"\n✅ Successfully uploaded '{file_name}'.")

    # --- READ AND PROCESS THE DATA ---
    df_patients = pd.read_excel(io.BytesIO(uploaded[file_name]))
    print("\nData loaded into DataFrame successfully.")

    missing_columns = [
        column for column in (TEXT_COLUMN, LABEL_COLUMN)
        if column not in df_patients.columns
    ]
    if missing_columns:
        raise ValueError(
            f"The uploaded file is missing required column(s): {missing_columns}. "
            f"Found columns: {list(df_patients.columns)}"
        )

    # Apply preprocessing
    print("\nPreprocessing the diagnosis text using spaCy...")
    # Rows lacking either the diagnosis text or the label cannot be used for
    # training. `df_patients` itself is left untouched.
    df_processed = df_patients.dropna(subset=[TEXT_COLUMN, LABEL_COLUMN]).copy()
    if df_processed.empty:
        raise ValueError(
            f"No rows have both '{TEXT_COLUMN}' and '{LABEL_COLUMN}' filled in."
        )
    df_processed['processed_diagnosis'] = df_processed[TEXT_COLUMN].apply(spacy_preprocess_text)
    print("Preprocessing complete.")

    # Vectorize text
    print("\nVectorizing text with TF-IDF...")
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=2000)
    X = vectorizer.fit_transform(df_processed['processed_diagnosis'])
    y = df_processed[LABEL_COLUMN]
    print("Vectorization complete.")

    if y.nunique() < 2:
        raise ValueError(
            f"'{LABEL_COLUMN}' contains only {y.nunique()} distinct class; "
            "at least 2 are needed to train a classifier."
        )

    # Train model and extract keywords
    print("\nTraining model and extracting keywords...")
    # One-vs-rest logistic regression. The `multi_class='ovr'` argument of
    # LogisticRegression is deprecated in recent scikit-learn releases, so the
    # one-vs-rest scheme is expressed with the OneVsRestClassifier wrapper.
    model = OneVsRestClassifier(
        LogisticRegression(random_state=42, solver='liblinear')
    )
    model.fit(X, y)

    feature_names = vectorizer.get_feature_names_out()
    class_labels = {1: 'Non-Cancer', 2: 'Cancer', 3: 'Needs Confirmation'}

    # One coefficient vector per class, in the order of `model.classes_`.
    class_coefficients = [estimator.coef_.ravel() for estimator in model.estimators_]
    if len(model.classes_) == 2 and len(class_coefficients) == 1:
        # With two classes only one binary model is fitted, describing
        # `classes_[1]`; its negation ranks the features for `classes_[0]`.
        positive_coef = class_coefficients[0]
        class_coefficients = [-positive_coef, positive_coef]

    print("\n--- TOP KEYWORDS PER CATEGORY ---")
    for class_val, class_coef in zip(model.classes_, class_coefficients):
        # Largest coefficients first: the features that push hardest towards
        # this class.
        top_coef_indices = class_coef.argsort()[-TOP_KEYWORD_COUNT:][::-1]
        top_words = [feature_names[i] for i in top_coef_indices]

        print(f"\n## Top keywords for '{class_labels.get(class_val, 'Unknown Category')}' (Class {class_val}):")
        print(", ".join(top_words))

except Exception as e:
    print(f"\nAn error occurred during processing: {e}")
    # Re-raise so the traceback is visible and "Run all" stops here instead of
    # continuing with missing or stale variables.
    raise

Please upload your Excel file.


Saving NON_BC_Cleaned(By Dr. Das).xlsx to NON_BC_Cleaned(By Dr. Das).xlsx

✅ Successfully uploaded 'NON_BC_Cleaned(By Dr. Das).xlsx'.

Data loaded into DataFrame successfully.

Preprocessing the diagnosis text using spaCy...
Preprocessing complete.

Vectorizing text with TF-IDF...
Vectorization complete.

Training model and extracting keywords...

--- TOP KEYWORDS PER CATEGORY ---

## Top keywords for 'Non-Cancer' (Class 1.0):
op, post op, pain, disease, bl, abdomen, stone, gall, stone disease, gall stone, fissure, ano, fissure ano, pain abdomen, symptomatic

## Top keywords for 'Cancer' (Class 2.0):
mass, mass evaluation, imp, rectum, gb mass, stomach, plan, ptn, ehbo, met, colon, gb, tongue, sigmoid, stage

## Top keywords for 'Needs Confirmation' (Class 3.0):
evaluation, axillary, leave axillary, pr evaluation, pr, bleed pr, lump evaluation, lump, bleed, neck swell, pelvic, pelvic mass, swell evaluation, neck, swell


