# NLP — Bowel Preparation Classification

This notebook trains a model on ~1000 manually labeled colonoscopy reports (Turkish free-text),
then applies it to the full cohort (>11,000 reports) to classify bowel cleanliness as **Good / Intermediate / Poor** (emitted as the labels `iyi` / `orta` / `kötü`).

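The pipeline uses TF-IDF + Logistic Regression combined with simple rule-based overrides. For each report, the cleanliness statement is extracted starting at the "kolon temizli…" anchor and running up to the next period (or 100 characters if no period follows; if the anchor is absent, the first 100 characters of the report are used). If the model predicts 'orta', that prediction is kept; otherwise keyword rules are applied, and only to the extracted segment.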

In [ ]:
# === 1) Imports and Config ===
import os, unicodedata, joblib
import pandas as pd, numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support

# ====== PATHS (edit these for your system) ======
# Excel workbook with the manually labeled rows used to fit the model.
TRAIN_PATH   = r"examples/dummy_training.xlsx"   # ~1000 labeled training file
# Excel workbook whose reports are classified in the apply step.
# NOTE(review): this is the same dummy file as TRAIN_PATH — presumably a
# placeholder for the demo; confirm it is pointed at the real cohort before use.
PREDICT_PATH = r"examples/dummy_training.xlsx"   # full cohort
# Where the fitted pipeline is written with joblib.dump.
OUTPUT_MODEL = "colon_cleanliness_model.pkl"
# Where the cohort plus the prediction column is written as an Excel file.
OUTPUT_FULL  = "adr.v3.xlsx"

# Column names
# Training text column (Turkish, roughly "cleanliness statement").
TRAIN_TEXT_COL   = "temizlik ifadesi"
# Training label column (Turkish, roughly "cleanliness class good, intermediate, poor").
TRAIN_LABEL_COL  = "temizlik sinifi iyi, orta, kötü"
# Free-text report column in the prediction workbook (Turkish, roughly "FINDINGS").
FULL_TEXT_COL    = "BULGULAR"

In [ ]:
# === 2) Helper functions ===
def normalize(text):
    """Return an accent-folded, lowercased copy of *text*.

    The text is NFKD-decomposed, combining marks are dropped (so 'ğ' -> 'g',
    'ö' -> 'o', 'İ' -> 'i'), the result is lowercased, and the Turkish
    dotless 'ı' is mapped to 'i'.

    Parameters:
        text: The value to normalize. Anything that is not a ``str``
            (e.g. ``None`` or a float NaN) yields the empty string.

    Returns:
        The folded, lowercased string, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''
    decomposed = unicodedata.normalize('NFKD', text)
    stripped = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    # U+0131 (dotless i) has no Unicode decomposition, so the step above
    # leaves it untouched. Without this mapping "kısmen" and "KISMEN" would
    # normalize to different strings ("kısmen" vs "kismen").
    return stripped.lower().replace('\u0131', 'i')

def extract_segment(text):
    """Return the bowel-cleanliness statement from a report, normalized.

    The report is passed through ``normalize`` and searched for the anchor
    phrases below, tried in the listed order; the first phrase that occurs
    anywhere in the text determines the start position.

    Parameters:
        text: Raw report text (any value accepted by ``normalize``).

    Returns:
        * anchor found, period follows: the text from the anchor up to, but
          not including, that period;
        * anchor found, no period follows: up to 100 characters from the anchor;
        * no anchor found: the first 100 characters of the normalized text.
    """
    cleaned = normalize(text)
    anchor_phrases = (
        'kolon temizligi',
        'kolon temizlig',
        'kolon temizlik',
        'kolon temizli',
    )
    positions = (cleaned.find(phrase) for phrase in anchor_phrases)
    start = next((pos for pos in positions if pos != -1), -1)

    if start == -1:
        return cleaned[:100]

    stop = cleaned.find('.', start)
    if stop != -1:
        return cleaned[start:stop]
    return cleaned[start:start + 100]

def canon_label(x):
    """Map a raw label cell to a canonical class name.

    Parameters:
        x: The raw label value from the spreadsheet.

    Returns:
        ``np.nan`` when *x* is missing; otherwise ``'iyi'``, ``'orta'`` or
        ``'kötü'`` when the label contains that word (checked in this order,
        ignoring case and diacritics). A label that contains none of them is
        returned stripped and lowercased, unchanged otherwise.
    """
    if pd.isna(x):
        return np.nan
    s = str(x).strip().lower()
    # Match on an accent-folded copy rather than on `s`: str.lower() turns
    # 'İ' into 'i' + U+0307, so "İYİ" would never contain 'iyi', and a
    # decomposed (NFD) "kötü" would match neither 'kötü' nor 'kotu'.
    folded = ''.join(
        ch for ch in unicodedata.normalize('NFKD', s)
        if not unicodedata.combining(ch)
    ).lower().replace('\u0131', 'i')
    if 'iyi' in folded:
        return 'iyi'
    if 'orta' in folded:
        return 'orta'
    if 'kotu' in folded:
        return 'kötü'
    return s

def classify_with_rules(segment, model_pred):
    """Combine the model's prediction with keyword rules on the segment.

    Rules are evaluated in this order:

    1. A model prediction of 'orta' is returned as-is; no rule overrides it.
    2. Segment contains 'yeterli': 'kötü' if it also contains a negation
       keyword, otherwise 'iyi'.
    3. Segment contains a negation keyword: 'kötü'.
    4. Segment contains a partial-cleanliness keyword: 'orta'.
    5. Otherwise the model prediction is returned unchanged.

    Parameters:
        segment: Normalized text of the cleanliness statement.
        model_pred: Label predicted by the model for that segment.

    Returns:
        The final label.
    """
    if model_pred == 'orta':
        return 'orta'

    negation_keywords = ('degil', 'degildi', 'yetersiz')
    partial_keywords = ('subopt', 'kismen', 'kısmen', 'yer yer', 'yeryer')

    is_negated = any(word in segment for word in negation_keywords)

    if 'yeterli' in segment:
        return 'kötü' if is_negated else 'iyi'
    if is_negated:
        return 'kötü'
    if any(word in segment for word in partial_keywords):
        return 'orta'
    return model_pred

In [ ]:
# === 3) Train model ===
# Load the labeled workbook and keep only rows that have both a text and a label.
train_df = pd.read_excel(TRAIN_PATH).dropna(subset=[TRAIN_TEXT_COL, TRAIN_LABEL_COL])

# Features are the normalized statements; targets are the canonical labels.
X = train_df[TRAIN_TEXT_COL].astype(str).map(normalize)
y = train_df[TRAIN_LABEL_COL].map(canon_label)

# Word-level TF-IDF over 1- to 3-grams feeding a class-balanced logistic regression.
pipe = Pipeline(
    steps=[
        (
            "tfidf",
            TfidfVectorizer(analyzer="word", ngram_range=(1, 3), min_df=2, max_df=0.95),
        ),
        (
            "clf",
            LogisticRegression(
                C=4.0,
                class_weight="balanced",
                max_iter=3000,
                solver="lbfgs",
                random_state=42,
            ),
        ),
    ]
)

# Out-of-fold predictions from 5-fold cross-validation, used only for reporting.
yhat = cross_val_predict(pipe, X, y, method="predict", cv=5)
print("=== CV on TRAIN ===")
print("Accuracy:", accuracy_score(y, yhat))
print(classification_report(y, yhat, digits=2))

# Refit on every labeled row, then persist the fitted pipeline.
pipe.fit(X, y)
joblib.dump(pipe, OUTPUT_MODEL)
print("Model saved to:", OUTPUT_MODEL)

In [ ]:
# === 4) Apply to full cohort ===
# Reads the cohort workbook, extracts the cleanliness segment of every report,
# predicts a class with the fitted `pipe`, applies the keyword rules, and
# writes the cohort back out with one extra prediction column.
full_df = pd.read_excel(PREDICT_PATH)
report_text = full_df[FULL_TEXT_COL]

# Rows with no report text must not receive a class: they would otherwise be
# labeled from a placeholder/empty segment and silently inflate the counts.
has_text = report_text.notna()

segments = report_text.astype(str).map(extract_segment)
model_preds = pipe.predict(segments.values)
final_preds = [
    classify_with_rules(seg, mp) if present else np.nan
    for seg, mp, present in zip(segments, model_preds, has_text)
]

full_df["temizlik sinifi tahmin"] = final_preds
# dropna=False keeps the number of unclassified (missing-text) rows visible.
print(full_df["temizlik sinifi tahmin"].value_counts(dropna=False))

full_df.to_excel(OUTPUT_FULL, index=False)
print("Predictions saved to:", OUTPUT_FULL)