In [None]:
!pip install torch torchvision -q

In [None]:
# ===============================
# 📦 Imports
# ===============================
import pandas as pd
import numpy as np
import nltk
import re
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW # Import AdamW from torch.optim
import joblib
from tqdm import tqdm
from sklearn.utils import resample # Import resample

# ===============================
# 📥 Load Dataset
# ===============================
# nltk.word_tokenize (used for body_length below) needs the 'punkt' models;
# newer NLTK releases look for 'punkt_tab', so both resources are fetched.
nltk.download('punkt')
nltk.download('punkt_tab')
df = pd.read_csv("/content/combined_data.csv", engine='python', on_bad_lines='skip')

# Auto-detect text & label columns
text_col = next((c for c in ["text","message","content","body"] if c in df.columns), None)
label_col = next((c for c in ["label","target","spam"] if c in df.columns), None)
if not text_col or not label_col:
    raise ValueError("Dataset must contain text and label columns.")

# .copy() gives an independent frame so the column assignments below do not
# write into a view of the raw CSV frame (SettingWithCopyWarning).
df = df[[text_col,label_col]].dropna().copy()
# Cast to str before using .str: the accessor raises on non-object columns and
# silently yields NaN for individual non-string cells.
df[text_col] = df[text_col].astype(str)
df = df[df[text_col].str.strip() != ""].copy()

# Convert labels to 0/1.
# "1.0"/"0.0" are included because a numeric label column that contained any
# missing value is parsed as float64, and astype(str) then renders 1 as "1.0".
label_map = {"spam":1,"1":1,"1.0":1,"ham":0,"0":0,"0.0":0}
df[label_col] = df[label_col].astype(str).str.strip().str.lower().map(label_map)
# Rows whose label is not in label_map become NaN and are discarded here.
df = df.dropna(subset=[label_col]).copy()
df[label_col] = df[label_col].astype(int)

# Limit dataset for RAM efficiency
if len(df) > 5000:
    df = df.sample(5000, random_state=42)

# Token count of each message; used as a hand-crafted feature by the LR model.
df['body_length'] = df[text_col].apply(lambda x: len(nltk.word_tokenize(str(x))))

# Resample class 1 (spam) with replacement up to the size of class 0 (ham)
df_majority = df[df[label_col] == 0]
df_minority = df[df[label_col] == 1]
if df_majority.empty or df_minority.empty:
    raise ValueError("Both spam (1) and ham (0) examples are required to balance the dataset.")

df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)

# NOTE: df_balanced contains repeated rows; split into train/validation BEFORE
# relying on it for evaluation, otherwise copies leak across the split.
df_balanced = pd.concat([df_majority, df_minority_upsampled])

# ===============================
# 🐍 Logistic Regression
# ===============================
def detect_patterns(text):
    """Count how many of the three obfuscation-tolerant spam patterns occur in ``text``.

    Each pattern contributes at most 1 to the result regardless of how many
    times it matches, so the return value is an int in the range 0..3.
    ``text`` is converted with ``str()`` before matching.
    """
    haystack = str(text)
    hits = 0
    # Case-insensitive; digits stand in for look-alike letters (fr33, w1n, cl1ck).
    for regex in (r'(?i)fr[3e][3e]', r'(?i)w[i1]n', r'(?i)cl[i1]ck'):
        if re.search(regex, haystack) is not None:
            hits += 1
    return hits

df['pattern_count'] = df[text_col].apply(detect_patterns)

# Split BEFORE fitting the vectorizer and scaler. Fitting them on every row
# (as was done previously) lets test-set vocabulary, IDF weights, means and
# variances influence training. Same test_size/random_state as before, so the
# same rows end up in each split.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
X_tfidf = vectorizer.fit_transform(df_train[text_col].astype(str))   # fit on train only
X_tfidf_test = vectorizer.transform(df_test[text_col].astype(str))

# Column order [pattern_count, body_length, <tfidf...>] is the layout the
# saved model expects at inference time; keep it in sync with the app code.
# Dense on purpose: the Streamlit app builds a dense row with np.hstack and
# feeds it to this same scaler. Memory grows with rows x vocabulary size.
handcrafted_cols = ['pattern_count', 'body_length']
X_train = np.hstack([df_train[handcrafted_cols].values, X_tfidf.toarray()])
X_test = np.hstack([df_test[handcrafted_cols].values, X_tfidf_test.toarray()])

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)   # statistics from train rows only
X_test = scaler.transform(X_test)

y = df[label_col].values
y_train = df_train[label_col].values
y_test = df_test[label_col].values

logreg_model = LogisticRegression(max_iter=300)
logreg_model.fit(X_train, y_train)
print(f"Logistic Regression held-out accuracy: {logreg_model.score(X_test, y_test):.3f}")

joblib.dump(logreg_model, "logreg_model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")
joblib.dump(scaler, "scaler.pkl")

# ===============================
# 🧠 BERT
# ===============================
# Load the pretrained tokenizer and a 2-class classification model, and place
# the model on the GPU when one is available (CPU otherwise).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Module.to() returns the module itself, so the assignment keeps the same object.
bert_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2
).to(device)

class SpamDataset(Dataset):
    """Map-style dataset that tokenizes one message per item for BERT training.

    Args:
        texts: sequence of messages; each item is converted with ``str()``.
        labels: sequence of class labels; each item is converted with ``int()``.
        tokenizer: callable accepting ``(text, truncation, padding, max_length,
            return_tensors)`` and returning a mapping with ``input_ids`` and
            ``attention_mask`` tensors whose first axis has size 1.
        max_len: length every item is truncated/padded to.
    """

    def __init__(self,texts,labels,tokenizer,max_len=128):
        # Materialize as lists so __getitem__ indexes by position. A pandas
        # Series with a non-default index would otherwise be looked up by label.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of messages in the dataset."""
        return len(self.texts)

    def __getitem__(self,idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``."""
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        enc = self.tokenizer(text,truncation=True,padding='max_length',max_length=self.max_len,return_tensors='pt')
        # squeeze(0) removes only the leading size-1 batch axis. A bare
        # squeeze() would also drop the sequence axis when max_len == 1.
        return {"input_ids":enc['input_ids'].squeeze(0),
                "attention_mask":enc['attention_mask'].squeeze(0),
                "labels":torch.tensor(label,dtype=torch.long)}

# 80/20 split of the raw text and labels (same seed as the LR split).
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(
    df[text_col],
    df[label_col],
    random_state=42,
    test_size=0.2,
)

# Only the training portion is wrapped for the loop below.
train_dataset = SpamDataset(
    X_train_text.tolist(),
    y_train_text.tolist(),
    tokenizer,
)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)

optimizer = AdamW(bert_model.parameters(), lr=2e-5)

# One pass over the training data.
bert_model.train()
for batch in tqdm(train_loader, desc="Training BERT"):
    optimizer.zero_grad()
    # Batch keys are input_ids / attention_mask / labels (see SpamDataset);
    # move each tensor to the model's device and pass them by keyword.
    model_inputs = {key: tensor.to(device) for key, tensor in batch.items()}
    outputs = bert_model(**model_inputs)
    outputs.loss.backward()
    optimizer.step()

# Save trained BERT
torch.save(bert_model.state_dict(), "bert_model.pt")
joblib.dump(tokenizer, "bert_tokenizer.pkl")

print("‚úÖ Models saved: Logistic Regression + BERT")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training BERT: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [01:45<00:00,  4.73it/s]


‚úÖ Models saved: Logistic Regression + BERT


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

SPLIT_SEED = 42  # fixed so the split and resampling are reproducible

# df_balanced holds bootstrap copies of minority rows. Splitting it directly
# would put copies of the same message in both train and validation and make
# the validation loss look better than it is. So: de-duplicate, split, then
# re-balance the TRAINING portion only.
# text_col / label_col are the auto-detected column names from the loading
# cell; hardcoding 'text' / 'label' fails when the CSV uses other names.
unique_rows = df_balanced.drop_duplicates(subset=[text_col, label_col])
train_df, val_df = train_test_split(
    unique_rows,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=unique_rows[label_col],
)

# Upsample every smaller class in the training split to the largest class size.
class_sizes = train_df[label_col].value_counts()
target_size = int(class_sizes.max())
balanced_parts = []
for cls, size in class_sizes.items():
    rows = train_df[train_df[label_col] == cls]
    if size < target_size:
        rows = resample(rows, replace=True, n_samples=target_size, random_state=SPLIT_SEED)
    balanced_parts.append(rows)
train_df = pd.concat(balanced_parts).sample(frac=1, random_state=SPLIT_SEED)

train_texts = train_df[text_col].astype(str).tolist()
val_texts = val_df[text_col].astype(str).tolist()
# Plain Python ints: Dataset.from_dict stores values in Arrow, and the Trainer's
# collator builds the label tensors itself.
train_labels = [int(v) for v in train_df[label_col]]
val_labels = [int(v) for v in val_df[label_col]]

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

train_dataset = Dataset.from_dict({**train_encodings, "labels": train_labels})
val_dataset = Dataset.from_dict({**val_encodings, "labels": val_labels})

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="no",
    logging_dir='./logs',
    logging_steps=10,
    seed=SPLIT_SEED,
    # Disable external loggers so the cell never stops at the interactive
    # Weights & Biases login prompt during Restart & Run All.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()
# This directory is what the Streamlit app loads via from_pretrained().
model.save_pretrained("./bert_spam_model")
tokenizer.save_pretrained("./bert_spam_model")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 279698784fdbc936dc7379396eb094f34a0d9f3d


[34m[1mwandb[0m: Enter your choice:

 279698784fdbc936dc7379396eb094f34a0d9f3d


[34m[1mwandb[0m: Enter your choice:

 279698784fdbc936dc7379396eb094f34a0d9f3d


[34m[1mwandb[0m: Enter your choice:

 2


[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mar7706[0m ([33mintern_test[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.1057,0.101455
2,0.1704,0.070059


('./bert_spam_model/tokenizer_config.json',
 './bert_spam_model/special_tokens_map.json',
 './bert_spam_model/vocab.txt',
 './bert_spam_model/added_tokens.json')

In [None]:
# Install Node.js (if not already installed)
!apt-get install -y nodejs npm

# Install localtunnel globally via npm
!npm install -g localtunnel


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  gyp javascript-common libc-ares2 libjs-events libjs-highlight.js
  libjs-inherits libjs-is-typedarray libjs-psl libjs-source-map
  libjs-sprintf-js libjs-typedarray-to-buffer libnode-dev libnode72
  libnotify-bin libnotify4 libuv1-dev node-abab node-abbrev node-agent-base
  node-ansi-regex node-ansi-styles node-ansistyles node-aproba node-archy
  node-are-we-there-yet node-argparse node-arrify node-asap node-asynckit
  node-balanced-match node-brace-expansion node-builtins node-cacache
  node-chalk node-chownr node-clean-yaml-object node-cli-table node-clone
  node-color-convert node-color-name node-colors node-columnify
  node-combined-stream node-commander node-console-control-strings
  node-copy-concurrently node-core-util-is node-coveralls node-cssom
  node-cssstyle node-debug node-decompress-response node-defaults
  node-delayed-st

In [None]:
# Python: get public IP using ipify (works in Colab too)
import urllib.request

# The context manager closes the HTTP response; the timeout keeps a stalled
# request from hanging the cell indefinitely.
with urllib.request.urlopen('https://api.ipify.org', timeout=10) as response:
    print(response.read().decode('utf8'))


34.16.187.74


In [None]:
def explain_message(text, model, vectorizer, top_n=5):
    """Rank vocabulary words by ``tfidf(text) * coefficient`` for a linear model.

    Args:
        text: message to explain.
        model: fitted linear classifier exposing ``coef_`` (row 0 is used).
        vectorizer: fitted vectorizer exposing ``transform`` and
            ``get_feature_names_out``.
        top_n: number of (word, score) pairs to return (default 5).

    Returns:
        List of ``(word, score)`` tuples, highest score first.

    Raises:
        ValueError: if the model has fewer coefficients than the vectorizer
            has vocabulary entries.
    """
    # Transform text
    X_vec = np.asarray(vectorizer.transform([text]).toarray()[0], dtype=float)
    feature_names = np.array(vectorizer.get_feature_names_out())

    coef = np.asarray(model.coef_[0], dtype=float)
    n_vocab = X_vec.shape[0]
    if coef.shape[0] < n_vocab:
        raise ValueError(
            f"Model has {coef.shape[0]} coefficients but the vectorizer "
            f"produces {n_vocab} features."
        )
    # The notebook's model is trained on [pattern_count, body_length, <tfidf...>],
    # so coef_ is longer than the TF-IDF vector. The TF-IDF block is the trailing
    # one; keep only that so the element-wise product lines up. When the lengths
    # already match this slice is a no-op.
    coef = coef[coef.shape[0] - n_vocab:]

    # Multiply TF-IDF by coefficient to get contribution.
    # NOTE(review): the model was fitted on standardized features, so these are
    # relative indications rather than exact logit contributions.
    contribution = X_vec * coef
    top_indices = np.argsort(contribution)[::-1][:top_n]
    top_words = feature_names[top_indices]
    top_scores = contribution[top_indices]
    return list(zip(top_words, top_scores))


In [None]:
import matplotlib.pyplot as plt

def plot_top_words(explanation):
    """Render a horizontal bar chart of (word, score) pairs in Streamlit.

    Args:
        explanation: iterable of ``(word, score)`` tuples, e.g. the output of
            ``explain_message``.

    Returns:
        The matplotlib Figure that was rendered, or ``None`` when
        ``explanation`` is empty.
    """
    # Imported here because this cell runs before any cell that imports
    # streamlit; without it the call below raises NameError on a fresh kernel.
    import streamlit as st

    explanation = list(explanation)
    if not explanation:
        # zip(*[]) cannot be unpacked into two names; report instead of crashing.
        st.info("No contributing words to plot.")
        return None

    words, scores = zip(*explanation)
    fig, ax = plt.subplots()
    ax.barh(words, scores, color='salmon')
    ax.set_xlabel("Contribution Score")
    ax.set_title("Top Words Contributing to Spam Prediction")
    ax.invert_yaxis()  # highest-ranked word at the top
    st.pyplot(fig)
    # Release the figure from pyplot's registry; Streamlit reruns would
    # otherwise accumulate open figures.
    plt.close(fig)
    return fig


In [None]:
!pip install streamlit pyngrok transformers torch scikit-learn pandas numpy nltk joblib


Collecting streamlit
  Downloading streamlit-1.52.2-py3-none-any.whl.metadata (9.8 kB)
Collecting pyngrok
  Downloading pyngrok-7.5.0-py3-none-any.whl.metadata (8.1 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.52.2-py3-none-any.whl (9.0 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m9.0/9.0 MB[0m [31m64.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.5.0-py3-none-any.whl (24 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m6.9/6.9 MB[0m [31m87.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyngrok, pydeck, streamlit
Successfully installed pydeck-0.9.1 pyngrok-7.5.0 streamlit-1.52.2


In [None]:
import nltk
nltk.download('punkt')

# Source of the Streamlit app that is launched later with `streamlit run app.py`.
# Feature construction here must mirror training:
# [pattern_count, body_length, <tfidf...>] -> scaler -> logistic regression.
app_code = """
import streamlit as st
import pandas as pd
import numpy as np
import nltk
import re
import joblib
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from transformers import BertTokenizer, BertForSequenceClassification

nltk.download('punkt')
# Newer NLTK releases need 'punkt_tab' for word_tokenize.
nltk.download('punkt_tab')
st.title("üìß Spam Email/Text Classifier")

user_input = st.text_area("Enter email/text here:")

@st.cache_resource
def load_models():
    lr_model = joblib.load("logreg_model.pkl")
    vectorizer = joblib.load("vectorizer.pkl")
    scaler = joblib.load("scaler.pkl")
    tokenizer = BertTokenizer.from_pretrained('/content/bert_spam_model')
    bert_model = BertForSequenceClassification.from_pretrained('/content/bert_spam_model')
    bert_model.eval()
    return lr_model, vectorizer, scaler, tokenizer, bert_model

def detect_spam_patterns(text):
    patterns = [r'(?i)fr[3e][3e]', r'(?i)w[i1]n', r'(?i)cl[i1]ck']
    return sum(1 for p in patterns if re.search(p, str(text)))

lr_model, vectorizer, scaler, tokenizer, bert_model = load_models()

if st.button("Predict"):
    if user_input.strip() == "":
        st.warning("Please enter some text!")
    else:
        temp_df = pd.DataFrame([user_input], columns=['text'])
        temp_df['body_length'] = temp_df['text'].apply(lambda x: len(nltk.word_tokenize(str(x))))
        temp_df['pattern_count'] = temp_df['text'].apply(detect_spam_patterns)
        X_temp = np.hstack([temp_df[['pattern_count', 'body_length']].values,
                            vectorizer.transform(temp_df['text']).toarray()])
        X_temp_scaled = scaler.transform(X_temp)
        # Spam probability, not the hard 0/1 vote: with hard votes and weights
        # 0.7/0.3 the BERT model could never change the ensemble outcome.
        lr_proba = lr_model.predict_proba(X_temp_scaled)[0][1]

        enc = tokenizer([user_input], truncation=True, padding=True, max_length=128, return_tensors='pt')
        with torch.no_grad():
            output = bert_model(enc['input_ids'], attention_mask=enc['attention_mask'])
        bert_proba = torch.softmax(output.logits, dim=1)[0][1].item()

        ensemble_score = (0.7 * lr_proba) + (0.3 * bert_proba)
        ensemble_pred = 1 if ensemble_score >= 0.5 else 0

        result = "üö´ Spam" if ensemble_pred == 1 else "‚úÖ Not Spam"
        st.success(f"Prediction: {result}")
"""

# Explicit encoding: the app source contains non-ASCII characters and the
# platform default encoding is not guaranteed to be UTF-8.
with open("app.py", "w", encoding="utf-8") as f:
    f.write(app_code)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import streamlit as st
import pandas as pd
import numpy as np
import nltk
import re
import joblib
import torch
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from transformers import BertTokenizer, BertForSequenceClassification

# Tokenizer data for nltk.word_tokenize, which the prediction branch below
# uses to compute the body_length feature.
nltk.download('punkt')

# Page metadata, heading and a short description of the app.
# NOTE(review): Streamlit expects set_page_config ahead of other st.* calls,
# so keep this ordering when editing.
st.set_page_config(page_title="Spam Detector Pro", page_icon="üìß", layout="centered")
st.title("üìß Spam Email/Text Classifier (Ensemble + Explainability)")
st.markdown("This app combines **Logistic Regression** and **BERT** models for spam detection and explains key words contributing to the decision.")

# Text box: the message to classify; read when the Predict button is pressed.
user_input = st.text_area("‚úâÔ∏è Enter your email or text here:", height=150)

@st.cache_resource
def load_models():
    """Load the saved artifacts and return them as a 5-tuple.

    Returns:
        ``(lr_model, vectorizer, scaler, tokenizer, bert_model)`` where the
        first three are unpickled from the working directory and the last two
        are read from ``/content/bert_spam_model``. The BERT model is put in
        eval mode before being returned.
    """
    bert_dir = '/content/bert_spam_model'

    # scikit-learn artifacts, in the order they appear in the returned tuple
    classifier = joblib.load("logreg_model.pkl")
    tfidf_vectorizer = joblib.load("vectorizer.pkl")
    feature_scaler = joblib.load("scaler.pkl")

    # Fine-tuned BERT and its tokenizer
    bert_tokenizer = BertTokenizer.from_pretrained(bert_dir)
    bert_classifier = BertForSequenceClassification.from_pretrained(bert_dir)
    bert_classifier.eval()

    return classifier, tfidf_vectorizer, feature_scaler, bert_tokenizer, bert_classifier

def detect_spam_patterns(text):
    """Return how many of the three spam regexes match ``str(text)`` (0 to 3).

    A pattern counts once no matter how often it matches.
    """
    subject = str(text)
    # Case-insensitive, with digit look-alikes: fr33 / w1n / cl1ck.
    matched = [
        regex
        for regex in (r'(?i)fr[3e][3e]', r'(?i)w[i1]n', r'(?i)cl[i1]ck')
        if re.search(regex, subject)
    ]
    return len(matched)

def get_top_words(text, vectorizer, lr_model, top_n=10):
    """Get the words of ``text`` with the largest ``tfidf * coefficient`` score.

    Only words that actually occur in ``text`` (non-zero TF-IDF) are returned,
    so the result is empty when the text shares no term with the vocabulary.

    Args:
        text: message to explain.
        vectorizer: fitted vectorizer with ``transform`` / ``get_feature_names_out``.
        lr_model: fitted linear model exposing ``coef_`` (row 0 is used).
        top_n: maximum number of words to return.

    Returns:
        List of ``(word, score)`` tuples in ascending score order (the
        strongest contributor is last), at most ``top_n`` long.

    Raises:
        ValueError: if the model has fewer coefficients than the vectorizer
            has vocabulary entries.
    """
    feature_names = np.array(vectorizer.get_feature_names_out())
    tfidf_vec = np.asarray(vectorizer.transform([text]).toarray()[0], dtype=float)

    coef = np.asarray(lr_model.coef_[0], dtype=float)
    n_vocab = tfidf_vec.shape[0]
    if coef.shape[0] < n_vocab:
        raise ValueError(
            f"Model has {coef.shape[0]} coefficients but the vectorizer "
            f"produces {n_vocab} features."
        )
    # The model's feature vector is [pattern_count, body_length, <tfidf...>];
    # keep only the trailing TF-IDF coefficients so the product is aligned.
    # No-op when the model was trained on TF-IDF features alone.
    coef = coef[coef.shape[0] - n_vocab:]

    importance = tfidf_vec * coef  # contribution per word
    present = np.flatnonzero(tfidf_vec)  # vocabulary entries found in the text
    if top_n <= 0 or present.size == 0:
        return []
    ranked = present[np.argsort(importance[present])][-top_n:]
    return list(zip(feature_names[ranked], importance[ranked]))

# Load models (cached by st.cache_resource, so reruns reuse the same objects)
lr_model, vectorizer, scaler, tokenizer, bert_model = load_models()

if st.button("üîç Predict"):
    if user_input.strip() == "":
        st.warning("Please enter some text!")
    else:
        # ---------- Logistic Regression Prediction ----------
        # Feature layout must match training:
        # [pattern_count, body_length, <tfidf...>], then the fitted scaler.
        temp_df = pd.DataFrame([user_input], columns=['text'])
        temp_df['body_length'] = temp_df['text'].apply(lambda x: len(nltk.word_tokenize(str(x))))
        temp_df['pattern_count'] = temp_df['text'].apply(detect_spam_patterns)
        X_temp = np.hstack([temp_df[['pattern_count', 'body_length']].values,
                            vectorizer.transform(temp_df['text']).toarray()])
        X_temp_scaled = scaler.transform(X_temp)
        lr_pred_proba = lr_model.predict_proba(X_temp_scaled)[0][1]  # P(spam)

        # ---------- BERT Prediction ----------
        enc = tokenizer([user_input], truncation=True, padding=True, max_length=128, return_tensors='pt')
        with torch.no_grad():
            output = bert_model(enc['input_ids'], attention_mask=enc['attention_mask'])
        bert_probs = torch.nn.functional.softmax(output.logits, dim=1)
        bert_pred_proba = bert_probs[0][1].item()  # P(spam)

        # ---------- Ensemble ----------
        # Weighted average of the two spam probabilities; 0.5 is the decision threshold.
        ensemble_score = (0.7 * lr_pred_proba) + (0.3 * bert_pred_proba)
        ensemble_pred = 1 if ensemble_score >= 0.5 else 0
        result = "üö´ Spam" if ensemble_pred == 1 else "‚úÖ Not Spam"
        st.success(f"**Prediction:** {result}")
        st.write(f"**Ensemble Confidence:** {ensemble_score:.2f}")

        # ---------- Visualization: Model Confidence ----------
        st.subheader("üìä Model Confidence Comparison")
        fig, ax = plt.subplots()
        models = ['Logistic Regression', 'BERT', 'Ensemble']
        probs = [lr_pred_proba, bert_pred_proba, ensemble_score]
        colors = ['orange', 'purple', 'green']
        ax.barh(models, probs, color=colors)
        ax.set_xlim(0, 1)
        ax.set_xlabel("Spam Probability")
        ax.set_title("Model Confidence Scores")
        st.pyplot(fig)
        # Streamlit re-executes the script on every interaction; close the
        # figure so open figures do not pile up in pyplot's registry.
        plt.close(fig)

        # ---------- Explainability: Important Words ----------
        st.subheader("üß† Why It‚Äôs Spam (Important Words)")
        top_words = get_top_words(user_input, vectorizer, lr_model, top_n=10)
        if len(top_words) > 0:
            expl_df = pd.DataFrame(top_words, columns=['Word', 'Importance'])
            st.bar_chart(expl_df.set_index('Word'))
        else:
            st.info("No significant spam-related words found in this text.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2025-12-18 13:40:46.679 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-12-18 13:40:46.703 Session state does not function when running a script without `streamlit run`


In [None]:
import streamlit as st
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
import re

# -------------------------------
# Load Models
# -------------------------------
@st.cache_resource
def _load_artifacts():
    """Load BERT (tokenizer + model), the logistic-regression model and the TF-IDF vectorizer.

    Cached with st.cache_resource because Streamlit re-executes the script on
    every interaction; without caching the BERT weights are re-read from disk
    on each button press.
    """
    bert_dir = '/content/bert_spam_model'
    bert_tokenizer = BertTokenizer.from_pretrained(bert_dir)
    bert_classifier = BertForSequenceClassification.from_pretrained(bert_dir)
    bert_classifier.eval()  # inference only: make sure dropout is disabled
    return (
        bert_tokenizer,
        bert_classifier,
        joblib.load('logreg_model.pkl'),
        joblib.load('vectorizer.pkl'),
    )

tokenizer, bert_model, lr_model, tfidf = _load_artifacts()

# -------------------------------
# Helper functions
# -------------------------------
def preprocess_text(text):
    """Lower-case ``text``, strip URLs, then drop every non-letter, non-whitespace character.

    URLs are tokens starting with ``http`` or ``www`` up to the next
    whitespace. Digits and punctuation are removed; whitespace is kept as-is.
    """
    lowered = text.lower()
    without_urls = re.sub(r'http\S+|www\S+', '', lowered)
    return re.sub(r'[^a-zA-Z\s]', '', without_urls)

def predict_bert(text):
    """Return the BERT class probabilities for ``text`` as a 1-D NumPy array.

    The text is tokenized with truncation/padding to at most 128 tokens and
    run through the module-level ``bert_model`` without gradient tracking;
    softmax is taken over the class axis of the logits.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )
    with torch.no_grad():
        logits = bert_model(**encoded).logits
        class_probs = torch.nn.functional.softmax(logits, dim=1)
        first_row = class_probs.numpy()[0]  # single input -> take row 0
    return first_row

def predict_lr(text):
    """Return the logistic-regression class probabilities for ``text``.

    The saved model in this notebook is trained on standardized
    ``[pattern_count, body_length, <tfidf...>]`` features. Feeding it the bare
    TF-IDF matrix fails with a feature-count mismatch, so the full feature row
    is rebuilt and scaled here. If the loaded model was trained on TF-IDF
    features only, it is used directly.

    Raises:
        ValueError: if the model's expected feature count matches neither layout.
    """
    tfidf_dense = tfidf.transform([text]).toarray()
    n_expected = getattr(lr_model, "n_features_in_", tfidf_dense.shape[1])

    if n_expected == tfidf_dense.shape[1]:
        # Model trained on TF-IDF only: no extra features, no scaler.
        return lr_model.predict_proba(tfidf_dense)[0]

    # Local import: this cell's import list does not include nltk.
    import nltk

    # Must mirror the training-time feature engineering exactly.
    spam_patterns = (r'(?i)fr[3e][3e]', r'(?i)w[i1]n', r'(?i)cl[i1]ck')
    pattern_count = sum(1 for p in spam_patterns if re.search(p, str(text)))
    body_length = len(nltk.word_tokenize(str(text)))

    features = np.hstack([
        np.array([[pattern_count, body_length]], dtype=float),
        tfidf_dense,
    ])
    if features.shape[1] != n_expected:
        raise ValueError(
            f"Model expects {n_expected} features but {features.shape[1]} were built; "
            "the saved model and vectorizer do not belong together."
        )

    # Load the training-time scaler once and keep it on the function object.
    if not hasattr(predict_lr, "_scaler"):
        predict_lr._scaler = joblib.load('scaler.pkl')
    return lr_model.predict_proba(predict_lr._scaler.transform(features))[0]

def explain_top_words(text, top_n=5):
    """Return up to ``top_n`` words of ``text`` with the highest TF-IDF weight.

    Words with zero weight (vocabulary entries that do not occur in ``text``)
    are left out, so fewer than ``top_n`` pairs - possibly none - may be
    returned.

    Returns:
        List of ``(word, tfidf_weight)`` tuples, highest weight first.
    """
    # Densify once; the matrix has a single row for the single input text.
    weights = tfidf.transform([text]).toarray()[0]
    vocabulary = np.array(tfidf.get_feature_names_out())

    ranked = np.argsort(weights)[::-1][:top_n]
    ranked = ranked[weights[ranked] > 0]  # keep only words present in the text
    return list(zip(vocabulary[ranked], weights[ranked]))

# -------------------------------
# Streamlit UI
# -------------------------------
st.title("üì© Spam Message Detector (BERT + Logistic Regression)")
st.write("Enter a message below to detect whether it's spam, and see why.")

user_input = st.text_area("‚úâÔ∏è Enter your message:", height=150)

if st.button("üîç Predict"):
    if user_input.strip() == "":
        st.warning("Please enter some text to analyze.")
    else:
        clean_text = preprocess_text(user_input)

        # Predictions: each helper returns [P(not spam), P(spam)]; the two
        # models are averaged with equal weight.
        probs_bert = predict_bert(clean_text)
        probs_lr = predict_lr(clean_text)
        avg_probs = (probs_bert + probs_lr) / 2
        labels = ["Not Spam", "Spam"]
        final_pred = labels[np.argmax(avg_probs)]

        # Show Prediction
        st.subheader("‚úÖ Prediction:")
        st.success(f"This message is **{final_pred.upper()}**")

        # -------------------------------
        # Confidence Bar Chart
        # -------------------------------
        st.subheader("üìä Model Confidence")
        # Named confidence_df rather than df: inside the notebook, `df` is the
        # training dataset and must not be overwritten by a 2-row chart table.
        confidence_df = pd.DataFrame({
            'Model': ['BERT', 'Logistic Regression'],
            'Spam Probability': [probs_bert[1], probs_lr[1]]
        })

        fig, ax = plt.subplots()
        ax.bar(confidence_df['Model'], confidence_df['Spam Probability'])
        ax.set_ylim(0, 1)
        ax.set_ylabel("Spam Probability")
        ax.set_title("Model Confidence Comparison")
        st.pyplot(fig)
        # Streamlit reruns the script on every interaction; close the figure
        # so open figures do not accumulate.
        plt.close(fig)

        # -------------------------------
        # Explainability Section
        # -------------------------------
        st.subheader("üßê Why It‚Äôs Spam (Important Words)")
        top_words = explain_top_words(clean_text)
        for word, score in top_words:
            st.write(f"- **{word}** (importance: {score:.3f})")




In [None]:
!kill $(ps aux | grep ngrok | grep -v grep | awk '{print $2}') 2>/dev/null


In [None]:
from pyngrok import ngrok
import time, os

!pip install pyngrok streamlit

# Set ngrok token if not already done
ngrok.set_auth_token("34MkEpvicIfJzcLmd4XCSmzl3S7_4SrgS5PEVYe7ymJvsu7mm")  # <-- paste your token

# Kill old Streamlit
os.system("kill $(ps aux | grep streamlit | grep -v grep | awk '{print $2}') 2>/dev/null")

# Start Streamlit
get_ipython().system_raw('streamlit run app.py --server.port 8501 &')
time.sleep(5)

# Create tunnel
public_url = ngrok.connect(8501)
print("‚úÖ Streamlit app running at:", public_url)


‚úÖ Streamlit app running at: NgrokTunnel: "https://synostotic-unthankful-lynsey.ngrok-free.dev" -> "http://localhost:8501"


In [None]:
!jupyter nbconvert --to script Untitled24.ipynb

This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr

In [None]:
jupyter nbconvert --to notebook your_notebook.py --output clean_notebook.ipynb


SyntaxError: invalid syntax (ipython-input-2098305696.py, line 1)