<a href="https://colab.research.google.com/github/asmi04513/akhwan-nur-asmi_tugasteoridatamining/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

In [5]:
from google.colab import files

uploaded = files.upload()  # Pilih file spam.csv dari komputer Anda

df = pd.read_csv('spam.csv', encoding='latin-1')
df = df[['v1', 'v2']]
df.columns = ['label', 'text']

Saving spam.csv to spam.csv


In [6]:
# 2. Explore
print(df['label'].value_counts())
print(df.isnull().sum())


label
ham     4825
spam     747
Name: count, dtype: int64
label    0
text     0
dtype: int64


In [7]:
# 3. Preprocess text simple
df['text_clean'] = df['text'].str.lower().str.replace(r'http\S+','', regex=True).str.replace(r'[^a-z\s]','', regex=True).str.strip()

In [11]:
# 4. Feature extraction
vect = TfidfVectorizer(ngram_range=(1,2), min_df=5, max_df=0.9)
X = vect.fit_transform(df['text_clean'])
y = np.where(df['label']=='spam', 1, 0)

# 5. Train‑test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=42)

# 6. Handle imbalance
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# 7. Define models & voting
clf_nb = MultinomialNB(alpha=1.0)
clf_lr = LogisticRegression(max_iter=1000, class_weight='balanced')
voting_clf = VotingClassifier(estimators=[('nb', clf_nb), ('lr', clf_lr)], voting='hard')

# 8. Train
voting_clf.fit(X_train_res, y_train_res)

# 9. Evaluate
y_pred = voting_clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Akurasi test:", acc)
print(classification_report(y_test, y_pred, target_names=['Ham','Spam']))

# 10. Cross‑validation
cv_scores = cross_val_score(voting_clf, X_train_res, y_train_res, cv=5, scoring='accuracy')
print("CV Akurasi:", cv_scores.mean())

# Buat folder 'model' jika belum ada
os.makedirs('model', exist_ok=True)

# Simpan model dan vectorizer
with open('model/voting_sms_model.pkl', 'wb') as f:
    pickle.dump(voting_clf, f)

with open('model/vect_sms.pkl', 'wb') as f:
    pickle.dump(vect, f)

print("Model dan vectorizer berhasil disimpan di folder 'model/'")



Akurasi test: 0.9796650717703349
              precision    recall  f1-score   support

         Ham       0.98      0.99      0.99       724
        Spam       0.95      0.89      0.92       112

    accuracy                           0.98       836
   macro avg       0.97      0.94      0.95       836
weighted avg       0.98      0.98      0.98       836

CV Akurasi: 0.9642775077659369
Model dan vectorizer berhasil disimpan di folder 'model/'


In [13]:
# Install Streamlit dan pyngrok (untuk jalankan Streamlit di Colab)
!pip install streamlit pyngrok


Collecting streamlit
  Downloading streamlit-1.51.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.5.0-py3-none-any.whl.metadata (8.1 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.51.0-py3-none-any.whl (10.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.5.0-py3-none-any.whl (24 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m78.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyngrok, pydeck, streamlit
Successfully installed pydeck-0.9.1 pyngrok-7.5.0 streamlit-1.51.0


In [14]:
# app_streamlit.py
import streamlit as st
import pickle
import numpy as np

# Load model & vectorizer
with open('model/vect_sms.pkl','rb') as f:
    vect = pickle.load(f)
with open('model/voting_sms_model.pkl','rb') as f:
    model = pickle.load(f)

st.title("SMS Spam Detector")
st.write("Masukkan pesan SMS Anda dan lihat prediksinya (ham = legit, spam = tidak diinginkan).")

user_input = st.text_area("Pesan SMS:")

if st.button("Prediksi"):
    cleaned = user_input.lower()
    cleaned = ''.join(ch for ch in cleaned if ch.isalpha() or ch.isspace())
    X_input = vect.transform([cleaned])
    pred = model.predict(X_input)[0]
    label = "Spam" if pred==1 else "Ham (Legit)"
    st.write("Hasil prediksi: **{}**".format(label))

    # tidak semua model voting punya predict_proba; jika punya:
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_input)[0][1]
        st.write(f"Probabilitas spam: {proba:.2%}")

# Sidebar – performansi model
st.sidebar.header("Performansi Model")
st.sidebar.write("Akurasi test: {:.2%}".format(acc))  # acc dari kode training


2025-11-21 14:46:25.805 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-11-21 14:46:25.817 Session state does not function when running a script without `streamlit run`
