In [46]:
#!pip install pandas
#!pip install nltk
#!pip install scikit-learn
#!pip install mlflow
#!pip install streamlit

In [18]:
# pandas is imported here because this is the first cell that uses it; on a
# fresh kernel the later import cell has not run yet.
import pandas as pd

# Raw SMS spam dataset, read with latin1 encoding.
df = pd.read_csv("data/spam.csv", encoding='latin1')

In [19]:
# Preview the first rows of the raw frame to inspect its columns
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [24]:
import pandas as pd

# define function
def load_save_data(load_file_path, save_file_path, n_ham_drop=1825, random_state=None):
    """Build the working spam/ham dataset from the raw CSV and save it.

    Reads the raw file, keeps the text (``v2``) and class (``v1``) columns,
    renames them to ``message`` / ``label``, removes the first ``n_ham_drop``
    rows labelled ``'ham'`` (down-sampling that class), shuffles the rows and
    writes the result to ``save_file_path`` without the index column.

    Parameters
    ----------
    load_file_path : str or path-like
        Location of the raw CSV (read with ``latin1`` encoding).
    save_file_path : str or path-like
        Destination of the processed CSV.
    n_ham_drop : int, default 1825
        Number of leading ``'ham'`` rows to discard. If fewer exist, all of
        them are dropped.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample`` for the shuffle. ``None`` keeps
        the original behaviour (a different row order on every run); pass an
        integer to make the saved file reproducible.

    Raises
    ------
    ValueError
        If ``n_ham_drop`` is negative.
    """
    if n_ham_drop < 0:
        # A negative slice bound would silently drop "all but the last n" rows.
        raise ValueError("n_ham_drop must be non-negative")

    raw = pd.read_csv(load_file_path, encoding='latin1')
    # Keep only the text/class columns and give them descriptive names.
    messages = raw[['v2', 'v1']].rename(columns={"v2": "message", "v1": "label"})

    # Index labels of the first n_ham_drop 'ham' rows.
    ham_to_drop = messages.loc[messages['label'] == 'ham'].index[:n_ham_drop]

    # Chained, non-mutating pipeline: drop -> shuffle -> renumber.
    prepared = (
        messages
        .drop(index=ham_to_drop)
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

    prepared.to_csv(save_file_path, index=False)
    print(f"File saved as: {save_file_path}")

In [25]:
# Build the working dataset: read the raw data/spam.csv and write the processed copy to data.csv
load_save_data(load_file_path="data/spam.csv", save_file_path="data.csv")


File saved as: data.csv


In [27]:
# Reload the processed dataset from data.csv and count the rows per class
df = pd.read_csv("data.csv")
df["label"].value_counts()

label
ham     3000
spam     747
Name: count, dtype: int64

### Train model

Download the NLTK `stopwords` and `punkt` resources.

In [37]:
import nltk
# Fetch the NLTK resources used by the preprocessing code:
#   - 'stopwords': word list behind stopwords.words('english')
#   - 'punkt' / 'punkt_tab': tokenizer models behind word_tokenize.
#     Recent NLTK releases look up 'punkt_tab' while older ones use 'punkt',
#     so both are requested to keep the notebook runnable on either.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [35]:
# import deps
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# initialize objects
# English stop-word list as a set; referenced by the optional stop-word
# filters that are commented out in preprocess_text.
stop_words = set(stopwords.words('english'))
# Stemmer / lemmatizer instances for the optional stemming / lemmatization
# variants that are commented out in preprocess_text.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# A basic text-processing function with optional preprocessing variations such as stemming / lemmatization
def preprocess_text(text):
    """Normalize one message into a space-separated string of alphabetic tokens.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic (per ``str.isalpha``) are discarded and the
    remaining ones are joined with single spaces.
    """
    tokens = word_tokenize(text.lower())
    alphabetic_tokens = []
    for token in tokens:
        if token.isalpha():
            alphabetic_tokens.append(token)
    # Alternative filters to experiment with (replace the loop above):
    #   drop stop words:
    #     [t for t in tokens if t.isalpha() and t not in stop_words]
    #   drop stop words + stem:
    #     [stemmer.stem(t) for t in tokens if t.isalpha() and t not in stop_words]
    #   drop stop words + lemmatize:
    #     [lemmatizer.lemmatize(t) for t in tokens if t.isalpha() and t not in stop_words]
    return ' '.join(alphabetic_tokens)

# basic model-training function with variations for vectorizing
def train_model(x_train, y_train, n, c, d, random_state=None):
    """Vectorize the training text with TF-IDF and fit a random forest on it.

    Parameters
    ----------
    x_train : iterable of str
        Preprocessed training messages.
    y_train : array-like
        Labels aligned with ``x_train``.
    n : int
        ``n_estimators`` for the forest.
    c : str
        Split ``criterion`` for the forest.
    d : int or None
        ``max_depth`` for the forest.
    random_state : int or None, default None
        Seed forwarded to ``RandomForestClassifier``. ``None`` keeps the
        original unseeded behaviour; pass an integer for repeatable results.

    Returns
    -------
    tuple
        ``(vectorizer, rf_classifier, acc)``: the fitted vectorizer, the
        fitted classifier and the accuracy on the training data.
    """
    # Create a Vectorizer to convert text data to numerical features
    # vectorizer = CountVectorizer()  # bag-of-words alternative
    vectorizer = TfidfVectorizer()
    x_train_vectorized = vectorizer.fit_transform(x_train)
    # Initialize the Random Forest classifier
    rf_classifier = RandomForestClassifier(
        n_estimators=n, criterion=c, max_depth=d, random_state=random_state
    )
    # Train the classifier on the training data
    rf_classifier.fit(x_train_vectorized, y_train)
    train_pred = rf_classifier.predict(x_train_vectorized)
    # scikit-learn metrics take (y_true, y_pred); keep that order even though
    # accuracy itself is symmetric.
    acc = accuracy_score(y_train, train_pred)
    return vectorizer, rf_classifier, acc

# evaluation function
def eval_met(actual, pred):
    """Score predictions against the true labels, with 'spam' as the positive class.

    Returns a 4-tuple ``(accuracy, precision, recall, f1)``; precision, recall
    and F1 are computed with ``pos_label='spam'``.
    """
    accuracy = accuracy_score(actual, pred)
    spam_scorers = (precision_score, recall_score, f1_score)
    precision, recall, f1 = [
        scorer(actual, pred, pos_label='spam') for scorer in spam_scorers
    ]
    return accuracy, precision, recall, f1

Training and evaluation

In [40]:
# Apply text preprocessing on the message column
df['processed_message'] = df.message.apply(preprocess_text)

# Split the data into features (x) and labels (y)
x = df['processed_message']
y = df['label']

# Split the data into training and testing sets (80/20, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Train the model
vectorizer, model, train_acc = train_model(x_train, y_train, n=10, c='entropy', d=2)

# Evaluate the model on the held-out split.
# eval_met expects (actual, pred): passing them the other way round swaps
# precision and recall.
y_pred = model.predict(vectorizer.transform(x_test))
acc, prc, rec, f1 = eval_met(y_test, y_pred)

# Print the Results
print(f"Training Accuracy: {train_acc*100:.3f} %")
print(f"Validation Accuracy: {acc*100:.3f} %")
print(f"Precision Score: {prc*100:.3f} %")
print(f"Recall Score: {rec*100:.3f} %")
print(f"F1 Score: {f1*100:.3f} %")

Training Accuracy: 80.080 %
Validation Accuracy: 80.667 %
Precision Score: 0.000 %
Recall Score: 0.000 %
F1 Score: 0.000 %


  _warn_prf(average, modifier, msg_start, len(result))


#### mlflow tracking

In [42]:
# Run to launch MLflow server (http://localhost:5000)
# !mlflow ui

^C


In [47]:
# Import Libraries
import streamlit as st  
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import mlflow
import subprocess
import os
import webbrowser

# Configure Page
# Sets the page title and icon, a centered layout, and a sidebar that starts expanded.
st.set_page_config(
    page_title="Spam Filter",
    page_icon="🤖", 
    layout="centered",
    initial_sidebar_state="expanded") 

# load feature extracted data
# train_model reads the 'message' and 'label' columns from this frame.
df = pd.read_csv("data.csv")

# HELPER FUNCTIONS
# A basic text-processing function with options to keep or remove stop words
# and to apply stemming / lemmatization
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, loaded on first use and then reused."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalize one message: lower-case, tokenize, keep alphabetic non-stop-words.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The kept tokens joined with single spaces.
    """
    # The stop-word list is loop-invariant; fetching it through the cached
    # helper avoids re-reading the corpus for every message when this function
    # is applied to a whole column.
    stop_words = _english_stop_words()
    words = word_tokenize(text.lower())
    filtered_words = [word for word in words if word.isalpha() and word not in stop_words]
    # Variations to experiment with (replace the line above):
    #   keep stop words:
    #     filtered_words = [word for word in words if word.isalpha()]
    #   stemming (create `stemmer = PorterStemmer()` first):
    #     filtered_words = [stemmer.stem(word) for word in words if word.isalpha() and word not in stop_words]
    #   lemmatization (create `lemmatizer = WordNetLemmatizer()` first):
    #     filtered_words = [lemmatizer.lemmatize(word) for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(filtered_words)

# Train the model
def train_model(exp_name, df, n, c, d):
    """Train a TF-IDF + random-forest spam classifier and track the run in MLflow.

    Adds a ``processed_message`` column to ``df`` (preprocessed ``message``
    text), splits 80/20 with a fixed seed, fits the vectorizer and classifier
    inside an MLflow run of experiment ``exp_name``, and logs the
    hyper-parameters, metrics and both fitted objects.

    Parameters
    ----------
    exp_name : str
        MLflow experiment to create or reuse.
    df : pandas.DataFrame
        Frame with ``message`` and ``label`` columns; modified in place.
    n, c, d
        ``n_estimators``, ``criterion`` and ``max_depth`` of the forest.

    Returns
    -------
    tuple
        ``(train_acc, test_acc)`` accuracies on the training and test splits.
    """
    df['processed_message'] = df.message.apply(preprocess_text)
    features, labels = df['processed_message'], df['label']
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    # Creates the experiment if it does not exist yet, otherwise selects it.
    experiment = mlflow.set_experiment(exp_name)
    with mlflow.start_run(experiment_id=experiment.experiment_id):
        # Text -> numeric features (TF-IDF; a bag-of-words CountVectorizer is
        # the alternative and would need to be imported).
        vectorizer = TfidfVectorizer()
        train_matrix = vectorizer.fit_transform(x_train)
        test_matrix = vectorizer.transform(x_test)

        forest = RandomForestClassifier(n_estimators=n, criterion=c, max_depth=d)
        forest.fit(train_matrix, y_train)

        # Score both splits
        train_acc = accuracy_score(y_train, forest.predict(train_matrix))
        test_predictions = forest.predict(test_matrix)
        test_acc = accuracy_score(y_test, test_predictions)
        f1 = f1_score(y_test, test_predictions, pos_label='spam')

        # Record hyper-parameters, metrics and the fitted artifacts
        mlflow.log_params({"n_estimators":n, "Criterion": c, "Maximum Depth": d})
        mlflow.log_metrics({"Training Accuracy": train_acc, "Test Accuracy": test_acc, "F1 Score": f1})
        mlflow.sklearn.log_model(forest, "model")
        mlflow.sklearn.log_model(vectorizer, "vectorizer")
    return train_acc, test_acc

# Function for opening MLFlow UI directly from Streamlit
def open_mlflow_ui(port=5000):
    """Start the MLflow tracking UI as a background subprocess.

    Parameters
    ----------
    port : int, default 5000
        Port passed to ``mlflow ui --port``.

    The process is launched and not waited on; nothing is returned.
    """
    import sys  # local import: only needed to locate the running interpreter

    # Launch MLflow through the interpreter running this app, with an argument
    # list instead of a shell string, so it does not depend on a shell or on
    # `mlflow` being on PATH.
    subprocess.Popen([sys.executable, "-m", "mlflow", "ui", "--port", str(port)])
def open_browser(url):
    """Open ``url`` in a new tab of the default web browser."""
    # new=2 asks for a new browser tab, same as webbrowser.open_new_tab(url)
    webbrowser.open(url, new=2)
    
# STREAMLIT UI   
# Sidebar for hyperparameter tuning
# The three widget values (n, d, c) are passed to train_model when "Train" is pressed.
st.sidebar.title("Tune Hyper Params ⚙️")
n = st.sidebar.slider('N-Estimators',min_value=1, max_value=200, step=2, value=10)
d = st.sidebar.slider('Max Depth', min_value=1, max_value=20, step=2, value=2)
# index=1 preselects 'entropy'
c = st.sidebar.selectbox('Criterion', ['gini', 'entropy', 'log_loss'], index=1)

# Launch Mlflow from Streamlit
# The button starts the MLflow UI subprocess and then opens its URL in a browser tab.
st.sidebar.title("Mlflow Tracking 🔎")    
if st.sidebar.button("Launch 🚀"):
    open_mlflow_ui()
    st.sidebar.success("MLflow Server is Live! http://localhost:5000")
    open_browser("http://localhost:5000")

# Main Page Content
st.title("Spam Classifier Trainer 🤖")
exp_type = st.radio("Select Experiment Type", ['New Experiment', 'Existing Experiment'], horizontal=True)

# Always bind exp_name so the Train handler below never hits an undefined name
# (previously it was unset whenever no existing experiment could be listed).
exp_name = None
if exp_type == 'New Experiment':
    exp_name = st.text_input("Enter the name for New Experiment")
else:
    try:
        # Only query MLflow when a local tracking directory exists.
        exps = [i.name for i in mlflow.search_experiments()] if os.path.exists('./mlruns') else []
    except Exception:
        # Best effort: treat any MLflow lookup failure as "nothing to select",
        # without swallowing KeyboardInterrupt/SystemExit like a bare except.
        exps = []
    if exps:
        exp_name = st.selectbox("Select Experiment", exps)
    else:
        st.warning("🚨 No Previous Experiments Found! Set New Experiment ⬆️")

# Training the model starts from here
if st.button("Train ⚙️"):
    if not exp_name or not exp_name.strip():
        # Empty text input or no selectable experiment: do not call MLflow
        # with a missing experiment name.
        st.error("Please enter or select an experiment name before training.")
    else:
        with st.spinner('Feeding the data--->🧠'):
            tr_a, ts_a = train_model(exp_name, df, n, c, d)
        st.success('Trained!')
        st.write(f"Training Accuracy Achieved: {tr_a:.3f}")
        # The test accuracy is returned by train_model; show it alongside the
        # training accuracy.
        st.write(f"Test Accuracy Achieved: {ts_a:.3f}")

2023-12-05 20:29:45.984 
  command:

    streamlit run C:\Users\User\Desktop\GithubProjects\spam-filter\venv\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
