In [14]:
import streamlit as st
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import time
import nltk
import joblib
import pandas as pd
from gensim.models import Word2Vec

In [15]:
def clean_text(text):
    """Lower-case ``text`` and keep only its purely alphabetic words.

    Every non-word character is replaced by a space, runs of whitespace are
    collapsed, and any remaining token that is not made up solely of letters
    (e.g. tokens containing digits or underscores) is discarded.

    Args:
        text: Raw input string.

    Returns:
        The surviving words joined by single spaces ("" if none survive).
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'\W', ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', without_punctuation, flags=re.MULTILINE)

    kept_words = []
    for candidate in collapsed.split():
        # `\W` leaves digits and underscores in place, so mixed tokens such as
        # "abc123" or "snake_case" are filtered out here.
        if candidate.isalnum() and candidate.isalpha():
            kept_words.append(candidate)
    return ' '.join(kept_words)

def tokenizing_and_stopwords(text):
    """Tokenize ``text`` with NLTK and remove English stop words.

    Args:
        text: Input string.

    Returns:
        The tokens whose lower-cased form is not in NLTK's English stop-word
        list, joined by single spaces. Token casing is left unchanged.
    """
    candidates = word_tokenize(text)
    english_stopwords = set(stopwords.words('english'))

    kept = []
    for candidate in candidates:
        if candidate.lower() in english_stopwords:
            continue
        kept.append(candidate)
    return " ".join(kept)

def lemmatization_and_stopwords(text):
    """Lemmatize the informative tokens of ``text``.

    The text is tokenized with NLTK; tokens that are English stop words or
    that are three characters long or shorter are dropped, and the remaining
    tokens are lemmatized with NLTK's WordNet lemmatizer.

    Args:
        text: Input string. A falsy value (``""`` or ``None``) yields ``""``.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    if not text:
        return ""

    tokens = word_tokenize(text)
    # Build the stop-word set once: calling stopwords.words() per token
    # re-reads the corpus list each time and scans it linearly.
    english_stopwords = set(stopwords.words('english'))
    lemmatizer = nltk.WordNetLemmatizer()

    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.lower() not in english_stopwords and len(token) > 3
    ]
    return " ".join(lemmas)

In [16]:
def load_model_and_tokenizer(base_dir=None):
    """Load the classifier, its tokenizer and the Word2Vec model from disk.

    Args:
        base_dir: Directory containing ``tf2_model``,
            ``saved_tokenizer.joblib`` and ``word2vec.model``. When omitted,
            the project folder this function has always used is assumed, so
            existing zero-argument calls keep working.

    Returns:
        A ``(model, tokenizer, word2vec_model)`` tuple.
    """
    import os

    if base_dir is None:
        base_dir = r"C:\Users\calum\Desktop\Ironhack\Week8\Final Project"

    # Load the model
    model = tf.keras.models.load_model(os.path.join(base_dir, "tf2_model"))

    # Load the tokenizer.
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only point base_dir at artifacts from a trusted source.
    tokenizer = joblib.load(os.path.join(base_dir, "saved_tokenizer.joblib"))

    # Load pre-trained Word2Vec model
    word2vec_model = Word2Vec.load(os.path.join(base_dir, "word2vec.model"))

    return model, tokenizer, word2vec_model

In [26]:
def classify_message(message, model, tokenizer, maxlen=500):
    """Pre-process ``message`` and score it with ``model``.

    Each pre-processing stage (cleaning, stop-word removal, lemmatization) is
    echoed to the Streamlit page with ``st.write``.

    Args:
        message: Raw text entered by the user.
        model: Object exposing ``predict``; called with the padded sequence.
        tokenizer: Object exposing ``texts_to_sequences``
            (presumably the fitted Keras ``Tokenizer`` — confirm).
        maxlen: Length the sequence is padded/truncated to. Defaults to 500;
            presumably this must match the length used at training time —
            TODO confirm.

    Returns:
        ``prediction[0]``: the model output for the single input message.
    """
    # Preprocess the input message
    cleaned_message = clean_text(message)
    st.write("Cleaned message:", cleaned_message)
    tokenized_message = tokenizing_and_stopwords(cleaned_message)
    st.write("Stopwords & tokenization:", tokenized_message)
    lemmatized_message = lemmatization_and_stopwords(tokenized_message)
    st.write("Lemmatization:", lemmatized_message)

    # Convert text to sequence. texts_to_sequences expects a *list* of texts;
    # a bare string would be iterated character by character, producing one
    # row per character so that prediction[0] scored only the first character.
    sequence = tokenizer.texts_to_sequences([lemmatized_message])

    # Pad sequence
    padded_sequence = pad_sequences(sequence, maxlen=maxlen)

    # Make predictions using the loaded model
    prediction = model.predict(padded_sequence)

    return prediction[0]

In [30]:
def simulate_progress_bar():
    """Show a Streamlit progress bar that fills from 0 to 100.

    Purely cosmetic: no work is performed. The function sleeps 0.1 s before
    each of the 101 updates, so it takes roughly ten seconds to finish.
    """
    st.write("Starting a long computation...")
    bar = st.progress(0)

    percent = 0
    while percent <= 100:
        time.sleep(0.1)
        bar.progress(percent)
        percent += 1

In [31]:
def main():
    """Render the cyberbullying-detection Streamlit page.

    Configures the page, draws the sidebar and title, reads a message from a
    text area and, when the button is pressed, loads the models, shows the
    simulated progress bar, classifies the message and prints a coloured
    verdict (a score above 0.5 is reported as bullying). Custom CSS is
    injected last.
    """
    # Markup fragments, kept verbatim and collected here so the flow below
    # reads as a plain sequence of Streamlit calls.
    title_html = """
        <div style="display: flex; justify-content: space-between; align-items: center;">
            <h1 style="font-size: 36px; font-weight: bold; color: black;">Detect Cyberbullying</h1>
            <img src="https://raw.githubusercontent.com/calumatos/Final_project/main/image3.png" style="width: 80px;">
        </div>
        """
    bullying_html = '<p style="color: red; font-size: 24px; font-weight: bold;">This is bullying!</p>'
    no_bullying_html = '<p style="color: green; font-size: 24px; font-weight: bold;">No bullying content</p>'
    page_css = """
        <style>
            body {
                font-family: 'Verdana', sans-serif;
                font-size: 16px;
                line-height: 1.6;
            }
            .stText {
                font-family: 'Verdana', sans-serif;
                font-size: 18px;
                color: #333333;
            }
            .stTextArea {
                font-family: 'Verdana', sans-serif;
            }
        </style>
        """

    st.set_page_config(
        page_title="Detect Cyberbullying",
        page_icon="💬",
        layout="wide",
        initial_sidebar_state="expanded",
    )

    # Sidebar: project information.
    sidebar = st.sidebar
    sidebar.header("Final Project")
    sidebar.subheader("Data Analytics OCT/23")
    sidebar.write("This is a Streamlit app for detecting cyberbullying content.")
    sidebar.write("Carmen Matos and Juliane Petersen")
    sidebar.image("https://raw.githubusercontent.com/calumatos/Final_project/main/image1.png")

    # Page title with the logo beside it.
    st.markdown(title_html, unsafe_allow_html=True)

    # Message to classify.
    message = st.text_area("Enter a message:")

    if st.button("Bullying or not?"):
        if not message:
            st.warning("Please enter a message to classify.")
        else:
            # The Word2Vec model is returned by the loader but not used here.
            model, tokenizer, _word2vec_model = load_model_and_tokenizer()

            simulate_progress_bar()

            st.subheader("Pre-processing steps:")
            score = classify_message(message, model, tokenizer)

            # Pick the styled verdict from the score.
            verdict_html = bullying_html if score > 0.5 else no_bullying_html
            st.markdown(verdict_html, unsafe_allow_html=True)

    # Custom CSS is emitted last, after all widgets.
    st.markdown(page_css, unsafe_allow_html=True)

# Build the page only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()