# CYBER BULLYING GUI NOTEBOOK

---

## NCSU CSC 591: Algorithms for Data Guided Business Intelligence

---
As social media usage grows across all age groups, the great majority of individuals rely on this crucial medium for day-to-day communication. Because of the pervasiveness of social media, cyberbullying may affect anybody at any time or from any location, and the internet's relative anonymity makes such personal attacks more difficult to stop than conventional bullying.


In light of this, this dataset comprises over 47000 tweets labeled with the following cyberbullying categories: Age, Ethnicity, Gender, Religion, Other sort of cyberbullying, Not cyberbullying.

Trigger Warning: These tweets either describe a bullying incident or are themselves acts of bullying; read them only as far as you are comfortable.

---

#### Contributors: Anmolika Goyal(agoyal4), Anshul Navinbhai Patel(apatel28), Shubhangi Jain(sjain29)

---



Connect the Google Drive

In [1]:
# Mount Google Drive at /content/gdrive; the model-loading cell further down
# reads its files from paths under this mount point.
# NOTE(review): force_remount=True presumably remounts even when the drive is
# already mounted -- confirm against the Colab documentation.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


### Import Libraries

In [2]:
# Installing the libraries
!pip install emoji==1.6.3
!pip install gradio

Collecting emoji==1.6.3
  Downloading emoji-1.6.3.tar.gz (174 kB)
[?25l[K     |█▉                              | 10 kB 24.3 MB/s eta 0:00:01[K     |███▊                            | 20 kB 10.5 MB/s eta 0:00:01[K     |█████▋                          | 30 kB 8.7 MB/s eta 0:00:01[K     |███████▌                        | 40 kB 8.3 MB/s eta 0:00:01[K     |█████████▍                      | 51 kB 4.5 MB/s eta 0:00:01[K     |███████████▎                    | 61 kB 5.4 MB/s eta 0:00:01[K     |█████████████▏                  | 71 kB 5.5 MB/s eta 0:00:01[K     |███████████████                 | 81 kB 4.2 MB/s eta 0:00:01[K     |█████████████████               | 92 kB 4.6 MB/s eta 0:00:01[K     |██████████████████▉             | 102 kB 5.1 MB/s eta 0:00:01[K     |████████████████████▊           | 112 kB 5.1 MB/s eta 0:00:01[K     |██████████████████████▋         | 122 kB 5.1 MB/s eta 0:00:01[K     |████████████████████████▌       | 133 kB 5.1 MB/s eta 0:00:01[K     |██

In [3]:
# General Libraries
import gradio as gr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re, string
import emoji
from tqdm import tqdm
# Model Saving
import joblib
import pickle
# Scikit-Learn Functions
from sklearn import preprocessing, decomposition, metrics, pipeline
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
# Machine Learning
import xgboost as xgb
# NLTK
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords') 
nltk.download('punkt')
stop_words = stopwords.words('english')
# Keras
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.utils import np_utils
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from tensorflow.keras.layers import BatchNormalization

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### Preprocessing the Dataset

In [4]:
# Define preprocessing functions

# Remove emojis from text
def remove_emoji(txt):
  """Return ``txt`` with every match of the emoji package's regex deleted.

  Relies on ``emoji.get_emoji_regexp()``, so it needs an ``emoji`` release
  that still ships that helper (the notebook installs 1.6.3).
  """
  emoji_pattern = emoji.get_emoji_regexp()
  return re.sub(emoji_pattern, r"", txt)

# Expand common English contractions
def expand_txt(txt):
  """Expand English contractions in ``txt`` (e.g. "she'll" -> "she will").

  The rewrites are applied one after another in the order listed, so the
  specific "can't" rule runs before the generic "n't" / "'t" rules.
  Contractions without a dedicated rule fall through to the generic ones
  ("won't" becomes "wo not").
  """
  rewrites = (
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"can\'t", "can not"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'s", " is"),
    (r"\'m", " am"),
    (r"n\'t", " not"),
    (r"\'t", " not"),
  )
  for pattern, replacement in rewrites:
    txt = re.sub(pattern, replacement, txt)
  return txt

# Remove line breaks, non-ASCII characters, punctuation, and stop words
def clean_nonwanted_chars(txt):
  """Strip line breaks, non-ASCII characters, punctuation and stop words.

  Newlines become spaces and carriage returns are dropped; every character
  outside the ASCII range is deleted; all of ``string.punctuation`` is
  removed; finally the text is split on whitespace and words found in the
  module-level ``stop_words`` list are discarded. The surviving words are
  re-joined with single spaces.
  """
  # Line breaks
  txt = txt.replace('\n', ' ').replace('\r', '')
  # Anything that is not plain ASCII
  txt = re.sub(r'[^\x00-\x7f]',r'', txt)
  # Punctuation, deleted in a single translate pass
  txt = txt.translate(str.maketrans('', '', string.punctuation))
  # Stop words
  kept_words = (word for word in txt.split() if word not in stop_words)
  return ' '.join(kept_words)

# Remove Hashtags
def remove_hash(txt):
  """Drop trailing hashtags and split the remaining ``#`` / ``_`` markers.

  Step 1 removes hashtags that sit at the very end of the text (only other
  hashtags or whitespace may follow them). The literal tag ``#hashtag`` is
  exempt from this step. Step 2 splits on every remaining ``#`` or ``_``
  and re-joins the stripped pieces with single spaces, so "#fun_day" in
  the middle of a sentence becomes "fun day".

  The patterns are raw strings on purpose: in a normal string literal
  ``\b`` is a backspace character rather than a regex word boundary, which
  made the ``#hashtag`` exemption dead code, and ``\w`` / ``\s`` are
  invalid escape sequences.
  """
  trailing_hashtags = r'#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)'
  txt = " ".join(word.strip() for word in re.split(trailing_hashtags, txt))
  txt = " ".join(word.strip() for word in re.split(r'#|_', txt))
  return txt

# Blank out words that contain '&' or '$'
def remove_chars(txt):
    """Blank out every space-separated word that contains '&' or '$'.

    The text is split on single spaces; offending words are replaced by an
    empty string (not removed), so the spaces around them are preserved
    when the pieces are joined back together.
    """
    return ' '.join(
        '' if ('&' in piece or '$' in piece) else piece
        for piece in txt.split(' ')
    )

# Remove multiple spaces and tabs
def remove_space(txt):
  """Collapse every run of two or more whitespace characters into one space.

  A lone whitespace character (including a single tab or newline) is left
  untouched.
  """
  return re.sub(r"\s\s+", " ", txt)

In [5]:
# Process the textual data
def preprocess_text(txt):
  """Normalise one raw tweet into the stemmed form fed to the models.

  The text is lower-cased, passed through the cleaning helpers in a fixed
  order, tokenised with NLTK and Porter-stemmed; the stems are returned as
  a single space-separated string.

  NOTE(review): ``clean_nonwanted_chars`` already deletes all punctuation,
  so ``remove_hash`` and ``remove_chars`` never see '#', '_', '&' or '$'
  when called from here. The order is kept as-is -- presumably the saved
  models were trained on text produced by exactly this sequence; confirm
  before reordering.
  """
  txt = txt.lower()
  cleaning_steps = (
    remove_emoji,
    expand_txt,
    clean_nonwanted_chars,
    remove_hash,
    remove_chars,
    remove_space,
  )
  for step in cleaning_steps:
    txt = step(txt)
  # Stem each token and glue the stems back together
  stemmer = nltk.stem.PorterStemmer()
  return ' '.join([stemmer.stem(token) for token in nltk.word_tokenize(txt)])

In [None]:
# Github Models

# Load all the variables and models directly from the project's GitHub repo.
#
# open() and joblib.load() only understand local paths, so each artifact is
# downloaded first and handed over as an in-memory binary stream. The
# "/raw/" form of the URL is used because the "/blob/" form returns the HTML
# page that wraps the file, not the file itself.
#
# SECURITY NOTE: pickle/joblib execute arbitrary code while loading. Only run
# this cell against a repository you trust.
import io
import urllib.request

GITHUB_MODELS_URL = 'https://github.com/anshulp2912/Cyberbullying_Tweet_Classification/raw/main/models/'

def _fetch_github_artifact(filename):
  """Download ``filename`` from the models folder and return it as a BytesIO."""
  with urllib.request.urlopen(GITHUB_MODELS_URL + filename) as response:
    return io.BytesIO(response.read())

LE = pickle.load(_fetch_github_artifact('LE.pkl'))
CV = pickle.load(_fetch_github_artifact('CountVectorizer.pkl'))
TFIDF = pickle.load(_fetch_github_artifact('TFIDFVectorizer.pkl'))
SVD = pickle.load(_fetch_github_artifact('SVD.pkl'))
SVMScaler = pickle.load(_fetch_github_artifact('SVMScaler.pkl'))
embeddings = pickle.load(_fetch_github_artifact('embeddings.pkl'))
NNScaler = pickle.load(_fetch_github_artifact('NNScaler.pkl'))
token = pickle.load(_fetch_github_artifact('token.pkl'))
word_index = pickle.load(_fetch_github_artifact('word_index.pkl'))

model_CV_LR = joblib.load(_fetch_github_artifact('model_CV_LR.sav'))
model_TFIDF_LR = joblib.load(_fetch_github_artifact('model_TFIDF_LR.sav'))
model_CV_NB = joblib.load(_fetch_github_artifact('model_CV_NB.sav'))
model_TFIDF_NB = joblib.load(_fetch_github_artifact('model_TFIDF_NB.sav'))
model_CV_XG = joblib.load(_fetch_github_artifact('model_CV_XG.sav'))
model_TFIDF_XG = joblib.load(_fetch_github_artifact('model_TFIDF_XG.sav'))
model_CV_SVM = joblib.load(_fetch_github_artifact('model_CV_SVM.sav'))
model_TFIDF_SVM = joblib.load(_fetch_github_artifact('model_TFIDF_SVM.sav'))
GLOVE_XB = joblib.load(_fetch_github_artifact('GLOVE_XB.sav'))
vanillann = joblib.load(_fetch_github_artifact('vanillann.sav'))
biLSTM = joblib.load(_fetch_github_artifact('biLSTM.sav'))


In [6]:
# Load from Google Drive

# Load all the variables and models from the shared-drive models folder.
# The folder is named once so that moving the artifacts is a one-line change.
DRIVE_MODELS_DIR = '/content/gdrive/Shareddrives/ADBI_Capstone/models'

def _load_drive_pickle(filename):
  """Unpickle ``filename`` from the models folder.

  The ``with`` block closes the file on exit, so no explicit close() call
  is needed afterwards.
  """
  with open(DRIVE_MODELS_DIR + '/' + filename, 'rb') as input_file:
    return pickle.load(input_file)

# Fitted preprocessing objects and lookup tables
LE = _load_drive_pickle('LE.pkl')
CV = _load_drive_pickle('CountVectorizer.pkl')
TFIDF = _load_drive_pickle('TFIDFVectorizer.pkl')
SVD = _load_drive_pickle('SVD.pkl')
SVMScaler = _load_drive_pickle('SVMScaler.pkl')
embeddings = _load_drive_pickle('embeddings.pkl')
NNScaler = _load_drive_pickle('NNScaler.pkl')
token = _load_drive_pickle('token.pkl')
word_index = _load_drive_pickle('word_index.pkl')

# Trained classifiers
model_CV_LR = joblib.load(DRIVE_MODELS_DIR + '/model_CV_LR.sav')
model_TFIDF_LR = joblib.load(DRIVE_MODELS_DIR + '/model_TFIDF_LR.sav')
model_CV_NB = joblib.load(DRIVE_MODELS_DIR + '/model_CV_NB.sav')
model_TFIDF_NB = joblib.load(DRIVE_MODELS_DIR + '/model_TFIDF_NB.sav')
model_CV_XG = joblib.load(DRIVE_MODELS_DIR + '/model_CV_XG.sav')
model_TFIDF_XG = joblib.load(DRIVE_MODELS_DIR + '/model_TFIDF_XG.sav')
model_CV_SVM = joblib.load(DRIVE_MODELS_DIR + '/model_CV_SVM.sav')
model_TFIDF_SVM = joblib.load(DRIVE_MODELS_DIR + '/model_TFIDF_SVM.sav')
GLOVE_XB = joblib.load(DRIVE_MODELS_DIR + '/GLOVE_XB.sav')
vanillann = joblib.load(DRIVE_MODELS_DIR + '/vanillann.sav')
biLSTM = joblib.load(DRIVE_MODELS_DIR + '/biLSTM.sav')




In [7]:
# Word Vector Functions

#Using word_tokenize to create vectors which are normalized for the whole sentence
def tokenized_sentence(s):
    """Turn a sentence into one L2-normalised embedding vector.

    The sentence is lower-cased and tokenised with ``word_tokenize``; stop
    words and tokens that are not purely alphabetic are discarded. The
    embedding vectors of the remaining words are summed and the sum is
    scaled to unit length.

    Returns:
        A NumPy array. If none of the words has an embedding, a zero vector
        of length 300 is returned. If the summed vector has zero length, it
        is returned unscaled, which avoids a 0/0 division that would yield
        NaNs.

    NOTE(review): assumes ``embeddings`` raises ``KeyError`` for unknown
    words (dict-like lookup) and holds 300-dimensional vectors -- confirm
    against the pickled object.
    """
    words = word_tokenize(str(s).lower())
    words = [word for word in words if word not in stop_words and word.isalpha()]

    found = []
    for word in words:
        try:
            found.append(embeddings[word])
        except KeyError:
            # Out-of-vocabulary word: skip it instead of failing the sentence
            continue

    if not found:
        return np.zeros(300)

    vectors = np.array(found).sum(axis=0)
    norm = np.sqrt((vectors ** 2).sum())
    if norm == 0:
        # The embeddings cancelled out exactly; dividing would produce NaNs
        return vectors
    return vectors / norm

In [8]:
def predict_LR(input_text, wordset):
  """Classify ``input_text`` with logistic regression.

  ``wordset`` equal to 'CV' selects the count-vectorizer features and
  model; any other value selects the TF-IDF pair. The predicted class id
  is mapped back to its label with ``LE`` and the label of the first item
  is returned.
  """
  if wordset == 'CV':
    vectorizer, classifier = CV, model_CV_LR
  else:
    vectorizer, classifier = TFIDF, model_TFIDF_LR
  features = vectorizer.transform(input_text)
  labels = LE.inverse_transform(classifier.predict(features))
  return labels[0]

def predict_NB(input_text, wordset):
  """Classify ``input_text`` with the naive Bayes models.

  ``wordset`` equal to 'CV' selects the count-vectorizer features and
  model; any other value selects the TF-IDF pair. Returns the decoded
  label of the first item.
  """
  if wordset == 'CV':
    vectorizer, classifier = CV, model_CV_NB
  else:
    vectorizer, classifier = TFIDF, model_TFIDF_NB
  features = vectorizer.transform(input_text)
  labels = LE.inverse_transform(classifier.predict(features))
  return labels[0]

def predict_XG(input_text, wordset):
  """Classify ``input_text`` with the XGBoost models.

  ``wordset`` equal to 'CV' selects the count-vectorizer features and
  model; any other value selects the TF-IDF pair. Returns the decoded
  label of the first item.
  """
  if wordset == 'CV':
    vectorizer, classifier = CV, model_CV_XG
  else:
    vectorizer, classifier = TFIDF, model_TFIDF_XG
  features = vectorizer.transform(input_text)
  labels = LE.inverse_transform(classifier.predict(features))
  return labels[0]

def predict_SVM(input_text, wordset):
  """Classify ``input_text`` with the SVM models.

  The text is vectorised (count vectorizer when ``wordset`` is 'CV',
  TF-IDF otherwise), reduced with ``SVD``, scaled with ``SVMScaler`` and
  passed to the matching SVM. Returns the decoded label of the first item.

  NOTE(review): the same ``SVD`` and ``SVMScaler`` objects are applied to
  both feature sets -- confirm that they were fitted for both.
  """
  if wordset == 'CV':
    vectorizer, classifier = CV, model_CV_SVM
  else:
    vectorizer, classifier = TFIDF, model_TFIDF_SVM
  features = vectorizer.transform(input_text)
  features = SVMScaler.transform(SVD.transform(features))
  labels = LE.inverse_transform(classifier.predict(features))
  return labels[0]

def predict_glove(input_text):
  """Classify ``input_text`` with the ``GLOVE_XB`` model.

  Each sentence is converted to a sentence vector by
  ``tokenized_sentence`` (with a tqdm progress bar). Returns the decoded
  label of the first item.
  """
  sentence_vectors = np.array(
    [tokenized_sentence(sentence) for sentence in tqdm(input_text)]
  )
  class_ids = GLOVE_XB.predict(sentence_vectors)
  return LE.inverse_transform(class_ids)[0]

def predict_vanilla_ann(input_text):
  """Classify ``input_text`` with the ``vanillann`` network.

  Sentence vectors from ``tokenized_sentence`` are scaled with
  ``NNScaler``; the class with the highest network output is decoded with
  ``LE``. Returns the label of the first item.
  """
  sentence_vectors = np.array(
    [tokenized_sentence(sentence) for sentence in tqdm(input_text)]
  )
  scaled = NNScaler.transform(sentence_vectors)
  class_ids = vanillann.predict(scaled).argmax(axis=-1)
  return LE.inverse_transform(class_ids)[0]

def predict_biLSTM(input_text):
  """Classify ``input_text`` with the ``biLSTM`` network.

  The text is converted to integer sequences with ``token``, padded to a
  length of 100, and the class with the highest network output is decoded
  with ``LE``. Returns the label of the first item.
  """
  sequences = token.texts_to_sequences(input_text)
  padded = sequence.pad_sequences(sequences, maxlen=100)
  class_ids = biLSTM.predict(padded).argmax(axis=-1)
  return LE.inverse_transform(class_ids)[0]

In [9]:
def prediction(input_text, model, word_set):
  """Preprocess one tweet and classify it with the selected model.

  Args:
    input_text: raw tweet text.
    model: display name of the model chosen in the UI. Any name that is
      not recognised falls through to the Bi-LSTM.
    word_set: 'Count Vectorizer' selects count features; any other value
      selects TF-IDF. Only used by the four vectorizer-based models.

  Returns:
    The predicted label as produced by the chosen ``predict_*`` helper.
  """
  # The predictors expect a batch, so wrap the single tweet in an array
  batch = np.array([preprocess_text(input_text)])

  vectorizer_based = (
    ("Logistic Regression", predict_LR),
    ("Naive Bayes", predict_NB),
    ("XGBoost", predict_XG),
    ("Support Vector Machine", predict_SVM),
  )
  for name, predictor in vectorizer_based:
    if model == name:
      wordset = 'CV' if word_set == 'Count Vectorizer' else 'TFIDF'
      return predictor(batch, wordset)

  if model == "Sent2Vec":
    return predict_glove(batch)
  if model == "Vanilla ANN":
    return predict_vanilla_ann(batch)
  return predict_biLSTM(batch)

In [10]:
# Gradio application
# Three inputs (tweet text, model choice, feature-set choice) are passed
# positionally to prediction(); its return value is shown as text.
model_choices = ["Logistic Regression", "Naive Bayes", "XGBoost", "Support Vector Machine","Sent2Vec","Vanilla ANN", "Bi-LSTM"]
wordset_choices = ["Count Vectorizer", "TFIDF"]
app_description = "As social media usage grows across all age groups, the great majority of individuals rely on this crucial medium for day-to-day communication. Because of the pervasiveness of social media, cyberbullying may affect anybody at any time or from any location, and the internet's relative anonymity makes such personal attacks more difficult to stop than conventional bullying."

app_gradio = gr.Interface(
    fn = prediction,
    inputs = ["text", gr.inputs.Radio(model_choices), gr.inputs.Radio(wordset_choices)],
    outputs = "text",
    title="CYBERBULLYING TEXT CLASSIFICATION",
    description=app_description
)
app_gradio.launch()

Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
Running on public URL: https://20260.gradio.app

This share link expires in 72 hours. For free permanent hosting, check out Spaces (https://huggingface.co/spaces)


(<fastapi.applications.FastAPI at 0x7fd639aba2d0>,
 'http://127.0.0.1:7860/',
 'https://20260.gradio.app')