# Data Modelling for Cyberbullying Classifier

## Import required libraries

In [4]:
import sys
!{sys.executable} -m pip install imbalanced-learn


Collecting imbalanced-learn
  Using cached imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Using cached sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Using cached imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Using cached sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn
Successfully installed imbalanced-learn-0.13.0 sklearn-compat-0.1.3



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: C:\Users\panka\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import joblib
# !pip install imbalanced-learn
from imblearn.over_sampling import SMOTE

## Data Understanding

In [2]:
data = pd.read_csv('cyberbullying_dataset.csv')

In [3]:
data.head()

Unnamed: 0,Text,oh_label
0,`- This is not ``creative``. Those are the di...,0.0
1,` :: the term ``standard model`` is itself le...,0.0
2,"True or false, the situation as of March 200...",0.0
3,"Next, maybe you could work on being less cond...",0.0
4,This page will need disambiguation.,0.0


In [4]:
# Check the data length
len(data)

231802

#### So the data consists of text data and their respective labels

In [5]:
data['oh_label'].value_counts()

oh_label
0.0    204093
1.0     27706
Name: count, dtype: int64

## Checking for missing data

In [6]:
data.isnull().sum()

Text        1
oh_label    3
dtype: int64

#### We have very less null values removing the null values

In [7]:
data.dropna(inplace=True)

In [8]:
# Check the data length again
len(data)

231799

In [9]:
data.shape

(231799, 2)

## Text Processing

### Stopwords and punctuations removal

In [10]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)         # remove URLs
    text = re.sub(r'@\w+|#\w+', '', text)                        # remove mentions and hashtags
    text = re.sub(r'[^a-z0-9\s]', '', text)                         # keep only alphabets and spaces
    text = re.sub(r'\s+', ' ', text).strip()                     # remove extra spaces
    return text


In [11]:
data['clean_text'] = data['Text'].apply(preprocess_text)

KeyboardInterrupt: 

In [11]:
data

Unnamed: 0,Text,oh_label
0,`- This is not ``creative``. Those are the di...,0.0
1,` :: the term ``standard model`` is itself le...,0.0
2,"True or false, the situation as of March 200...",0.0
3,"Next, maybe you could work on being less cond...",0.0
4,This page will need disambiguation.,0.0
...,...,...
231797,She pretty I love this song I miss the old kel...,1.0
231798,Status-Online Im ZxkillergirlzX! I'm Zxkillerg...,0.0
231799,JR so cute EXO M Better I agree like yeah yeah...,0.0
231800,! !,0.0


In [None]:
data['clean_text'][231800]

#### Here we might have left with only the single spaces. let's evaluate that

In [None]:
original_len = len(data)
data = data[data['clean_text'].str.strip() != '']
print(f"Removed {original_len - len(data)} empty rows after cleaning.")


In [None]:
# Display the data again
data

## Label Encoding

In [12]:
le = LabelEncoder()
data['label'] = le.fit_transform(data['oh_label'])

In [13]:
data

Unnamed: 0,Text,oh_label,label
0,`- This is not ``creative``. Those are the di...,0.0,0
1,` :: the term ``standard model`` is itself le...,0.0,0
2,"True or false, the situation as of March 200...",0.0,0
3,"Next, maybe you could work on being less cond...",0.0,0
4,This page will need disambiguation.,0.0,0
...,...,...,...
231797,She pretty I love this song I miss the old kel...,1.0,1
231798,Status-Online Im ZxkillergirlzX! I'm Zxkillerg...,0.0,0
231799,JR so cute EXO M Better I agree like yeah yeah...,0.0,0
231800,! !,0.0,0


In [14]:
data['label'].value_counts()

label
0    204093
1     27706
Name: count, dtype: int64

## Train-Test Split

In [15]:
# 1. Vectorize text
#X = data['clean_text']
X = data['Text']
y = data['label']


In [20]:

tfidf = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf.fit_transform(X)


In [21]:
# 2. Apply SMOTE to balance classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_tfidf, y)

# 3. Now split the balanced dataset
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

In [22]:
X_train.shape, y_train.shape

((326548, 257614), (326548,))

In [23]:
y_train.value_counts()

label
1    163274
0    163274
Name: count, dtype: int64

In [24]:
X_test.shape, y_test.shape

((81638, 257614), (81638,))

# Model Training

In [25]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)


In [26]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.86      0.89     40819
           1       0.87      0.93      0.90     40819

    accuracy                           0.90     81638
   macro avg       0.90      0.90      0.90     81638
weighted avg       0.90      0.90      0.90     81638



### Save the model and vectorizer

In [27]:
joblib.dump((tfidf, model), "cyberbullying_pipeline_deploy.pkl")

['cyberbullying_pipeline_deploy.pkl']

### Testing the pipeline

In [28]:
# Load the saved model
vectorizer, clf = joblib.load("cyberbullying_pipeline_deploy.pkl")

def predict(text):
    X = vectorizer.transform([text])
    proba = clf.predict_proba(X)[0]
    label = clf.predict(X)[0]
    return label, round(max(proba), 2)  # e.g., (1, 0.87)

### Test the model with random texts

In [29]:
predict("You're such a loser. No one wants you here.")

(1, 1.0)

In [31]:
predict("Wow, you're really smart... not.")

(1, 0.77)

In [32]:
predict("Hey, just checking in to see how you're doing today.")

(0, 0.78)

In [33]:
predict("Nice job messing that up, genius.")

(0, 0.63)

In [34]:
text = "You're such a loser. No one wants you here."
label, confidence = predict(text)
print(f"Prediction: {'Cyberbullying' if label == 1 else 'Not Cyberbullying'} ({confidence * 100:.1f}% confidence)")


Prediction: Cyberbullying (100.0% confidence)


#### Other Models

In [23]:
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

print("Naive Bayes:\n", classification_report(y_test, y_pred_nb))


Naive Bayes:
               precision    recall  f1-score   support

           0       0.85      0.88      0.86     40819
           1       0.88      0.84      0.86     40819

    accuracy                           0.86     81638
   macro avg       0.86      0.86      0.86     81638
weighted avg       0.86      0.86      0.86     81638



In [24]:
svm = LinearSVC(class_weight='balanced', max_iter=1000)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

print("SVM (Linear):\n", classification_report(y_test, y_pred_svm))



SVM (Linear):
               precision    recall  f1-score   support

           0       0.96      0.86      0.90     40819
           1       0.87      0.96      0.91     40819

    accuracy                           0.91     81638
   macro avg       0.91      0.91      0.91     81638
weighted avg       0.91      0.91      0.91     81638



In [16]:
final_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', lowercase=True)),
    ('clf', LogisticRegression(class_weight='balanced', max_iter=1000))
])


In [17]:
# Train
final_pipeline.fit(X_train, y_train)

NameError: name 'X_train' is not defined