# Import the Libraries

In [None]:
# Download the Dependencies
# Each `!pip` line shells out to pip; the installs run top to bottom, one package
# (or package group) per line. None of the versions are pinned.
!pip install emoji
!pip install symspellpy
!pip install contractions
!pip install transformers
!pip install google-genai
!pip install huggingface_hub
!pip install sentencepiece accelerate
# NOTE(review): transformers was already installed from PyPI a few lines up; this
# install from the GitHub repository runs afterwards, so it presumably replaces
# that version — confirm which one is intended.
!pip install git+https://github.com/huggingface/transformers.git
!pip install torchaudio
# -q is passed for this install only.
!pip install -q bitsandbytes

In [None]:
import pandas as pd
import numpy as np
import os
import gdown
import time

import re
import html
import emoji
import string
import symspellpy
import contractions
from bs4 import BeautifulSoup
from textblob import Word

import spacy
nlp = spacy.load("en_core_web_sm")

import torch
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from transformers import pipeline

from huggingface_hub import login

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')
pd.set_option('display.max_colwidth', None)

import warnings
warnings.filterwarnings('ignore')

**Mounting google drive Or Getting the Data Files**

In [None]:
# Mounting google Drive for Colab
# from google.colab import drive
# drive.mount('/content/drive')

# Getting the data files 
!gdown --id 1SQ6KjMklH6ShI_KWSz_yU31WFrwofu6E
!gdown --id 1U1WAYYonIfMxvo_rPTFev92l3Z2ptpK7

# Importing and Exploring the data

In [None]:
# Move to a specific project directory
# %cd /content/drive/MyDrive/Cyberbullying_Detection

**Loading the Dataset**

In [None]:
# Load both dataset files and tag every row with its class in a new `label`
# column: 1 for the bullying samples, 0 for the non-bullying samples.
bully = pd.read_csv('/kaggle/working/bully_data.csv').assign(label=1)
non_bully = pd.read_csv('/kaggle/working/not_bully_data.csv').assign(label=0)

**Exploring the dataset**

In [None]:
# Print the Dataframes Shape
print(f'The shape of bully data is {bully.shape}')
print(f'The shape of non_bully data is {non_bully.shape}')

In [None]:
# Concatenate both datasets, keeping only the post and label columns
all_data = pd.concat([bully.loc[:,['post','label']], non_bully.loc[:,['post','label']]], axis=0).reset_index(drop=True)
all_data.head()

In [None]:
all_data.info()

# Preprocessing Phase

In [None]:
# Dropping the null samples if exists in the dataset
all_data.dropna(inplace=True)

**Remove the HTML Tags**

In [None]:
# Uses BeautifulSoup to strip HTML tags such as <br> from the data
def remove_tags(text):
  """Parse ``text`` with the built-in "html.parser" and return only its text content."""
  return BeautifulSoup(text, "html.parser").get_text()

In [None]:
# Applying the remove tags function to the post column
all_data.post = all_data.post.apply(remove_tags)

**Normalize the Repeated Characters**<br>
Some samples in the data contain words with many repeated characters, such as `plaaaaaaay`, which is a misspelling of `play`. We therefore shorten every run of more than 2 identical characters down to only 2 characters, so that the later text-correction steps can handle these words.

In [None]:
# Collapse long runs of one repeated character inside a piece of text
# The pattern matches any single character followed by two or more copies of itself.
REPEATED_CHAR_PATTERN = re.compile(r"(.)\1{2,}")

def normalize_repeated_chars(word, max_repeats=2):
    """Shorten every run of 3+ identical characters to ``max_repeats`` copies.

    Falsy input (``None``, empty string) is handed back untouched.
    """
    if word:
        def _shorten(run):
            # group(1) is the single character that the run is made of
            return run.group(1) * max_repeats

        return REPEATED_CHAR_PATTERN.sub(_shorten, word)
    return word

In [None]:
# Applying the normalize_repeated_chars function to the post column
all_data.post = all_data.post.apply(normalize_repeated_chars)

**HTML Unescape**<br>
When text is scraped from the web, it often includes special characters written in HTML-safe form — for example:

- `&amp;` instead of `&` <br>
- `&lt;` instead of `<` <br>
- `&gt;` instead of `>` <br>
- `&quot;` instead of `"` <br>
- `&#39;` instead of `'` <br>

In [None]:
# Convert HTML entities (e.g. &amp;, &lt;) in the post column back to plain characters
all_data.post = all_data.post.apply(html.unescape)

**URL Removal**

In [None]:
# This function uses regular expressions to remove the URLs
# Two alternatives are matched, case-insensitively:
#   * `https?://\S+`   - a real scheme is required, so ordinary words that merely
#                        start with "http" are left alone.
#   * `\bw{2,3}\.\S+`  - a scheme-less "www." address. The dot is escaped and the
#                        match must start at a word boundary, otherwise words such
#                        as "awwwesome" would be partly deleted. "ww." is accepted
#                        too because normalize_repeated_chars runs earlier in this
#                        notebook and shortens "www" to "ww".
URL_PATTERN = re.compile(r"https?://\S+|\bw{2,3}\.\S+", re.IGNORECASE)

def remove_url(text):
    """Delete every URL from ``text``.

    Args:
        text: the string to clean; falsy values (``None``, ``""``) are returned as-is.

    Returns:
        ``text`` with each URL replaced by an empty string. Surrounding whitespace
        is not touched, so a removed URL can leave a double space behind.
    """
    if not text:
        return text
    return URL_PATTERN.sub("", text)

In [None]:
# apply the remove_url function to column post
all_data.post = all_data.post.apply(remove_url)

**Emoji Removal**

In [None]:
# Strip emojis from the text, if it contains any
def remove_emojis(text):
    """Return ``text`` with every emoji replaced by an empty string.

    Falsy input (``None``, empty string) is returned unchanged.
    """
    if text:
        return emoji.replace_emoji(text, "")
    return text

In [None]:
# Apply the remove_emojis to the post column
all_data.post = all_data.post.apply(remove_emojis)

**Remove Q/A tags**
<br>
The `post` column was built by merging two other columns, `Question` and `Answer`.<br>
During the merge the question part was tagged as `Q: question` and the answer part as `A: answer`, so we need to remove these tags.

In [None]:
# Regex-based helper that strips the "Q:" / "A:" markers
def remove_aq(text: str) -> str:
    """Delete every "A:" and "Q:" occurrence from ``text`` and trim the ends.

    The input is converted with ``str()`` first, so non-string values are accepted.
    """
    as_string = str(text)
    untagged = re.sub(r'(A:|Q:)', '', as_string)
    return untagged.strip()

In [None]:
# Apply the remove_aq function to the column post
all_data.post = all_data.post.apply(remove_aq)

**Stripping the text**

In [None]:
# Strip the additional spaces if exists
all_data.post = all_data.post.str.strip()

**Remove the Non-ASCII Characters**

In [None]:
# Drop every character that falls outside the ASCII range (i.e. non-English symbols)
def remove_non_ascii(text):
    """Return ``text`` without its non-ASCII characters; falsy input is returned as-is."""
    if text:
        # Characters that cannot be encoded as ASCII are skipped by errors="ignore"
        ascii_bytes = text.encode("ascii", errors="ignore")
        return ascii_bytes.decode()
    return text

In [None]:
# applying the function remove_non_ascii to the column post
all_data.post = all_data.post.apply(remove_non_ascii)

**Remove Numbers**

In [None]:
# Strip all digit sequences out of the data
def remove_numbers(text: str) -> str:
    """Delete every run of digits from ``text`` and trim leading/trailing whitespace.

    The input is converted with ``str()`` first, so non-string values are accepted.
    """
    without_digits = re.sub(r'\d+', '', str(text))
    return without_digits.strip()

In [None]:
# Apply the remove_numbers to the post column
all_data.post = all_data.post.apply(remove_numbers)

**Lowering**

In [None]:
# Set all the text into Lower Case in post Column
all_data.post = all_data.post.str.lower()

**Punctuation Removal**

In [None]:
# Function to remove the Punctuation
# Punctuation marks that are kept; everything else in string.punctuation is removed.
_KEPT_PUNCTUATION = {'.', ',', "'"}

# Character class of all removable punctuation. Built once at definition time
# instead of on every call; sorted() only makes the pattern deterministic.
_REMOVABLE_PUNCT_PATTERN = re.compile(
    "[" + re.escape("".join(sorted(set(string.punctuation) - _KEPT_PUNCTUATION))) + "]"
)
# A comma that is glued to the next word ("hi,there").
_COMMA_WITHOUT_SPACE_PATTERN = re.compile(r",(?=[^\s,])")
# Whitespace that precedes one of the kept punctuation marks ("word ,").
_SPACE_BEFORE_KEPT_PATTERN = re.compile(r"\s+([.,'])")


def remove_punctuation(text: str) -> str:
    """Replace unwanted punctuation with spaces and normalise the spacing.

    Periods, commas and apostrophes are kept; every other character from
    ``string.punctuation`` becomes a space.

    Spacing is normalised with plain whitespace splitting rather than
    ``nltk.word_tokenize``. The Treebank tokenizer behind ``word_tokenize``
    breaks words apart ("don't" -> "do n't", "cannot" -> "can not",
    "gonna" -> "gon na"), which corrupted the text for the later steps.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string with single spaces between words, no space before
        ``.``, ``,`` or ``'`` and one space after each comma.
    """
    # Replace removable punctuation with space
    text = _REMOVABLE_PUNCT_PATTERN.sub(" ", text)

    # Make sure a comma never glues two words together
    text = _COMMA_WITHOUT_SPACE_PATTERN.sub(", ", text)

    # Collapse every run of whitespace into one space and trim both ends
    cleaned = " ".join(text.split())

    # Clean up spacing before kept punctuation
    cleaned = _SPACE_BEFORE_KEPT_PATTERN.sub(r"\1", cleaned)

    return cleaned.strip()

In [None]:
all_data.post = all_data.post.apply(remove_punctuation)

**Text Correction and Abbreviation Expansion**
- This step uses the LLM `microsoft/phi-2` to correct misspelled text and to expand any formal or informal abbreviations that exist in the data, e.g. *LOL* becomes *Laugh Out Loud*.

In [None]:
# Load the model & tokenizer
# Model identifier passed to both from_pretrained() calls below.
model_name = "microsoft/phi-2"
# NOTE(review): trust_remote_code=True is passed to both loaders — presumably
# required by this model repository; confirm it is still needed.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype="auto"
)

# Text-generation pipeline wrapping the model and tokenizer loaded above.
# Defaults set here: at most 128 new tokens, sampling switched off
# (do_sample=False), and the tokenizer's EOS token id used as the padding id.
phi_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id
)

In [None]:
def fix_text_phi2(text: str) -> str:
    """Correct spelling and expand abbreviations in ``text`` with the phi-2 pipeline.

    Args:
        text: one post, already cleaned by the earlier preprocessing steps.

    Returns:
        The first line of the model's answer, lower-cased and with every
        character that is neither a word character nor whitespace removed.
        Blank input returns ``""`` without calling the model. If the model
        produces no usable text, the input itself is cleaned and returned, so
        the sample is not silently emptied.
    """
    # Nothing to correct - a blank "Input:" would only make the model invent text
    if not text or not text.strip():
        return ""

    # Minimal, strict prompt
    prompt = f"""Instruct: Expand abbreviations and correct spelling only, do not paraphrase, KEEP SLANG, PROFANITY and BAD WORDS OF MOUTH AS-IS, Do not change meaning and do not rephrase, tone, or word order.  
Input: {text}  
Output:"""

    # return_full_text=False makes the pipeline return only the newly generated
    # text. Previously the prompt was included and the answer was located with
    # split("Output:")[-1], which picks the LAST "Output:" - i.e. a made-up
    # follow-up example whenever the model keeps generating extra
    # "Input: ... Output: ..." turns after its real answer.
    completion = phi_pipe(
        prompt,
        max_new_tokens=128,
        do_sample=False,
        return_full_text=False,
    )[0]["generated_text"]

    # Take only first line of the answer
    out = completion.strip().split("\n")[0].strip()

    # Fall back to the input when the model returned nothing
    if not out:
        out = text

    # Lowercase & remove punctuation
    out = re.sub(r"[^\w\s]", "", out).lower()

    return out

In [None]:
# Start timing
start_time = time.time()

all_data.post = all_data.post.apply(fix_text_phi2)

# End timing
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time
print(f"Transformation completed in {elapsed_time:.4f} seconds")

In [None]:
# Saving the result data
all_data.to_csv("all_data.csv", index=False)

***

**Loading the saved results**

In [None]:
# Read the csv file of the processed data
all_data = pd.read_csv('all_data.csv')

In [None]:
# Printing the Information about the dataset
all_data.info()

In [None]:
# Drop the Null Values if exists after preprocessing
all_data.dropna(inplace=True)

**Contraction Fix**<br>
- After inspecting the dataset we found two types of contractions:
  - Contractions written without the apostrophe, e.g. `im` (meant to be `i'm`), which we need to expand to `i am`
  - Properly written contractions such as `don't`, which we need to expand to `do not` using the following function:

In [None]:
# Dictionary mapping contractions (without apostrophes) to expanded forms.
# Keys are regex patterns anchored on word boundaries; values are the replacements.
#
# Notes:
#   * The text-correction step that runs before this one strips every apostrophe,
#     so the "won't" / "that's" patterns make the apostrophe optional - with a
#     mandatory apostrophe they could never match.
#   * There is deliberately no entry for "were": it is the ordinary past tense of
#     "are" far more often than a misspelt "we're", and rewriting it to "we are"
#     corrupts sentences ("you were right" -> "you we are right").
#   * NOTE(review): "its", "ill" and "id" are also real English words; they are
#     kept as in the original mapping - confirm this trade-off suits the dataset.
expansions = {
    r"\bim\b": "i am",
    r"\bits\b": "it is",
    r"\bive\b": "i have",
    r"\bill\b": "i will",
    r"\bid\b": "i would",
    r"\bdont\b": "do not",
    r"\bdoesnt\b": "does not",
    r"\bdidnt\b": "did not",
    r"\bhasnt\b": "has not",
    r"\bhavent\b": "have not",
    r"\bhadnt\b": "had not",
    r"\bwon'?t\b": "will not",
    r"\bwouldnt\b": "would not",
    r"\bcant\b": "cannot",
    r"\bcouldnt\b": "could not",
    r"\bshouldnt\b": "should not",
    r"\bwasnt\b": "was not",
    r"\bwerent\b": "were not",
    r"\barent\b": "are not",
    r"\bitll\b": "it will",
    r"\btheyre\b": "they are",
    r"\byoure\b": "you are",
    r"\bwhats\b": "what is",
    r"\btheres\b": "there is",
    r"\bisnt\b": "is not",
    r"\blets\b": "let us",
    r"\byall\b": "you all",
    r"\bhes\b": "he is",
    r"\bshes\b": "she is",
    r"\bthat'?s\b": "that is",
    r"\bwhos\b": "who is",
}

In [None]:
# Expands contractions by applying every pattern of the dictionary defined above
def expand_contractions(text: str) -> str:
    """Lower-case ``text`` and apply each regex replacement from ``expansions`` in order."""
    expanded = text.lower()
    for regex in expansions:
        expanded = re.sub(regex, expansions[regex], expanded)
    return expanded

# Apply to the column post
all_data["post"] = all_data["post"].apply(expand_contractions)

In [None]:
# Expand the contractions that are already written correctly in the data
all_data.post = all_data.post.apply(contractions.fix)

In [None]:
# Show sample of the dataset after the previous processing steps
all_data

**Ensuring No Punctuation**

In [None]:
# The LLM may have added dots or commas to follow grammar rules, so strip any that exist
# Translation table that maps every character of string.punctuation to None (deletion).
PUNCT_TABLE = str.maketrans(dict.fromkeys(string.punctuation))

def remove_punct(text):
    """Return ``text`` without any ``string.punctuation`` character; falsy input is returned as-is."""
    if text:
        return text.translate(PUNCT_TABLE)
    return text

# Apply the function to column post
all_data.post = all_data.post.apply(remove_punct)

**Named Entity Recognition Replacement**
- This function replaces named entities in the text (e.g., person names, locations, organizations) with their corresponding entity labels (e.g., PERSON, LOCATION, ORGANIZATION) to generalize the text.


In [None]:
# Function to replace each named entity, if any exists, with its entity label
def replace_named_entities(text: str) -> str:
    """Replace every named entity in ``text`` with its lower-cased entity label.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens that
    are not part of an entity are kept (lower-cased). For each entity only its
    first token emits the label; the remaining tokens of that entity are dropped,
    so a multi-word entity collapses into a single label.

    Args:
        text: the string to process.

    Returns:
        The resulting tokens joined with single spaces.
    """
    doc = nlp(text)
    new_tokens = []

    for token in doc:
        if not token.ent_type_:
            # Ordinary token - keep it
            new_tokens.append(token.text.lower())
        elif token.ent_iob_ == "B":
            # "B" marks the first token of an entity: emit the label once.
            # The former extra clause `token.i == token.ent_iob_ == 1` compared an
            # integer index with this string tag, so it could never be true.
            new_tokens.append(token.ent_type_.lower())
        # Any other token inside an entity is skipped

    return " ".join(new_tokens)

In [None]:
# Apply the replace_named_entities to the column post
all_data.post = all_data.post.apply(replace_named_entities)

**Tokenization, Lemmatization and Stopwords Removal**

In [None]:
# Initialize tools
# Lemmatizer instance used by preprocess_text below.
lemmatizer = WordNetLemmatizer()
# NLTK's English stop-word list, stored as a set for membership checks.
stop_words = set(stopwords.words('english'))

# Define words to always keep (even if stopwords or lemmatized)
# preprocess_text keeps the ORIGINAL token (not its lemma) whenever the token or
# its lemma appears in this set.
important_words = {"ass", "bitch", "fuck", "shit", "slut", "dumb", "stupid"}

# Translate a POS tag into the matching WordNet part-of-speech constant
def get_wordnet_pos(tag):
    """Map a POS tag to a WordNet POS by its first letter.

    J -> adjective, V -> verb, N -> noun, R -> adverb; any other tag falls back
    to noun.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(tag[:1], wordnet.NOUN)

# Tokenize, lemmatize and filter one text
def preprocess_text(text):
    """Turn ``text`` into a list of cleaned tokens.

    The text is lower-cased, tokenized and POS-tagged, and each token is
    lemmatized with the WordNet POS derived from its tag. A token is kept in its
    original form when it (or its lemma) is in ``important_words``; otherwise its
    lemma is kept only if it is purely alphabetic and not a stop word.

    Returns:
        list of str: the surviving tokens/lemmas, in their original order.
    """
    tagged_tokens = pos_tag(word_tokenize(text.lower()))  # lower-casing normalizes case

    kept = []
    for token, tag in tagged_tokens:
        base_form = lemmatizer.lemmatize(token, get_wordnet_pos(tag))

        # Protected vocabulary: store the token exactly as it was written
        if token in important_words or base_form in important_words:
            kept.append(token)
            continue

        # Everything else: alphabetic, non-stop-word lemmas only
        if base_form.isalpha() and base_form not in stop_words:
            kept.append(base_form)

    return kept  # use " ".join(kept) instead if string output is preferred

In [None]:
# Apply the preprocess_text function for the column post
all_data.post = all_data.post.apply(preprocess_text)

In [None]:
# Saving the Preprocessed data as a CSV File For the future use
all_data.to_csv('all_data_processed.csv', index=False)