## 1. Import Libraries

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from collections import Counter

## 1.1 NLTK Resource Setup

In [2]:
# NLTK data packages needed by the rest of the notebook:
#   vader_lexicon - lexicon for sentiment analysis (SentimentIntensityAnalyzer)
#   stopwords     - stopword lists (stopwords.words)
#   punkt_tab     - tokeniser data (word_tokenize)
NLTK_RESOURCES = ('vader_lexicon', 'stopwords', 'punkt_tab')

# Fetch each package in turn; looping (rather than ending the cell on a bare
# download call) also avoids a stray `True` appearing as the cell output.
for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/Aditya/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Aditya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/Aditya/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## 2. Load Datasets

In [3]:
# Locations of the two raw CSV files, relative to the notebook's working directory.
tweets_csv_path = "datasets/cyberbullying_tweets.csv"    # Twitter data
mendeley_csv_path = "datasets/aggressive_all.csv"        # text-message data

# Read each file into its own DataFrame.
tweets_df = pd.read_csv(tweets_csv_path)
aggro_mendeley_df = pd.read_csv(mendeley_csv_path)

## 3. Extract cyberbullying texts from raw dataframe

In [4]:
# Keep only the tweets whose label is something other than 'not_cyberbullying'.
is_cyberbullying = tweets_df['cyberbullying_type'].ne('not_cyberbullying')
aggro_tweets_df = tweets_df[is_cyberbullying]

# Show the column names of the text-message dataset.
print(aggro_mendeley_df.columns)

Index(['No.', 'Message'], dtype='object')


## 4. Extract texts from each dataframe and combine to one

In [5]:
# Pull the message column out of each dataset.
aggro_tweet_texts = aggro_tweets_df['tweet_text']      # filtered Twitter dataset
aggro_mendeley_texts = aggro_mendeley_df['Message']    # Mendeley dataset

# Stack both columns into one Series (tweets first) with a fresh 0..n-1 index.
combined_aggro_texts = pd.concat(
    [aggro_tweet_texts, aggro_mendeley_texts],
    ignore_index=True,
)

NameError: name 'mendeley_df' is not defined

## 5. Create function to clean and tokenise text

In [None]:
# English stopwords as a set, for fast membership tests inside the function.
stop_words = set(stopwords.words('english'))

def clean_and_tokenise(text):
    """
    Clean a single social-media message and split it into word tokens.

    Input: one message (str). Any non-string value (e.g. NaN) yields [].
    Output: list of lower-case tokens (list of str).

    The text is lower-cased; URLs, mentions, hashtags and digits are
    removed; punctuation is stripped; underscores become spaces; the result
    is tokenised, and English stopwords and one-character tokens are dropped.
    """
    # Guard: missing values arrive as floats (NaN) and have no .lower()
    if not isinstance(text, str):
        return []

    # Lowercase
    text = text.lower()

    # Remove URLs, mentions, hashtags and digits
    text = re.sub(r"http\S+|www\S+|@\w+|#\w+|\d+", "", text)

    # Remove punctuation (anything that is not a word character or whitespace)
    text = re.sub(r"[^\w\s]", "", text)

    # "\w" also matches "_", so underscores survive the step above; turn them
    # into spaces so that underscore-joined words are tokenised separately.
    # Done after mention/hashtag removal so "@user_name" is removed whole.
    text = text.replace("_", " ")

    # Tokenise
    tokens = word_tokenize(text)

    # Remove stopwords and one-character tokens
    return [word for word in tokens if word not in stop_words and len(word) > 1]

## 6. Clean and tokenise text

In [None]:
# Replace missing values with empty strings, then force every entry to str
# so the cleaning function always receives text.
texts_without_nan = combined_aggro_texts.fillna("")
combined_aggro_texts = texts_without_nan.astype(str)

# Run the cleaner over every message; each entry becomes a list of tokens.
combined_tokens = combined_aggro_texts.apply(clean_and_tokenise)

In [None]:
# Example preview: show only the first few token lists rather than the whole Series
print(combined_tokens.head(10))

## 7. Extract tokens with low sentiment analysis scores

In [None]:
# Initialise VADER
sia = SentimentIntensityAnalyzer()

# Band of VADER compound scores to keep (both bounds exclusive).
# NOTE(review): this keeps only moderately negative words; anything scoring
# -0.45 or lower (the most negative words) is excluded. Confirm that
# is intended before relying on the exported word list.
SENTIMENT_LOWER_BOUND = -0.45
SENTIMENT_UPPER_BOUND = -0.25

# Flatten all token lists into one list (duplicates kept for frequency counting)
all_tokens = [word for tokens in combined_tokens for word in tokens]

# Score each distinct word once; the compound score is the single summary value
word_sentiments = {word: sia.polarity_scores(word)['compound'] for word in set(all_tokens)}

# Keep every occurrence of a word whose compound score falls inside the band
toxic_words_via_sentiment = [
    word for word in all_tokens
    if SENTIMENT_LOWER_BOUND < word_sentiments.get(word, 0) < SENTIMENT_UPPER_BOUND
]

In [None]:
# Tally how many times each selected word occurs.
toxic_word_freq = Counter()
toxic_word_freq.update(toxic_words_via_sentiment)

# Preview the 50 most frequent words with their counts.
most_frequent_preview = toxic_word_freq.most_common(50)
print(most_frequent_preview)

## 8. Export the most frequent toxic words as csv

In [None]:
# The 1000 most frequent words as (word, count) pairs, most frequent first.
top_toxic = toxic_word_freq.most_common(1000)

# Tabulate the pairs: one row per word.
top_toxic_df = pd.DataFrame(
    data=top_toxic,
    columns=['word', 'frequency'],
)

# Preview
print(top_toxic_df)

In [None]:
# Write the word/frequency table to a CSV file in the working directory,
# leaving out the DataFrame index column.
output_csv_path = "top_toxic_words.csv"
top_toxic_df.to_csv(output_csv_path, index=False)