In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

In [2]:
# Load the tweet dataset from a CSV file in the working directory.
# Later cells read its 'tweet_text' and 'cyberbullying_type' columns.
twitter_df = pd.read_csv('cyberbullying_tweets.csv')

In [3]:
# Extra tokens (beyond stopwords) that preprocessing drops from every tweet.
words_to_remove = {'http', 'rt'}

In [4]:
def shorten_word(word):
    """Cap every run of one repeated character at two occurrences.

    Args:
        word: The string to shorten.

    Returns:
        A copy of ``word`` in which any character repeated three or more
        times in a row appears only twice (e.g. ``'coooool'`` -> ``'cool'``).
    """
    kept = []
    for ch in word:
        # The last two kept characters already equal ch, so this one would be
        # the third (or later) in its run: skip it.
        if len(kept) >= 2 and kept[-1] == ch and kept[-2] == ch:
            continue
        kept.append(ch)
    return ''.join(kept)

In [5]:
# Lazily filled cache so the stopword list is fetched once, not once per token.
_ENGLISH_STOPWORDS = None
# Punctuation characters as a set for fast "does this token contain any?" checks.
_PUNCTUATION_CHARS = frozenset(string.punctuation)


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, building it on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Normalize one tweet into a space-separated string of cleaned tokens.

    Steps, in order: lowercase, tokenize with ``nltk.word_tokenize``, drop
    English stopwords, drop any token containing a punctuation character,
    drop tokens listed in ``words_to_remove``, then cap repeated characters
    with ``shorten_word``.

    Args:
        text: Raw tweet text. Non-string values (for example NaN from an
            empty CSV cell) are treated as empty text.

    Returns:
        The surviving tokens joined by single spaces; ``''`` when nothing
        survives or ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Missing values have no .lower(); treat them as empty text.
        return ''

    stop_words = _english_stopwords()
    cleaned_tokens = [
        shorten_word(token)
        for token in nltk.word_tokenize(text.lower())
        if token not in stop_words
        # A token is kept only if it shares no character with string.punctuation.
        and _PUNCTUATION_CHARS.isdisjoint(token)
        and token not in words_to_remove
    ]
    return ' '.join(cleaned_tokens)

In [6]:
# Clean every tweet in one pass over the 'tweet_text' column. This produces the
# same 'processed_tweet_text' column as a row-by-row loop, without per-row .loc writes.
twitter_df['processed_tweet_text'] = twitter_df['tweet_text'].apply(preprocess_text)

In [7]:
# Show the first rows (up to 15) to compare raw and processed tweet text.
twitter_df.head(15)

Unnamed: 0,tweet_text,cyberbullying_type,processed_tweet_text
0,"In other words #katandandre, your food was cra...",not_cyberbullying,words katandandre food crapilicious mkr
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,aussietv white mkr theblock imacelebrityau tod...
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,xochitlsuckks classy whore red velvet cupcakes
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,meh p thanks heads concerned another angry dud...
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,rudhoeenglish isis account pretending kurdish ...
5,"@Raja5aab @Quickieleaks Yes, the test of god i...",not_cyberbullying,raja5aab quickieleaks yes test god good bad in...
6,Itu sekolah ya bukan tempat bully! Ga jauh kay...,not_cyberbullying,itu sekolah ya bukan tempat bully ga jauh kaya...
7,Karma. I hope it bites Kat on the butt. She is...,not_cyberbullying,karma hope bites kat butt nasty mkr
8,@stockputout everything but mostly my priest,not_cyberbullying,stockputout everything mostly priest
9,Rebecca Black Drops Out of School Due to Bully...,not_cyberbullying,rebecca black drops school due bullying


In [10]:
from collections import Counter

# Flatten every processed tweet into one list of whitespace-separated tokens.
tokens = [
    token
    for processed_text in twitter_df['processed_tweet_text']
    for token in processed_text.split()
]

# Per-token frequencies; the distinct tokens (the Counter's keys) are the vocabulary.
word_counts = Counter(tokens)
vocab = set(word_counts)

print(f"Vocabulary Size: {len(vocab)}")

Vocabulary Size: 52456


In [11]:
# Drop tweets whose processed text is empty. .copy() makes the result an
# independent DataFrame, so assigning columns to it later does not trigger
# pandas' SettingWithCopyWarning.
twitter_df = twitter_df[twitter_df['processed_tweet_text'] != ''].copy()
twitter_df.head(15)

In [13]:
def convert_cyberbullying_type(row):
    """Collapse a row's label into a binary cyberbullying label.

    Args:
        row: A mapping-like object (e.g. a DataFrame row) with a
            ``'cyberbullying_type'`` entry.

    Returns:
        ``'not_cyberbullying'`` when the entry equals that exact string,
        otherwise ``'cyberbullying'``.
    """
    is_benign = row['cyberbullying_type'] == 'not_cyberbullying'
    return 'not_cyberbullying' if is_benign else 'cyberbullying'
    
# Overwrite the label column with its binary form: axis=1 passes each row to
# convert_cyberbullying_type, whose return value becomes that row's new label.
twitter_df['cyberbullying_type'] = twitter_df.apply(convert_cyberbullying_type, axis=1)

In [14]:
# Save only the label and the cleaned text; index=False leaves the DataFrame
# index out of the written CSV.
twitter_df[['cyberbullying_type', 'processed_tweet_text']].to_csv('processed_cyberbullying_tweets.csv', index=False)