### Importing Libraries

In [3]:
pip install nltk


Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     --- ------------------------------------ 0.1/1.5 MB 2.8 MB/s eta 0:00:01
     ------ --------------------------------- 0.2/1.5 MB 2.5 MB/s eta 0:00:01
     --------- ------------------------------ 0.3/1.5 MB 2.7 MB/s eta 0:00:01
     ---------- ----------------------------- 0.4/1.5 MB 2.6 MB/s eta 0:00:01
     ------------ --------------------------- 0.5/1.5 MB 2.2 MB/s eta 0:00:01
     --------------- ------------------------ 0.6/1.5 MB 2.1 MB/s eta 0:00:01
     ----------------- ---------------------- 0.7/1.5 MB 2.0 MB/s eta 0:00:01
     --------------------- ------------------ 0.8/1.5 MB 2.1 MB/s eta 0:00:01
     ------------------------ --------------- 0.9/1.5 MB 2.2 MB/s eta 0:00:01
     -------------------------- ------------- 1.0/1.5 MB 2.1 MB/s eta 0:00:01
     ------------------------------- -------- 1.2/1.5 MB 2.2 MB/s eta 0:00:01
    

In [5]:
pip install scikit-learn


Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/1c/49/30ffcac5af06d08dfdd27da322ce31a373b733711bb272941877c1e4794a/scikit_learn-1.3.2-cp39-cp39-win_amd64.whl.metadata
  Downloading scikit_learn-1.3.2-cp39-cp39-win_amd64.whl.metadata (11 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Obtaining dependency information for threadpoolctl>=2.0.0 from https://files.pythonhosted.org/packages/81/12/fd4dea011af9d69e1cad05c75f3f7202cdcbeac9b712eea58ca779a72865/threadpoolctl-3.2.0-py3-none-any.whl.metadata
  Downloading threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)
Downloading scikit_learn-1.3.2-cp39-cp39-win_amd64.whl (9.3 MB)
   ---------------------------------------- 0.0/9.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.3 MB ? eta -:--:--
   ---

In [6]:
import pandas as pd
import numpy as np
import re
import string
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import RegexpTokenizer
from nltk import PorterStemmer, WordNetLemmatizer
import pickle

In [7]:
# Importing dataset
data = pd.read_csv("D:\cyberbullying_tweets.csv")

#### Encoding the *classification_type* values.

In [3]:
labelencoder = LabelEncoder()
data['cyberbullying_type_encoded'] = labelencoder.fit_transform(data['cyberbullying_type'])
data[['cyberbullying_type', 'cyberbullying_type_encoded']].value_counts()

cyberbullying_type   cyberbullying_type_encoded
religion             5                             7998
age                  0                             7992
gender               2                             7973
ethnicity            1                             7961
not_cyberbullying    3                             7945
other_cyberbullying  4                             7823
dtype: int64

#### Preprocessing

In [8]:
# preprocessing functions

# converting tweet text to lower case
def text_lower(text):
    return text.str.lower()

# removing stopwoords from the tweet text
def clean_stopwords(text):
    # stopwords list that needs to be excluded from the data
    stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from',
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're',
             's', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']
    STOPWORDS = set(stopwordlist)
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

# cleaning and removing punctuations
def clean_puctuations(text):
    english_puctuations = string.punctuation
    translator = str.maketrans('','', english_puctuations)
    return text.translate(translator)

# cleaning and removing repeating characters
def clean_repeating_characters(text):
    return re.sub(r'(.)1+', r'1', text)

# cleaning and removing URLs
def clean_URLs(text):
    return re.sub(r"((www.[^s]+)|(http\S+))","",text)

# cleaning and removing numeric data
def clean_numeric(text):
    return re.sub('[0-9]+', '', text)

# Tokenization of tweet text
def tokenize_tweet(text):
    tokenizer = RegexpTokenizer('\w+')
    text = text.apply(tokenizer.tokenize)
    return text

# stemming    
def text_stemming(text):
    st = PorterStemmer()
    text = [st.stem(word) for word in text]
    return text

# lemmatization
def text_lemmatization(text):
    lm = WordNetLemmatizer()
    text = [lm.lemmatize(word) for word in text]
    return text

In [5]:
# defining preprocess function

def preprocess(text):
    text = text_lower(text)
    text = text.apply(lambda text: clean_stopwords(text))
    text = text.apply(lambda x : clean_puctuations(x))
    text = text.apply(lambda x: clean_repeating_characters(x))
    text = text.apply(lambda x : clean_URLs(x))
    text = text.apply(lambda x: clean_numeric(x))
    text = tokenize_tweet(text)
    text = text.apply(lambda x: text_stemming(x))
    text = text.apply(lambda x: text_lemmatization(x))
    text = text.apply(lambda x : " ".join(x))
    return text

data['tweet_text'] = preprocess(data['tweet_text'])
data

Unnamed: 0,tweet_text,cyberbullying_type,cyberbullying_type_encoded
0,word katandandr food crapilici mkr,not_cyberbullying,3
1,aussietv white mkr theblock imacelebrityau tod...,not_cyberbullying,3
2,xochitlsuckkk classi whore red velvet cupcak,not_cyberbullying,3
3,jasongio meh p thank head up but not concern a...,not_cyberbullying,3
4,rudhoeenglish isi account pretend kurdish acco...,not_cyberbullying,3
...,...,...,...
47687,black ppl arent expect anyth depend anyth yet ...,ethnicity,1
47688,turner not withhold disappoint turner call cou...,ethnicity,1
47689,swear god dumb nigger bitch got bleach hair re...,ethnicity,1
47690,yea fuck rt therealexel nigger fuck unfollow m...,ethnicity,1


#### Model Creation

In [20]:
print(data.columns)


Index(['tweet_text', 'cyberbullying_type'], dtype='object')


In [21]:
# Splitting the data into train and test
X, y = data['tweet_text'], data['cyberbullying_type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state= 41)


In [22]:
# Transforming the data using TF-IDF Vectorizer
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features= 500000)
vectoriser.fit(X_train)
# print("No. of feature words: ",len(vectoriser.get_feature_names()))

In [23]:
# Dumping the vectoriser
pickle.dump(vectoriser, open('tdf_vectorizer', 'wb'))

In [24]:
X_train = vectoriser.transform(X_train)
X_test = vectoriser.transform(X_test)

In [25]:
# Model
svm_model_linear = SVC(kernel= 'linear', C = 1).fit(X_train, y_train)
svm_predictions  = svm_model_linear.predict(X_test)
accuracy = svm_model_linear.score(X_test, y_test)
print(accuracy)

0.8348476376852111


In [26]:
# dumping the model
pickle.dump(svm_model_linear, open('model.bin', 'wb'))

#### Predicting Custom input

In [57]:
def preprocess(text):
    # Your text preprocessing code goes here
    processed_text = text  # Modify this line to perform actual preprocessing
    return processed_text

In [58]:
# Function for custom input prediction
def custom_input_prediction(text):
    import nltk
    import pandas as pd
    import pickle

    # Download NLTK data (if not already downloaded)
    nltk.download('omw-1.4')

    # Preprocess the input text
    text = preprocess(text)

    print("Text after preprocessing:", text)  # Debugging print

    # Load the TfidfVectorizer and model
    vectoriser = pickle.load(open("tdf_vectorizer", "rb"))
    model = pickle.load(open("model.bin", "rb"))

    # Transform the input text using the TfidfVectorizer
    # Wrap the text in a list to make it iterable
    text = [text]
    text = vectoriser.transform(text)

    print("Text after transformation:", text)  # Debugging print

    # Make a prediction using the model
    prediction = model.predict(text)
    prediction = prediction[0]

    print("Prediction:", prediction)  # Debugging print

    # Interpret the prediction
    interpretations = {
        0: "Age",
        1: "Ethnicity",
        2: "Gender",
        3: "Not Cyberbullying",
        4: "Other Cyberbullying",
        5: "Religion"
    }

    for i in interpretations.keys():
        if i == prediction:
            return interpretations[i]

# Example input text
something_2 = "Michelle ( Heartless Bitch) Bachman WON'T Sign Anti-bullying law Cause She Doesn't want 2 ""turn boys in2 girls"" WTF ?@ChrisLarson82"

# Perform the custom input prediction
result = custom_input_prediction(something_2)


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\CR2000TU\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Text after preprocessing: Michelle ( Heartless Bitch) Bachman WON'T Sign Anti-bullying law Cause She Doesn't want 2 turn boys in2 girls WTF ?@ChrisLarson82
Text after transformation:   (0, 352113)	0.20136965461128556
  (0, 352106)	0.12092506917824254
  (0, 349391)	0.20136965461128556
  (0, 349270)	0.1067161863723129
  (0, 335522)	0.20136965461128556
  (0, 335302)	0.08515574490209986
  (0, 324168)	0.20136965461128556
  (0, 324154)	0.12771606676840166
  (0, 277136)	0.20136965461128556
  (0, 277134)	0.1391301468194622
  (0, 273894)	0.15393716065475277
  (0, 273722)	0.07556643583884227
  (0, 195936)	0.20136965461128556
  (0, 195934)	0.16772090125559938
  (0, 176342)	0.20136965461128556
  (0, 176327)	0.11838503514453443
  (0, 152505)	0.20136965461128556
  (0, 152504)	0.19375513064677918
  (0, 137026)	0.20136965461128556
  (0, 137023)	0.1778431082952228
  (0, 123762)	0.20136965461128556
  (0, 123374)	0.07337379750778253
  (0, 89817)	0.16231831052451587
  (0, 89660)	0.09444292046964481
  (0, 