In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import fasttext
import re

# function to clean text
def clean_text(text):
    text = str(text)  # Convert to string to handle NaN values
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    return text

# Load and combine datasets
df1 = pd.read_csv('data.csv', encoding='ISO-8859-1')
df2 = pd.read_csv('CyberBullying Comments Dataset.csv', encoding='ISO-8859-1')
df_combined = pd.concat([df1, df2], ignore_index=True)
df_combined['text_message'] = df_combined['text_message'].fillna('')  # Handle missing values
df_combined['text_message'] = df_combined['text_message'].apply(clean_text)

# Split combined dataset into train and test sets
df_train, df_test = train_test_split(df_combined[['label_bullying', 'text_message']], test_size=0.20, random_state=42)

# Save train and test sets in fastText format
def save_in_fastText_format(filename, data_frame):
    with open(filename + ".txt", "w", encoding="utf-8") as f:
        for text, label in zip(data_frame.text_message, data_frame.label_bullying):
            if not pd.isnull(label):
                label_str = str(int(label))
                f.writelines("__label__" + label_str + " " + str(text) + "\n")

save_in_fastText_format("train", df_train)
save_in_fastText_format("test", df_test)

# Retrain fastText model
model = fasttext.train_supervised('train.txt')

# Save the trained model
model.save_model('classifier_updated.bin')

# Evaluate model
def evaluate_model(model, test_file):
    y_true = []
    texts = []
    errors = 0  # To count the number of errors

    with open(test_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split(' ', 1)
            if len(parts) != 2:
                errors += 1
                continue  # Skip lines with incorrect format

            label, text = parts
            try:
                y_true.append(int(label.replace('__label__', '')))
                texts.append(text.strip())
            except ValueError:
                errors += 1
                continue  # Skip lines where the label is not an integer

    y_pred = [int(model.predict(text)[0][0].replace('__label__', '')) for text in texts]
    print(classification_report(y_true, y_pred, target_names=['Non-Bullying', 'Bullying']))
    print(f"Skipped {errors} lines due to formatting errors.")

# Evaluate the model using the test file
evaluate_model(model, 'test.txt')


              precision    recall  f1-score   support

Non-Bullying       0.79      0.88      0.83      2361
    Bullying       0.79      0.65      0.72      1623

    accuracy                           0.79      3984
   macro avg       0.79      0.77      0.77      3984
weighted avg       0.79      0.79      0.79      3984

Skipped 8 lines due to formatting errors.


In [1]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import confusion_matrix
# import numpy as np
# import fasttext
# import codecs 


In [2]:
# df = pd.read_csv('twitter_parsed_dataset.csv')

In [3]:
# df.shape

(16851, 2)

In [4]:
# df.label_bullying.value_counts()

label_bullying
0.0    11501
1.0     5347
Name: count, dtype: int64

In [5]:
# df.head()

Unnamed: 0,text_message,label_bullying
0,@halalflaws @biebervalue @greenlinerzjm I read...,0.0
1,@ShreyaBafna3 Now you idiots claim that people...,0.0
2,"RT @Mooseoftorment Call me sexist, but when I ...",1.0
3,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",1.0
4,#mkr No No No No No No,0.0


In [6]:
# df_train, df_test = train_test_split(df[['label_bullying', 'text_message']], test_size=0.20, random_state=42)

In [7]:
# def save_in_fastText_format(filename, data_frame):
    # with open(filename + ".txt", "w", encoding="utf-8") as f:
        # for text, label in zip(data_frame.text_message, data_frame.label_bullying):
            # if not pd.isnull(label):
                # label_str = str(int(label))
                # f.writelines("__label__" + label_str + " " + str(text) + "\n")



In [8]:
# save_in_fastText_format("train", df_train)
# save_in_fastText_format("test", df_test)

In [9]:
# model = fasttext.train_supervised('train.txt')

In [10]:
# print(model.labels)

['__label__0', '__label__1']


In [11]:
# def print_results(N, p, r):
    # print("Precision: {:.3f}".format(p))
    # print("Recall: {:.3f}".format(r))
    # return p

In [12]:
# print_results(*model.test('test.txt'))

Precision: 0.847
Recall: 0.847


0.8468842729970326

In [13]:
# model.predict(['you look good'])

([['__label__0']], [array([0.97929204], dtype=float32)])