### Load Data, drop empty Txt and Settings
----

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVC

In [13]:
SET_datapath = "data_pp_cut.json"

SET_SVM_active = False

df = pd.read_json(SET_datapath, orient="columns")

df.drop(df[df.comment_text.str.len() == 0].index, inplace=True)
df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,comment_text,toxic
0,u created request scholarlyarticles awaiting c...,1
1,looking lgbt profession category point get beh...,1
2,"new york city regulation regarding removal, , ...",0
3,thank speedy rollback would believe first time...,1
4,want talk stuff perfectly willing refer commen...,0


### Word Embedding

----

#### BERT

In [14]:
from transformers import BertTokenizer, TFBertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')

# Tokenize input comments
for i in rang
encoded_comments = tokenizer(df['comment_text'].tolist(), padding=True, truncation=True, max_length=128, return_tensors='tf')

# Pass tokenized comments through BERT model to get embeddings
bert_embeddings = model(encoded_comments)['pooler_output'].numpy()

"""
import torch
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize input comments
encoded_comments = tokenizer(df['comment_text'].tolist(), padding=True, truncation=True, max_length=128, return_tensors='pt')

# Pass tokenized comments through BERT model to get embeddings
with torch.no_grad():
    outputs = model(**encoded_comments)
    bert_embeddings = outputs.pooler_output.numpy()"""

SyntaxError: invalid syntax (2241737607.py, line 7)

In [15]:
from tqdm import tqdm
import numpy as np
from transformers import BertTokenizer, TFBertModel

batch_size = 2000
bert_embeddings = []

# Tokenize and process comments in batches
for i in range(0, len(df), batch_size):
    comments_batch = df['comment_text'].iloc[i:i+batch_size].tolist()
    encoded_comments = tokenizer(comments_batch, padding=True, truncation=True, max_length=128, return_tensors='tf')
    batch_embeddings = model(encoded_comments)['pooler_output'].numpy()
    bert_embeddings.append(batch_embeddings)

# Concatenate embeddings from all batches
bert_embeddings = np.concatenate(bert_embeddings, axis=0)

In [None]:
from tqdm import tqdm
import numpy as np
from transformers import BertTokenizer, TFBertModel

batch_size = 2000
bert_embeddings = []

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')

# Tokenize and process comments in batches
for i in range(0, len(df), batch_size):
    comments_batch = df['comment_text'].iloc[i:i+batch_size].tolist()
    encoded_comments = tokenizer(comments_batch, padding=True, truncation=True, max_length=128, return_tensors='tf')

bert_embeddings = model(encoded_comments)['pooler_output'].numpy()

In [None]:
import torch

class BertDataSet(Dataset):

    def __init__(self, sentences, toxic_labels):
        self.sentences = sentences
        #target is a matrix with shape [#1 x #6(toxic, obscene, etc)]
        self.targets = toxic_labels.to_numpy()

    def __len__(self):
        return len(self.sentences)


    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        bert_senten = tokenizer.encode_plus(sentence,
                                            add_special_tokens = True, # [CLS],[SEP]
                                            max_length = max_len,
                                            pad_to_max_length = True,
                                            truncation = True,
                                            return_attention_mask = True
                                             )
        ids = torch.tensor(bert_senten['input_ids'], dtype = torch.long)
        mask = torch.tensor(bert_senten['attention_mask'], dtype = torch.long)
        toxic_label = torch.tensor(self.targets[idx], dtype = torch.float)


        return {
            'ids' : ids,
            'mask' : mask,
            'toxic_label':toxic_label
        }

train_dataset = BertDataSet(df['comment_text'], df[['toxic']])


In [None]:
from torch.utils.data import DataLoader, Dataset
train_dataloader = DataLoader(train_dataset, batch_size = train_batch, pin_memory = True, num_workers = 4, shuffle = True)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
import transformers
%%time
model = transformers.BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels = 6)
model.to(device)
model.train()

In [None]:
%%time
for a in train_dataloader:
    ids = a['ids'].to(device)
    mask = a['mask'].to(device)
    output = model(ids, mask)
    break

### Split Data for Models
----

In [None]:
X_train, X_test, y_train, y_test = train_test_split(bert_embeddings, df['toxic'].values, test_size=0.2, random_state=99)

# Scale input features using MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

### Train Models and get Scores

----

#### Try... Neural Network

In [None]:
"""import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from sklearn.datasets import fetch_20newsgroups

# load the 20 newsgroups dataset
newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True)
newsgroups_test = fetch_20newsgroups(subset='test', shuffle=True)

# convert text to numerical data
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(newsgroups_train.data)
X_train = tokenizer.texts_to_matrix(newsgroups_train.data, mode='tfidf')
X_test = tokenizer.texts_to_matrix(newsgroups_test.data, mode='tfidf')

# convert labels to categorical data
num_classes = np.max(newsgroups_train.target) + 1
y_train = to_categorical(newsgroups_train.target, num_classes)
y_test = to_categorical(newsgroups_test.target, num_classes)

# define the model architecture
model = Sequential()
model.add(Dense(512, input_shape=(2000,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

# compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# train the model
#model.fit(X_train, y_train, batch_size=32, epochs=5, validation_data=(X_test, y_test))"""

In [None]:
import csv
def model_train(model, X_train, X_test, y_train, y_test):
    model.fit(X_train,y_train)
    y_pred_tr = model.predict(X_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test,y_pred)
    recall = report.split()[-2]
    print("--------------------Training Performance---------------------")
    print(accuracy_score(y_train,y_pred_tr))
    print(classification_report(y_train,y_pred_tr))
    print("-------------------------------------------------------------")
    print("--------------------Testing Performance----------------------")
    print(accuracy_score(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    sns.heatmap(confusion_matrix(y_test, y_pred),cmap='viridis',annot=True,fmt='.4g')
    plt.xlabel('Predicted Class')
    plt.ylabel('Actual Class')
    plt.show()

    # Save recall value in a CSV file
    model_name = "bert_" + model.__class__.__name__
    new_recall = recall

    # read existing data from csv file
    existing_data = []
    with open("performance.csv", mode="r") as csv_file:
        reader = csv.DictReader(csv_file)
        for row in reader:
            existing_data.append(row)

    # update recall value if modelname already exists
    for row in existing_data:
        if row["Model"] == model_name:
            row["Recall"] = new_recall

    # write data back to csv file
    with open("performance.csv", mode="w", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=["Model", "Recall"])
        writer.writeheader()
        for row in existing_data:
            writer.writerow(row)
        # write new row if model doesn't exist
        if model_name not in [row["Model"] for row in existing_data]:
            writer.writerow({"Model": model_name, "Recall": new_recall})

#### Naive Bayes:

In [None]:
modelNB = MultinomialNB()
model_train(modelNB, X_train, X_test, y_train, y_test)

#### Support Vector Machine

In [None]:
if SET_SVM_active:
    modelSVM = SVC(kernel = 'linear', max_iter = 100000, verbose=True)
    model_train(modelSVM, X_train, X_test, y_train, y_test)

#### Logistic Regression (Best Model) :

In [None]:
modelLR = LogisticRegression(max_iter=1000)
model_train(modelLR, X_train, X_test, y_train, y_test)

#### Random forest:

In [None]:
model = RandomForestClassifier(n_estimators=100,max_depth=15,max_features='sqrt')
model_train(model, X_train, X_test, y_train, y_test)

In [None]:
print("finished")