### Load Data, drop empty Txt and Settings
----

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
SET_datapath = "data_pp.json"

df = pd.read_json(SET_datapath, orient="columns")

df.drop(df[df.comment_text.str.len() == 0].index, inplace=True)
df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,comment_text,toxic,sentence_lengths,toxic_encoded
0,u created request scholarlyarticles awaiting c...,1,7,1
1,looking lgbt profession category point get beh...,1,19,1
2,"new york city regulation regarding removal, , ...",0,15,0
3,thank speedy rollback would believe first time...,1,21,1
4,want talk stuff perfectly willing refer commen...,0,62,0


### Word Embedding

----

#### BERT

In [3]:
from transformers import BertTokenizer, TFBertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')

# Tokenize input comments
encoded_comments = tokenizer(df['comment_text'].tolist(), padding=True, truncation=True, max_length=128, return_tensors='tf')

# Pass tokenized comments through BERT model to get embeddings
bert_embeddings = model(encoded_comments)['pooler_output'].numpy()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


ResourceExhaustedError: Exception encountered when calling layer 'embeddings' (type TFBertEmbeddings).

{{function_node __wrapped__ResourceGather_device_/job:localhost/replica:0/task:0/device:CPU:0}} OOM when allocating tensor with shape[311575,128,768] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu [Op:ResourceGather]

Call arguments received by layer 'embeddings' (type TFBertEmbeddings):
  • input_ids=tf.Tensor(shape=(311575, 128), dtype=int32)
  • position_ids=None
  • token_type_ids=tf.Tensor(shape=(311575, 128), dtype=int32)
  • inputs_embeds=None
  • past_key_values_length=0
  • training=False

### Split Data for Models
----

In [None]:
X_train, X_test, y_train, y_test = train_test_split(bert_embeddings, df['toxic'].values, test_size=0.2, random_state=99)

# Scale input features using MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

### Train Models and get Scores

----

#### Try... Neural Network

In [None]:
"""import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from sklearn.datasets import fetch_20newsgroups

# load the 20 newsgroups dataset
newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True)
newsgroups_test = fetch_20newsgroups(subset='test', shuffle=True)

# convert text to numerical data
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(newsgroups_train.data)
X_train = tokenizer.texts_to_matrix(newsgroups_train.data, mode='tfidf')
X_test = tokenizer.texts_to_matrix(newsgroups_test.data, mode='tfidf')

# convert labels to categorical data
num_classes = np.max(newsgroups_train.target) + 1
y_train = to_categorical(newsgroups_train.target, num_classes)
y_test = to_categorical(newsgroups_test.target, num_classes)

# define the model architecture
model = Sequential()
model.add(Dense(512, input_shape=(2000,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

# compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# train the model
#model.fit(X_train, y_train, batch_size=32, epochs=5, validation_data=(X_test, y_test))"""

In [None]:
import csv
def model_train(model, X_train, X_test, y_train, y_test):
    model.fit(X_train,y_train)
    y_pred_tr = model.predict(X_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test,y_pred)
    recall = report.split()[-2]
    print("--------------------Training Performance---------------------")
    print(accuracy_score(y_train,y_pred_tr))
    print(classification_report(y_train,y_pred_tr))
    print("-------------------------------------------------------------")
    print("--------------------Testing Performance----------------------")
    print(accuracy_score(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    sns.heatmap(confusion_matrix(y_test, y_pred),cmap='viridis',annot=True,fmt='.4g')
    plt.xlabel('Predicted Class')
    plt.ylabel('Actual Class')
    plt.show()

    # Save recall value in a CSV file
    model_name = "bert_" + model.__class__.__name__
    new_recall = recall

    # read existing data from csv file
    existing_data = []
    with open("performance.csv", mode="r") as csv_file:
        reader = csv.DictReader(csv_file)
        for row in reader:
            existing_data.append(row)

    # update recall value if modelname already exists
    for row in existing_data:
        if row["Model"] == model_name:
            row["Recall"] = new_recall

    # write data back to csv file
    with open("performance.csv", mode="w", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=["Model", "Recall"])
        writer.writeheader()
        for row in existing_data:
            writer.writerow(row)
        # write new row if model doesn't exist
        if model_name not in [row["Model"] for row in existing_data]:
            writer.writerow({"Model": model_name, "Recall": new_recall})

#### Naive Bayes:

In [None]:
modelNB = MultinomialNB()
model_train(modelNB, X_train, X_test, y_train, y_test)

#### Logistic Regression (Best Model) :

In [None]:
modelLR = LogisticRegression(max_iter=1000)
model_train(modelLR, X_train, X_test, y_train, y_test)

#### Random forest:

In [None]:
model = RandomForestClassifier(n_estimators=100,max_depth=15,max_features='sqrt')
model_train(model, X_train, X_test, y_train, y_test)

In [None]:
print("finished")