### Load Data, drop empty Txt and Settings
----

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt

In [13]:
## SETTINGS
SET_vectorizer_hash = False
SET_vectorizer_tfidf = False
SET_vectorizer_glove = True
SET_vectorizer_OHE = False



SET_datapath = "data_pp_cut.json"

df = pd.read_json(SET_datapath, orient="columns")

df.drop(df[df.comment_text.str.len() == 0].index, inplace=True)
df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,comment_text,toxic
0,u created request scholarlyarticles awaiting c...,1
1,looking lgbt profession category point get beh...,1
2,"new york city regulation regarding removal, , ...",0
3,thank speedy rollback would believe first time...,1
4,want talk stuff perfectly willing refer commen...,0


### Word Embedding

----

#### Hash Vectorizer

In [14]:
if SET_vectorizer_hash:

    vectorizer_hash = HashingVectorizer(ngram_range=(1,2), n_features=2500)
    X = vectorizer_hash.fit_transform(df["comment_text"].values.tolist()).toarray()
    y = df["toxic"].values

#### TFidf Vectorizer

In [15]:
if SET_vectorizer_tfidf:

    vectorizer_tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=2500)
    X = vectorizer_tfidf.fit_transform(df["comment_text"].values.tolist()).toarray()
    y = df['toxic'].values

#### Glove Vectorizer

In [16]:
import numpy as np
np.seterr(invalid='ignore')

def get_embedding(series, model, tokenizer):
    embedding_matrix = []

    for text in series.to_list():
        text_vec = np.zeros(model.vector_size)
        number_of_vectors = 0

        tokens = tokenizer.tokenize(text)

        for token in tokens:
            number_of_vectors += 1

            if token in glove_model.key_to_index:
                text_vec = text_vec + model[token]

        embedding_matrix.append( text_vec / number_of_vectors)

    return np.array(embedding_matrix)

In [17]:
if SET_vectorizer_glove:
    import gensim.downloader as api

    glove_model = api.load("glove-wiki-gigaword-300")

In [18]:
if SET_vectorizer_glove:
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r"[a-zA-Z0-9]+")

    X = get_embedding(df.comment_text, glove_model, tokenizer)
    y = df.toxic.values

#### OneHotEncoder

In [19]:
if SET_vectorizer_OHE:

    from sklearn.preprocessing import OneHotEncoder
    encoder = OneHotEncoder()
    model_ohe = encoder.fit_transform(df[["comment_text"]]).toarray()

#### BERT

In [20]:

"""from transformers import BertTokenizer, TFBertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')

# Tokenize input comments
encoded_comments = tokenizer(df['comment_text'].tolist(), padding=True, truncation=True, max_length=128, return_tensors='tf')

# Pass tokenized comments through BERT model to get embeddings
bert_embeddings = model(encoded_comments)['pooler_output'].numpy()"""


"from transformers import BertTokenizer, TFBertModel\n\ntokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\nmodel = TFBertModel.from_pretrained('bert-base-uncased')\n\n# Tokenize input comments\nencoded_comments = tokenizer(df['comment_text'].tolist(), padding=True, truncation=True, max_length=128, return_tensors='tf')\n\n# Pass tokenized comments through BERT model to get embeddings\nbert_embeddings = model(encoded_comments)['pooler_output'].numpy()"

In [21]:
"""X_train, X_test, y_train, y_test = train_test_split(bert_embeddings, df['toxic'].values, test_size=0.2, random_state=99)

# Scale input features using MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)"""

"X_train, X_test, y_train, y_test = train_test_split(bert_embeddings, df['toxic'].values, test_size=0.2, random_state=99)\n\n# Scale input features using MinMaxScaler\nscaler = MinMaxScaler()\nX_train = scaler.fit_transform(X_train)\nX_test = scaler.transform(X_test)"

### Split Data for Models
----

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [23]:
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (1181, 300)
X_test shape: (296, 300)
y_train shape: (1181,)
y_test shape: (296,)


### Train Models and get Scores

----

#### Try... Neural Network

In [24]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from sklearn.datasets import fetch_20newsgroups

# load the 20 newsgroups dataset
newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True)
newsgroups_test = fetch_20newsgroups(subset='test', shuffle=True)

# convert text to numerical data
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(newsgroups_train.data)
X_train = tokenizer.texts_to_matrix(newsgroups_train.data, mode='tfidf')
X_test = tokenizer.texts_to_matrix(newsgroups_test.data, mode='tfidf')

# convert labels to categorical data
num_classes = np.max(newsgroups_train.target) + 1
y_train = to_categorical(newsgroups_train.target, num_classes)
y_test = to_categorical(newsgroups_test.target, num_classes)

# define the model architecture
model = Sequential()
model.add(Dense(512, input_shape=(2000,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

# compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# train the model
#model.fit(X_train, y_train, batch_size=32, epochs=5, validation_data=(X_test, y_test))

In [25]:
def model_train(model, X_train, X_test, y_train, y_test):
    model.fit(X_train,y_train)
    y_pred_tr = model.predict(X_train)
    y_pred = model.predict(X_test)

    print("--------------------Training Performance---------------------")
    print(accuracy_score(y_train,y_pred_tr))
    print(classification_report(y_train,y_pred_tr))
    print("-------------------------------------------------------------")
    print("--------------------Testing Performance----------------------")
    print(accuracy_score(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    
    sns.heatmap(confusion_matrix(y_test, y_pred),cmap='viridis',annot=True,fmt='.4g')
    plt.xlabel('Predicted Class')
    plt.ylabel('Actual Class')
    plt.show()

#### Naive Bayes:

In [26]:
modelNB = MultinomialNB()
model_train(modelNB, X_train, X_test, y_train, y_test)

ValueError: y should be a 1d array, got an array of shape (11314, 20) instead.

#### Logistic Regression (Best Model) :

In [None]:
modelLR = LogisticRegression(max_iter=1000)
model_train(modelLR, X_train, X_test, y_train, y_test)

#### Random forest:

In [None]:
model = RandomForestClassifier(n_estimators=100,max_depth=15,max_features='sqrt')
model_train(model, X_train, X_test, y_train, y_test)

In [None]:
print("finished")