# Toxicity Classification in Online Comments

### Yitian Gao


In [None]:
import pandas as pd
import numpy as np

# read csv
# df = pd.read_csv(file_name)

# for tfidf or embedding csv file
def process_feature_csv(df):
    features = df.iloc[:, 26:].to_numpy()
    labels =  df["Toxicity"].to_numpy()
    return features, labels


In [12]:
train_df = pd.read_csv("train_embedding.csv")
dev_df = pd.read_csv("dev_embedding.csv")
test_df = pd.read_csv("test_embedding.csv")

train_features, train_labels = process_feature_csv(train_df)
dev_features, dev_labels = process_feature_csv(dev_df)
test_features, _ = process_feature_csv(test_df)
test_ids = test_df["ID"].tolist()

In [13]:
from collections import Counter
print(Counter(dev_labels))

Counter({0: 12165, 1: 2835})


#### Supervised ML algorithm and Evaluation

In [14]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_fscore_support
NBclf = GaussianNB()
NBclf.fit(train_features, train_labels)

print("Naive Bayes accuracy", NBclf.score(dev_features, dev_labels))
dev_preds = NBclf.predict(dev_features)
precision_score, recall_score, fscore_score, _ = precision_recall_fscore_support(dev_labels, dev_preds, average="binary")

print("Precision score is ", precision_score)
print("Recall score is ", recall_score)
print("F1_score is ", fscore_score)

Naive Bayes accuracy 0.6834
Precision score is  0.33584905660377357
Recall score is  0.690652557319224
F1_score is  0.451933064050779


In [15]:
from sklearn.linear_model import LogisticRegression

LRclf = LogisticRegression(random_state=666, max_iter=300)
LRclf.fit(train_features, train_labels)

print("Logistic Regressions accuracy", LRclf.score(dev_features, dev_labels))
dev_preds = LRclf.predict(dev_features)
precision_score, recall_score, fscore_score, _ = precision_recall_fscore_support(dev_labels, dev_preds, average="binary")

print("Precision score is ", precision_score)
print("Recall score is ", recall_score)
print("F1_score is ", fscore_score)

Logistic Regressions accuracy 0.8336666666666667
Precision score is  0.6523297491039427
Recall score is  0.25679012345679014
F1_score is  0.36851430017717035


In [16]:
from sklearn.neighbors import KNeighborsClassifier
KNNclf = KNeighborsClassifier(n_neighbors=3)

KNNclf.fit(train_features, train_labels)

print("K-Nearest Neighbors accuracy", KNNclf.score(dev_features, dev_labels))

dev_preds = KNNclf.predict(dev_features)
precision_score, recall_score, fscore_score, _ = precision_recall_fscore_support(dev_labels, dev_preds, average="binary")

print("Precision score is ", precision_score)
print("Recall score is ", recall_score)
print("F1_score is ", fscore_score)

K-Nearest Neighbors accuracy 0.7926666666666666
Precision score is  0.41590214067278286
Recall score is  0.23985890652557318
F1_score is  0.3042505592841163


In [17]:
test_preds = LRclf.predict(test_features).tolist()
assert len(test_ids) == len(test_preds)

f = open("test_predictions.csv", "w")
f.write("ID,Toxicity\n")
for test_id, test_pred in zip(test_ids, test_preds):
    f.write(str(test_id) + "," +str(test_pred) + "\n")
f.close()

#### Baseline

In [18]:
from sklearn.dummy import DummyClassifier
ZEROclf = DummyClassifier(strategy="most_frequent")
ZEROclf.fit(train_features, train_labels)
print("Zero Rule baseline accuracy", ZEROclf.score(dev_features, dev_labels))

Zero Rule baseline accuracy 0.811


#### Research Question 1: Semi-supervised Learning

In [39]:

unlabeled_df = pd.read_csv("unlabeled_embedding.csv")
unlabeled_features = unlabeled_df.iloc[:, 1:].to_numpy()

unlabeled_labels = np.array([-1] * unlabeled_features.shape[0])

semi_features = np.concatenate((train_features, unlabeled_features), axis=0)
semi_labels = np.concatenate((train_labels, unlabeled_labels), axis=0)

In [42]:
from sklearn.semi_supervised import SelfTrainingClassifier

LRclf = LogisticRegression(random_state=666, max_iter=300)
self_training_model = SelfTrainingClassifier(LRclf)
self_training_model.fit(semi_features, semi_labels)
print("Self Training (Semi supervised Learning) accuracy", self_training_model.score(dev_features, dev_labels))

Self Training (Semi supervised Learning) accuracy 0.8302666666666667


In [43]:
dev_preds = self_training_model(dev_features)
precision_score, recall_score, fscore_score, _ = precision_recall_fscore_support(dev_labels, dev_preds, average="binary")

print("Precision score is ", precision_score)
print("Recall score is ", recall_score)
print("F1_score is ", fscore_score)

TypeError: 'SelfTrainingClassifier' object is not callable

In [None]:
# LRclf = LogisticRegression(random_state=666, max_iter=300)
self_training_model = SelfTrainingClassifier(NBclf)
self_training_model.fit(semi_features, semi_labels)

print("Self Training (Semi supervised Learning) accuracy", self_training_model.score(dev_features, dev_labels))
dev_preds = self_training_model(dev_features)
precision_score, recall_score, fscore_score, _ = precision_recall_fscore_support(dev_labels, dev_preds, average="binary")

print("Precision score is ", precision_score)
print("Recall score is ", recall_score)
print("F1_score is ", fscore_score)

In [None]:
self_training_model = SelfTrainingClassifier(KNNclf)
self_training_model.fit(semi_features, semi_labels)
print("Self Training (Semi supervised Learning) accuracy", self_training_model.score(dev_features, dev_labels))

dev_preds = self_training_model(dev_features)
precision_score, recall_score, fscore_score, _ = precision_recall_fscore_support(dev_labels, dev_preds, average="binary")

print("Precision score is ", precision_score)
print("Recall score is ", recall_score)
print("F1_score is ", fscore_score)