# Hate speech classification

In [1]:
"""
Hate speech classification baseline using sklearn
Dataset: https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data
"""

__author__ = "don.tuggener@zhaw.ch"

import csv
import pdb
import re
import sys
import pickle
import random
import nltk
import os.path

from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import zipfile

# Get stopwords

In [2]:
random.seed(42)
#nltk.download('stopwords')
STOPWORDS = stopwords.words('english')
STEMMER = SnowballStemmer("english")


# Read Data Function

In [7]:
def read_data(remove_stopwords=True, remove_numbers=True, do_stem=True, reprocess=False):
    """
    Read CSV with annotated data.
    We'll binarize the classification, i.e. subsume all hate speach related classes
    'toxic, severe_toxic, obscene, threat, insult, identity_hate'
    into one.
    """

    if not os.path.isfile('Y.pkl') or not os.path.isfile('X.pkl'):
        reprocess = True

    if reprocess:
        print('unzipping training data...')
        zip_ref = zipfile.ZipFile('train.csv.zip', 'r')
        zip_ref.extractall('train_data')
        zip_ref.close()
        print('unzipping finished')
        
        X, Y = [], []

        for i, row in enumerate(csv.reader(open('train_data/train.csv', encoding='utf8'))):
            if i > 0: # Skip the header line
                sys.stderr.write('\r'+str(i))
                sys.stderr.flush()
                text = re.findall('\w+', row[1].lower())
                if remove_stopwords:
                    text = [w for w in text if not w in STOPWORDS]

                if remove_numbers:
                    text = [w for w in text if not re.sub('\'\.,','',w).isdigit()]

                if do_stem:
                    text = [STEMMER.stem(w) for w in text]

                label = 1 if '1' in row[2:] else 0 # Any hate speach label
                X.append(' '.join(text))
                Y.append(label)

        sys.stderr.write('\n')
        pickle.dump(X, open('X.pkl', 'wb'))
        pickle.dump(Y, open('Y.pkl', 'wb'))
    else:
        X = pickle.load(open('X.pkl', 'rb'))
        Y = pickle.load(open('Y.pkl', 'rb'))

    print(len(X), 'data points read')
    print('Label distribution:',Counter(Y))
    print('As percentages:')

    for label, count_ in Counter(Y).items():
        print(label, ':', round(100*(count_/len(X)), 2))
        

    return X, Y

# Tfidf  

In [8]:
print('Loading data', file=sys.stderr)
X, Y = read_data()

print('Vectorizing with TFIDF', file=sys.stderr)
tfidfizer = TfidfVectorizer(max_features=1000)
X_tfidf_matrix = tfidfizer.fit_transform(X)
print('Data shape:', X_tfidf_matrix.shape)
do_downsample = True

if do_downsample: # Only take 20% of the data
    X_tfidf_matrix, X_, Y, Y_ = train_test_split(X_tfidf_matrix, Y, test_size=0.8, random_state=42, stratify=Y)
    print('Downsampled data shape:', X_tfidf_matrix.shape)

print('Classification and evaluation', file=sys.stderr)
# Randomly split data into 80% training and 20% testing, preserve class distribution with stratify
X_train, X_test, Y_train, Y_test = train_test_split(X_tfidf_matrix, Y, test_size=0.2, random_state=42, stratify=Y)



Loading data


unzipping training data...


50

unzipping finished


159571
Vectorizing with TFIDF


159571 data points read
Label distribution: Counter({0: 143346, 1: 16225})
As percentages:
0 : 89.83
1 : 10.17
Data shape: (159571, 1000)
Downsampled data shape: (31914, 1000)


Classification and evaluation


# Calculate accuracy of different models

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
import pandas as pd

models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0)]

CV = 2
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []

for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, X_tfidf_matrix, Y, scoring='accuracy', cv=CV)
    
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

### Compare models with graphic

In [10]:
import matplotlib.pyplot as plt
# Plot accurency with graphic

import seaborn as sns
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df, 
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()


<Figure size 640x480 with 1 Axes>

### Compare models with number

In [11]:
# Plot accurency with numbers
cv_df.groupby('model_name').accuracy.mean()


model_name
LinearSVC                 0.950429
LogisticRegression        0.943692
MultinomialNB             0.940841
RandomForestClassifier    0.898320
Name: accuracy, dtype: float64

# Use Linear SVC

## Using Linear SVC

In [12]:
from sklearn.svm import LinearSVC

clf = LinearSVC(class_weight='balanced', C=4.5) # Weight samples inverse to class imbalance
# clf = SVC(kernel='rbf', class_weight='balanced', C=4.5) # Weight samples inverse to class imbalance
# clf = SVC(class_weight='balanced') # Weight samples inverse to class imbalance

print('Fitting data...')
if os.path.exists('trained_classifier.pkl'):
    with open('trained_classifier.pkl', 'rb') as fid:
        clf = pickle.load(fid)
else:
    clf.fit(X_train, Y_train)
    with open('trained_classifier.pkl', 'wb') as fid:
        pickle.dump(clf, fid)

print('Predicting data...')
if os.path.exists('trained_classifier_prediction.pkl'):
    with open('trained_classifier_prediction.pkl', 'rb') as fid:
        y_pred = pickle.load(fid)
else:
    y_pred = clf.predict(X_test)
    with open('trained_classifier_prediction.pkl', 'wb') as fid:
        pickle.dump(y_pred, fid)


print(classification_report(Y_test, y_pred), file=sys.stderr)
print(confusion_matrix(Y_test, y_pred.tolist()), file=sys.stderr)

Fitting data...
Predicting data...


             precision    recall  f1-score   support

          0       0.97      0.90      0.94      5734
          1       0.47      0.78      0.59       649

avg / total       0.92      0.89      0.90      6383

[[5166  568]
 [ 144  505]]


## Using cross validation

In [13]:
y_pred = cross_val_predict(clf, X_tfidf_matrix, Y, cv=numcv)
print(classification_report(Y, y_pred), file=sys.stderr)

NameError: name 'numcv' is not defined