# Toxic Comments Classification

This sample notebook trains a toxic-comment classifier on the Wikipedia Talk Labels: Toxicity data set, available at https://figshare.com/articles/Wikipedia_Talk_Labels_Toxicity/4563973

## Step1: Load data from Datalake channel

### Load data

In [None]:
import numpy as np
import pandas as pd
from abeja.datalake import Client as DatalakeClient

# ID of the Datalake channel that holds the two TSV files
# (presumably a placeholder -- replace it with a real channel id before running)
channel_id = "XXXXXXXXXXXXX"

# names of the files to look up inside that channel
data_file_name, annotation_file_name = (
    "toxicity_annotated_comments.tsv",
    "toxicity_annotations.tsv",
)


def load_file_from_datalake(channel_id, file_name):
    """Return the download URL of a file stored in a Datalake channel.

    Args:
        channel_id: ID of the Datalake channel to search.
        file_name: Value compared against each file's ``metadata['filename']``.

    Returns:
        The ``download_url`` of the first file in the channel whose
        ``filename`` metadata equals ``file_name``.

    Raises:
        FileNotFoundError: If no file in the channel matches ``file_name``.
    """
    datalake_client = DatalakeClient()
    channel = datalake_client.get_channel(channel_id)

    # scan the channel and stop at the first file with a matching name
    for f in channel.list_files():
        if f.metadata['filename'] == file_name:
            file_path = f.download_url
            print('load ' + file_name)
            return file_path

    # Falling off the end of the loop used to return None implicitly, which
    # only surfaced later as a confusing error inside pd.read_csv.
    raise FileNotFoundError(
        "file '{}' not found in Datalake channel '{}'".format(file_name, channel_id))

        
# resolve a download URL for each file (comments first, then annotations) ...
data_file, annotation_file = (
    load_file_from_datalake(channel_id, wanted_name)
    for wanted_name in (data_file_name, annotation_file_name)
)

# ... then parse both tab-separated files; the first column of the comments
# file becomes the DataFrame index
data = pd.read_csv(data_file, index_col=0, sep='\t')
annotation = pd.read_csv(annotation_file, sep='\t')

### Clean data

In [None]:
# peek at the first five rows of the raw comment table
data.head(n=5)

In [None]:
# peek at the first five rows of the annotation table
annotation.head(n=5)

In [None]:
# Label a comment as "toxic" if the majority of annotators did so: the mean of
# `toxicity` per rev_id must be strictly greater than 0.5 (an exact tie is not toxic).
# NOTE(review): this reads as a majority vote only if `toxicity` is a 0/1 flag -- confirm.
mean_toxicity = annotation.groupby('rev_id')['toxicity'].mean()
labels = mean_toxicity > 0.5

In [None]:
# join labels and comments
# The assignment aligns `labels` to `data` by index value, not by position;
# a row of `data` whose index has no entry in `labels` ends up as NaN.
# NOTE(review): assumes the index of `data` (column 0 of the TSV) is rev_id -- confirm.
data['toxic'] = labels

In [None]:
# Remove unnecessary words: replace the literal NEWLINE_TOKEN / TAB_TOKEN
# placeholders with a space.
# The vectorised .str accessor is used instead of apply(lambda ...) so that a
# missing comment (NaN) is passed through instead of raising AttributeError;
# regex=False makes each call a plain substring replacement.
data['comment'] = (
    data['comment']
    .str.replace("NEWLINE_TOKEN", " ", regex=False)
    .str.replace("TAB_TOKEN", " ", regex=False)
)

In [None]:
# peek at the first five rows after cleaning
data.head(n=5)

In [None]:
# Count both classes, then display a few comments from the toxic class.
# Rows whose label is missing (NaN) match neither filter and are not counted.
toxic_rows = data[data['toxic'] == True]
non_toxic_rows = data[data['toxic'] == False]

print("No of toxic comment: {}".format(len(toxic_rows)))
print("No of non-toxic comment: {}".format(len(non_toxic_rows)))
print("\n--- sample of toxic comment ---")
toxic_rows['comment'].head()

In [None]:
# number of missing values in each column of the dataset
null_check = data.isna().sum()
null_check

### Visualize data

In [None]:
!pip install wordcloud
import matplotlib.pyplot as plt
%matplotlib inline

from wordcloud import WordCloud

# Combine all comments for the desired sentiment
toxic_comments = data[data['toxic'] == False]['comment'].values
combined_text = " ".join([x for x in toxic_comments])

# Initialize wordcloud object
wc = WordCloud(background_color='white', max_words=50, collocations=False)

# Generate and plot wordcloud
plt.imshow(wc.generate(combined_text))
plt.axis('off')
plt.show()

### Prepare dataset for training

In [None]:
# Partition comments and labels using the `split` column of the dataset.
# Only the train and test partitions are materialised; the dev partition is
# left commented out.
is_train = data['split'] == 'train'
is_test = data['split'] == 'test'
# is_dev = data['split'] == 'dev'

X_train = data.loc[is_train, 'comment'].values
X_test = data.loc[is_test, 'comment'].values
# X_dev = data.loc[is_dev, 'comment'].values
Y_train = data.loc[is_train, 'toxic'].values
Y_test = data.loc[is_test, 'toxic'].values
# Y_dev = data.loc[is_dev, 'toxic'].values

## Step2: Preprocessing

### Convert words to vector with Bag-of-Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words over single words (1-grams) and word pairs (2-grams), with the
# vocabulary capped at 10000 entries; it is learned from the training comments.
vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=10000)
feature_train = vectorizer.fit_transform(X_train)

### Weight important words using TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Tf–idf term weighting
transformer = TfidfTransformer(norm='l2')
# NOTE(review): .toarray() converts the result to a dense array with one
# column per vocabulary entry, which can need a lot of memory for a large
# training set.
# NOTE(review): this cell overwrites `feature_train`, so running it a second
# time re-weights values that are already TF-IDF weighted; re-run the
# CountVectorizer cell first.
feature_train = transformer.fit_transform(feature_train).toarray()

In [None]:
# Vocabulary terms, one per feature column.
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = np.asarray(vectorizer.get_feature_names_out())
else:
    feature_names = np.array(vectorizer.get_feature_names())

# feature columns of the first training comment, largest weight first
index = np.argsort(feature_train[0])[::-1]
feature_words = feature_names[index]

In [None]:
# Show the first training comment together with its highest-weighted features,
# its label and its full feature vector (each heading on its own line).
print("--- original comment ---", X_train[0], sep="\n")
print("\n---- top 10 features ---", feature_words[:10], sep="\n")
print("\n--- label ---", Y_train[0], sep="\n")
print("\n---- vectorized comment ---", feature_train[0], sep="\n")

## Step3: Train Classifier using TF-IDF features

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score

# Raw text -> 1-/2-gram counts -> TF-IDF weights -> logistic regression.
# Wrapping the steps in a Pipeline lets the fitted model accept raw strings.
clf1 = Pipeline([
    ('vect', CountVectorizer(max_features=10000, ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(norm='l2')),
    ('clf', LogisticRegression()),
])

clf1 = clf1.fit(X_train, Y_train)

# evaluate model
# Label the report with the final estimator's class: clf1.__class__ is always
# "Pipeline", which made this line indistinguishable from other models' reports.
print("[{}] Accuracy: train = {}, test = {}".format(
        clf1.named_steps['clf'].__class__.__name__,
        clf1.score(X_train, Y_train),
        clf1.score(X_test, Y_test)))

In [None]:
from sklearn.metrics import classification_report

# classification report for the logistic-regression pipeline on the test split
test_pred_lr = clf1.predict(X_test)
print(classification_report(Y_test, test_pred_lr))

### Gradient-Boosted Decision Tree

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Same text features as the logistic-regression pipeline, followed by a small
# gradient-boosted tree ensemble (10 trees, progress printed while fitting).
# random_state is fixed so that repeated runs build the same trees.
clf2 = Pipeline([
    ('vect', CountVectorizer(max_features=10000, ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(norm='l2')),
    ('clf', GradientBoostingClassifier(n_estimators=10, verbose=1, random_state=0)),
])

clf2 = clf2.fit(X_train, Y_train)

# evaluate model
# Label the report with the final estimator's class: clf2.__class__ is always
# "Pipeline", which made this line indistinguishable from other models' reports.
print("[{}] Accuracy: train = {}, test = {}".format(
        clf2.named_steps['clf'].__class__.__name__,
        clf2.score(X_train, Y_train),
        clf2.score(X_test, Y_test)))

In [None]:
# classification report for the gradient-boosting pipeline on the test split
test_pred_gbdt = clf2.predict(X_test)
print(classification_report(Y_test, test_pred_gbdt))

## Step4: Prediction

In [None]:
# a friendly comment: the model is expected to classify it as non-toxic
nice_comment = 'Thanks for you contribution, you did a great job!'
clf1.predict([nice_comment])

In [None]:
# an insulting comment: the model is expected to classify it as toxic
nasty_comment = 'People as stupid as you should not edit Wikipedia!'
clf1.predict([nasty_comment])

In [None]:
# sklearn.externals.joblib was removed from recent scikit-learn releases;
# prefer the standalone joblib package and fall back for old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# save model for deployment
# NOTE: the dump is pickle-based -- only load model.pkl from a trusted source.
joblib.dump(clf1, 'model.pkl')