## Importing Packages 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import spacy
from spacy.training import Example
from tqdm.auto import tqdm
import time

# Importing my previously defined functions
from my_cleaning_functions import *

In [2]:
# Print numpy floats with 4 digits of precision.
np.set_printoptions(precision=4)
# Plot styling: seaborn settings with enlarged fonts are applied first, then the
# matplotlib 'fivethirtyeight' style sheet on top, so the order of these two calls matters.
sns.set(font_scale=1.5)
plt.style.use('fivethirtyeight')

# Render figures inline in the notebook, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## Importing Dataset

In [3]:
# The comments and their labels are stored in two separate CSV files.
test_data = pd.read_csv("data/test.csv")
test_labels = pd.read_csv("data/test_labels.csv")

# Report the size of the comments frame (rows x columns).
n_entries, n_features = test_data.shape
print("The dataset contains %s entries with %s features." % (n_entries, n_features))

The dataset contains 153164 entries with 2 features.


In [27]:
# Label names, in the same order as the 'cats' keys shown in the notebook's outputs.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
    'non_toxic',
]

## Cleaning & Restructuring

In [4]:
# Run the project's comment-cleaning helper over the frame, then preview the first rows.
test_data = test_data.pipe(clean_comments_column)
test_data.head(n=5)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,"== From RfC == The title is fine as it is, ..."
2,00013b17ad220c46,""" == Sources == * Zawe Ashton on Lapland..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [5]:
# Combine the cleaned comments with their labels into the structure printed below:
# (text, {'cats': {...}}) pairs.
# NOTE(review): this rebinds `test_data` from a DataFrame to that sequence of pairs, so
# re-running this cell alone would pass the restructured data back in as `data`;
# re-run the load and clean cells first.
# NOTE(review): 153164 rows were loaded but the later progress bar counts 63978 entries, so the
# helper presumably drops rows without usable labels — confirm in my_cleaning_functions.
test_data = restructure_test_dataframe(data=test_data, labels=test_labels)

First data entry:  ('Thank you for understanding. I think very highly of you and would not revert without discussion.', {'cats': {'toxic': 0, 'severe_toxic': 0, 'obscene': 0, 'threat': 0, 'insult': 0, 'identity_hate': 0, 'non_toxic': 1}})


## Modelling

In [6]:
# Load the saved text-categorisation pipeline and list its components.
model_path = "toxic-comments-textcat"
nlp = spacy.load(model_path)
nlp.pipe_names

['textcat_multilabel']

In [7]:
# Wrap every (text, annotation) pair in a spaCy Example built from a Doc of the raw text,
# then score the whole set in a single evaluate() call.
test = [
    Example.from_dict(nlp.make_doc(comment), gold)
    for comment, gold in tqdm(test_data)
]

score = nlp.evaluate(test)

  0%|          | 0/63978 [00:00<?, ?it/s]

In [10]:
# Show the per-label AUC entry of the evaluation scores (one value per class).
print(score["cats_auc_per_type"])

{'toxic': 0.9542003453132076, 'severe_toxic': 0.9784865323920249, 'obscene': 0.967421974425708, 'threat': 0.9701070328834158, 'insult': 0.9601088394175237, 'identity_hate': 0.962007379134363, 'non_toxic': 0.9549437623537513}


## Explaining predictions using LIME

In [11]:
from lime import lime_text
from lime.lime_text import LimeTextExplainer

In [25]:
# Every entry of test_data is a (text, annotation) pair, so test_data[i][0] is the raw text
# of the i-th comment: test_data[0][0] is the first comment, test_data[1][0] the second, etc.
comment_text = test_data[0][0]

# Run the full pipeline on that text and show the score assigned to each label.
doc = nlp(comment_text)
print(doc.cats)

{'toxic': 0.0009806256275624037, 'severe_toxic': 0.0004294771351851523, 'obscene': 0.0019398706499487162, 'threat': 0.0022096331231296062, 'insult': 0.0016055178130045533, 'identity_hate': 0.0006365900044329464, 'non_toxic': 0.9989380240440369}


In [28]:
# LIME text explainer; class_names supplies the display name for each label index.
explainer = LimeTextExplainer(class_names=class_names)

📌 Pick up from here: re-phrase the LIME tutorial snippet below so that it fits the spaCy pipeline.

In [31]:
# spaCy adaptation of the LIME tutorial snippet. LIME needs a function that maps a list of
# raw strings to an (n_texts, n_classes) array of scores; the spaCy pipeline exposes its
# scores through doc.cats, so a thin wrapper bridges the two.
def predict_proba(texts):
    """Score raw texts with the spaCy pipeline in the array layout LIME expects.

    Args:
        texts: list of strings (LIME passes perturbed copies of the explained comment).

    Returns:
        numpy array of shape (len(texts), len(class_names)); column j holds
        doc.cats[class_names[j]], so column order matches the explainer's class_names.
    """
    return np.array([[scored.cats[name] for name in class_names]
                     for scored in nlp.pipe(texts)])


idx = 1340
comment_text, annotation = test_data[idx]

# Explain the two complementary labels; the indices refer to positions in class_names.
# NOTE(review): the scores are multi-label, so a row need not sum to 1; LIME fits each
# requested label separately, but confirm that this suits the intended analysis.
explained_labels = [class_names.index('toxic'), class_names.index('non_toxic')]
exp = explainer.explain_instance(comment_text, predict_proba,
                                 num_features=6, labels=explained_labels)

print('Document id: %d' % idx)
print('Predicted scores:', nlp(comment_text).cats)
print('True labels: %s' % annotation['cats'])
# For each explained label, list the (word, weight) pairs LIME found most influential.
for label in explained_labels:
    print(class_names[label], exp.as_list(label=label))

In [None]:
# Scratch (disabled): show the raw text of the first and third test comments.
# print(test_data[0][0])
# print(test_data[2][0])

In [None]:
# Scratch (disabled): score two comments directly with the text-categoriser component.
# doc1 = nlp.make_doc(test_data[0][0])
# doc2 = nlp.make_doc(test_data[2][0])

# NOTE(review): `textcat_multilabel` is not bound to a variable in any visible cell; fetch the
# component first with textcat_multilabel = nlp.get_pipe("textcat_multilabel").
# example_score = textcat_multilabel.predict([doc1, doc2])