In [1]:
from sklearn.model_selection import train_test_split
from collections import Counter
from tqdm import tqdm_notebook

import pandas as pd
import nltk

import config
import log

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/colan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Loading in Data

In [3]:
log.info("Loading Training Data")
df = pd.read_csv(config.TRAIN_PATH, header=0, nrows=1000, usecols=['target', 'comment_text'])

[94mINFO:[0m Loading Training Data


In [4]:
df

Unnamed: 0,target,comment_text
0,0.000000,"This is so cool. It's like, 'would you want yo..."
1,0.000000,Thank you!! This would make my life a lot less...
2,0.000000,This is such an urgent design problem; kudos t...
3,0.000000,Is this something I'll be able to install on m...
4,0.893617,haha you guys are a bunch of losers.
...,...,...
995,0.000000,I call dibs on the Tonya Harding pic
996,0.629630,This is just freaking sad. Another great place...
997,0.000000,This is a test comment (trying out the new Civ...
998,0.000000,This is a test comment (trying out the new Civ...


### Simplifying Targets

For this sample we are going to only use the input text and have it guess a sample label. To get the sample label we are going to create 3 intermediaries for the calculated toxicity target betwee 0 and 1.

* 0.00-0.20 -> not toxic
* 0.21-0.60 -> mildly toxic
* 0.61-1.00 -> toxic

In [5]:
targets = df['target'].apply(lambda t: 'not_toxic' if t <= 0.2 else 'mildly_toxic' if t <= 0.6 else 'toxic')
df.update(targets)

In [6]:
df

Unnamed: 0,target,comment_text
0,not_toxic,"This is so cool. It's like, 'would you want yo..."
1,not_toxic,Thank you!! This would make my life a lot less...
2,not_toxic,This is such an urgent design problem; kudos t...
3,not_toxic,Is this something I'll be able to install on m...
4,toxic,haha you guys are a bunch of losers.
...,...,...
995,not_toxic,I call dibs on the Tonya Harding pic
996,toxic,This is just freaking sad. Another great place...
997,not_toxic,This is a test comment (trying out the new Civ...
998,not_toxic,This is a test comment (trying out the new Civ...


### Removing Case

In [7]:
targets = df['comment_text'].apply(lambda comment: comment.lower())
df.update(targets)

In [8]:
df

Unnamed: 0,target,comment_text
0,not_toxic,"this is so cool. it's like, 'would you want yo..."
1,not_toxic,thank you!! this would make my life a lot less...
2,not_toxic,this is such an urgent design problem; kudos t...
3,not_toxic,is this something i'll be able to install on m...
4,toxic,haha you guys are a bunch of losers.
...,...,...
995,not_toxic,i call dibs on the tonya harding pic
996,toxic,this is just freaking sad. another great place...
997,not_toxic,this is a test comment (trying out the new civ...
998,not_toxic,this is a test comment (trying out the new civ...


### Splitting

We split the data so we can estimate the performace of our algorithm. You can set the switch value below to decide the split. Default is to 0.8 where 80% of the training is trained on and 20% is tested on.

In [9]:
split_percentage = 0.8

In [10]:
training_set, testing_set = train_test_split(df, test_size = 1.0 - split_percentage)

log.info(f'Train Data Size: {len(training_set)}')
log.info(f'Test Data Size: {len(testing_set)}')

[94mINFO:[0m Train Data Size: 800
[94mINFO:[0m Test Data Size: 200


### Formatting and Bag of Words

We'll use the 1000 most occurring words for our bag of words in this example.

In [11]:
training_data = []
training_data = []

word_counts = Counter()

In [12]:
train = training_set.to_dict()
test = testing_set.to_dict()

In [13]:
tweet_tokenizer = nltk.tokenize.TweetTokenizer()

for key in tqdm_notebook(train['comment_text']):
    tokens = tweet_tokenizer.tokenize(train['comment_text'][key])
    train['comment_text'][key] = tokens
    
    c = Counter(tokens)
    word_counts.update(c)

HBox(children=(IntProgress(value=0, max=800), HTML(value='')))




In [14]:
most_common_words = word_counts.most_common(1000)

In [15]:
log.info('training set updates')

for key in tqdm_notebook(train['comment_text']):
    tokenized_sentence = train['comment_text'][key]
    bow = {}
    
    for word in most_common_words:
        word = word[0]
        
        if word in tokenized_sentence:
            bow[word] = tokenized_sentence.count(word)
        else:
            bow[word] = 0
            
    train['comment_text'][key] = bow

[94mINFO:[0m training set updates


HBox(children=(IntProgress(value=0, max=800), HTML(value='')))




In [16]:
log.info('testing set updates')

for key in tqdm_notebook(test['comment_text']):
    tokenized_sentence = tweet_tokenizer.tokenize(test['comment_text'][key])
    bow = {}
    
    for word in most_common_words:
        word = word[0]
        
        if word in tokenized_sentence:
            bow[word] = tokenized_sentence.count(word)
        else:
            bow[word] = 0

    test['comment_text'][key] = bow

[94mINFO:[0m testing set updates


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [17]:
log.info('formatting training data for NLTK')

final_training_set = []
for key in tqdm_notebook(train['target']):
    final_training_set.append((train['comment_text'][key], train['target'][key]))

[94mINFO:[0m formatting training data for NLTK


HBox(children=(IntProgress(value=0, max=800), HTML(value='')))




In [18]:
log.info('formatting testing data for nltk')

final_testing_set = []
for key in tqdm_notebook(test['target']):
    final_testing_set.append((test['comment_text'][key], test['target'][key]))

[94mINFO:[0m formatting testing data for nltk


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




### Training

In [19]:
naive_bayes_classifier = nltk.NaiveBayesClassifier.train(final_training_set)

### Evaluation

In [20]:
nltk.classify.accuracy(naive_bayes_classifier, final_testing_set)

0.75

In [21]:
naive_bayes_classifier.show_most_informative_features(50)

Most Informative Features
                   bunch = 1               toxic : not_to =     88.2 : 1.0
                   going = 2               toxic : not_to =     36.8 : 1.0
                    want = 3               toxic : not_to =     35.9 : 1.0
                   after = 2               toxic : not_to =     35.9 : 1.0
                  people = 3              mildly : not_to =     24.7 : 1.0
              apartments = 1               toxic : not_to =     22.1 : 1.0
                      or = 3               toxic : not_to =     21.1 : 1.0
                    guys = 1               toxic : not_to =     21.0 : 1.0
                purposes = 1               toxic : not_to =     21.0 : 1.0
                     law = 2               toxic : not_to =     20.6 : 1.0
                    feds = 1              mildly : not_to =     19.6 : 1.0
                   mayor = 2              mildly : not_to =     18.0 : 1.0
                    come = 2              mildly : not_to =     18.0 : 1.0