In [19]:
import tensorflow as tf
import pandas as pd
import platform
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [20]:
# Load the training and test splits from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

In [4]:
# Peek at the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [5]:
# Length of every training comment, measured in characters.
lens = train['comment_text'].str.len()

# Summary of the length distribution: (mean, standard deviation, maximum).
(lens.mean(), lens.std(), lens.max())

(395.3418639346486, 595.1020716997122, 5000)

In [6]:
# Histogram of comment lengths (in characters) using 100 bins.
# `hist` is the matplotlib Axes returned by Series.hist.
hist = lens.hist(bins=100)

# Zoom the x-axis to the 0-1000 character range.
hist.set_xlim(0, 1000)

# Label the figure so it can be read on its own when the notebook is skimmed.
hist.set_title('Distribution of comment lengths')
hist.set_xlabel('Comment length (characters)')
hist.set_ylabel('Number of comments')

# Place a tick every 100 characters across the visible range.
# The trailing ';' suppresses the repr of the returned tick objects.
start, end = hist.get_xlim()
hist.xaxis.set_ticks(np.arange(start, end, 100));

In [7]:
# Names of the six label columns in the training frame.
label_cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# 'none' is one minus the row-wise maximum over the label columns,
# i.e. 1 for rows where every label is 0 and 0 where any label is 1.
train['none'] = 1 - train.loc[:, label_cols].max(axis=1)

# Summary statistics for the numeric columns, including the new 'none' column.
train.describe()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
count,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0
mean,499435900000.0,0.096368,0.010068,0.053301,0.003182,0.049713,0.008492,0.897862
std,289013600000.0,0.295097,0.099832,0.224635,0.05632,0.217352,0.091762,0.302831
min,22256640.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,247343700000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,500129700000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,750108800000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,999988200000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Name of the column holding the raw comment text.
COMMENT = 'comment_text'

# Replace missing comment text with the placeholder 'unknown'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on train[COMMENT]: that form operates on a column
# selection, which triggers a chained-assignment warning in recent pandas and
# leaves the original frame unmodified when Copy-on-Write is enabled.
train[COMMENT] = train[COMMENT].fillna('unknown')
test[COMMENT] = test[COMMENT].fillna('unknown')

In [9]:
import re
import string

# Matches a single punctuation character: all of ASCII `string.punctuation`
# plus a set of typographic symbols. `re.escape` is required because the
# punctuation string contains characters that are special inside a character
# class ('\\', ']', '^', '-'); without it the sequence '\]' is parsed as an
# escaped bracket and the backslash itself is never matched.
re_token = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split a string into word and punctuation tokens.

    Every punctuation character matched by `re_token` is padded with spaces so
    that it becomes a token of its own, then the string is split on whitespace.

    Args:
        s: The text to tokenize.

    Returns:
        A list of token strings; an empty list for empty or whitespace-only input.
    """
    # The pattern is bound to `re_token`; the function must use that same name
    # (referring to an undefined `re_tok` raised NameError on the first call).
    return re_token.sub(r' \1 ', s).split()

In [10]:
# Break every training comment into whitespace-separated tokens.
split = train['comment_text'].str.split()

# Trim leading and trailing ASCII punctuation from each token. Tokens made up
# entirely of punctuation become empty strings and are kept in the list.
split.apply(lambda tokens: [token.strip(string.punctuation) for token in tokens])

0        [Nonsense, kiss, off, geek, what, I, said, is,...
1        [, Please, do, not, vandalize, pages, as, you,...
2        [, Points, of, interest, I, removed, the, poin...
3        [Asking, some, his, nationality, is, a, Racial...
4        [The, reader, here, is, not, going, by, my, sa...
5         [Fried, chickens, Is, dat, sum, fried, chickens]
6        [Why, can, you, put, English, for, example, on...
7        [Guy, Fawkes, im, a, resident, in, bridgwater,...
8        [as, far, as, nicknames, go, this, article, is...
9        [Woodland, Meadows, Good, to, hear, that, you,...
10       [, Well, I, just, finished, a, good, bit, of, ...
11       [Discussion, should, take, place, on, the, art...
12       [Uh, oh, you, called, my, bluff, I, am, intimi...
13       [, We, should, also, contact, the, living, des...
14       [, May, 2008, UTC, Notability, of, Your, New, ...
15       [, While, I, agree, that, this, article, isn't...
16       [a, Turkish, citizen, and, him, having, receiv.