In [3]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [4]:
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
subm = pd.read_csv('input/sample_submission.csv')

In [4]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train['none'] = 1-train[label_cols].max(axis=1)
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805,0.898321
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342,0.302226
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
df = pd.DataFrame([[np.nan, 2, np.nan, 0],
...                    [3, 4, np.nan, 1],
...                    [np.nan, np.nan, np.nan, 5],
...                    [np.nan, 3, np.nan, 4]],
...                   columns=list('ABCD'))

In [11]:
df.fillna(0)

Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0
1,3.0,4.0,0.0,1
2,0.0,0.0,0.0,5
3,0.0,3.0,0.0,4


In [10]:
df

Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0
1,3.0,4.0,0.0,1
2,0.0,0.0,0.0,5
3,0.0,3.0,0.0,4


In [6]:
train['comment_text'][0]

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [20]:
train[train['comment_text'].isin(['nan'])]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,none


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
     'This is the first document.',
 ]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())
print(X.shape)
print(X.toarray)

['document', 'first', 'is', 'the', 'this']
(1, 5)
<bound method _cs_matrix.toarray of <1x5 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>>


In [8]:
import re, string
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [43]:
s="Explanation\nWhy the edits#$%&\'() made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"
re_tok.sub(r' \1 ', s)


"Explanation\nWhy the edits #  $  %  &  '  (  )  made under my username Hardcore Metallica Fan were reverted ?  They weren ' t vandalisms ,  just closure on some GAs after I voted at New York Dolls FAC .  And please don ' t remove the template from the talk page since I ' m retired now . 89 . 205 . 38 . 27"

In [44]:
re.sub(r'(\b[a-z]+) \1', r'\1', 'cat in the the hat')


'cat in the hat'

In [1]:
import re, string
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

In [10]:
COMMENT='comment_text'
n = train.shape[0]
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1 )
trn_term_doc = vec.fit_transform(train[COMMENT])
test_term_doc = vec.transform(test[COMMENT])

In [15]:
trn_term_doc.shape

(159571, 426005)

In [32]:
print(trn_term_doc)

  (0, 155788)	0.08387304169167062
  (0, 409552)	0.050464296356772576
  (0, 359013)	0.04305623955927827
  (0, 146170)	0.05958322312749654
  (0, 231608)	0.05824510929741852
  (0, 389638)	0.06447039503637214
  (0, 247352)	0.04009335280681195
  (0, 394106)	0.08584686131575688
  (0, 181782)	0.12307662723794742
  (0, 239747)	0.14144239767301586
  (0, 158369)	0.09331994330310292
  (0, 404947)	0.053984137023003295
  (0, 313199)	0.0721378910588976
  (0, 41026)	0.03476653974063545
  (0, 372640)	0.04649183348345615
  (0, 405772)	0.10167216335676989
  (0, 6837)	0.053085714729024396
  (0, 350338)	0.06000137354577802
  (0, 395665)	0.13995024181834184
  (0, 11880)	0.020422857416240035
  (0, 216481)	0.04369540773786537
  (0, 115362)	0.12586966621676254
  (0, 268787)	0.031841956504032994
  (0, 335723)	0.04728814119210447
  (0, 172411)	0.11169554291171112
  :	:
  (159570, 195684)	0.11659612651724692
  (159570, 194171)	0.13480189086616873
  (159570, 373891)	0.10267115585521343
  (159570, 67978)	0.1192264

In [19]:
test_term_doc[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [26]:
train['toxic']

0         0
1         0
2         0
3         0
4         0
5         0
6         1
7         0
8         0
9         0
10        0
11        0
12        1
13        0
14        0
15        0
16        1
17        0
18        0
19        0
20        0
21        0
22        0
23        0
24        0
25        0
26        0
27        0
28        0
29        0
         ..
159541    1
159542    0
159543    0
159544    0
159545    0
159546    1
159547    0
159548    0
159549    0
159550    0
159551    0
159552    0
159553    0
159554    1
159555    0
159556    0
159557    0
159558    0
159559    0
159560    0
159561    0
159562    0
159563    0
159564    0
159565    0
159566    0
159567    0
159568    0
159569    0
159570    0
Name: toxic, Length: 159571, dtype: int64