In [3]:
import re
import string
import time
import pickle
import pandas as pd
import numpy as np
import NbSvmClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [4]:
# Read the training comments, the test comments and the sample submission
# file from their CSV locations (in that order).
train, test, subm = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train.csv',
        'data/test.csv',
        'submissions/sample_submission.csv',
    )
)

In [5]:
COMMENT = 'comment_text'


def comments_preprocessing(df):
    """Normalise the text column named by ``COMMENT`` and return the frame.

    Steps, in order: missing values become the literal ``"unknown"``, every
    value is coerced to ``str``, lower-cased, and has line-feed and
    carriage-return characters removed.

    The column is overwritten on ``df`` itself (the frame is not copied) and
    the same ``df`` object is returned, so the result can be re-assigned.
    """
    # Assign the filled Series back instead of calling fillna(inplace=True) on
    # a column selection: the chained in-place form does not update `df` when
    # pandas Copy-on-Write is active, which would turn NaN into the text "nan".
    text = df[COMMENT].fillna("unknown")
    text = text.astype(str).str.lower()
    # Punctuation is left in place (stripping it is intentionally disabled).
    # NOTE(review): line breaks are deleted rather than replaced by a space, so
    # the words on either side of a break are joined — confirm this is intended.
    text = text.str.replace('\n', '').str.replace('\r', '')
    df[COMMENT] = text
    return df

In [6]:
# Clean both frames with the shared preprocessing routine (train first).
train, test = map(comments_preprocessing, (train, test))

# Report the dimensions of each frame, then preview the first training rows.
print('train shape', train.shape)
print('test shape', test.shape)
train.head()

train shape (95851, 8)
test shape (226998, 2)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"nonsense? kiss off, geek. what i said is true...",1,0,0,0,0,0
1,27450690,""" please do not vandalize pages, as you did wi...",0,0,0,0,0,0
2,54037174,""" """"points of interest"""" i removed the """"point...",0,0,0,0,0,0
3,77493077,asking some his nationality is a racial offenc...,0,0,0,0,0,0
4,79357270,the reader here is not going by my say so for ...,0,0,0,0,0,0


In [7]:
# Characters treated as stand-alone tokens: all ASCII punctuation plus a set of
# extra typographic symbols. The ASCII part is passed through re.escape so that
# "\", "]", "^" and "-" are literal members of the character class; without it
# the sequence "\]" is read as an escaped "]" and the backslash is silently
# left out of the set.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split ``s`` into tokens, with each punctuation mark as its own token.

    Every character matched by ``re_tok`` is padded with a space on both
    sides, then the string is split on runs of whitespace.

    Returns a list of strings; an empty or whitespace-only input gives ``[]``.
    """
    return re_tok.sub(r' \1 ', s).split()

In [8]:
n = len(train)

# TF-IDF over word unigrams and bigrams, using the custom tokenizer.
vec = TfidfVectorizer(
    tokenizer=tokenize,
    ngram_range=(1, 2),
    strip_accents='unicode',
    min_df=3,      # document-frequency lower cut-off
    max_df=0.9,    # document-frequency upper cut-off (proportion of documents)
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

# The vectoriser is fitted on the training comments only; the test comments
# are transformed with that already-fitted vectoriser.
trn_term_doc = vec.fit_transform(train[COMMENT])
test_term_doc = vec.transform(test[COMMENT])

In [9]:
# Display both document-term matrices (shape, dtype, stored elements).
(trn_term_doc, test_term_doc)

(<95851x284039 sparse matrix of type '<class 'numpy.float64'>'
 	with 10422196 stored elements in Compressed Sparse Row format>,
 <226998x284039 sparse matrix of type '<class 'numpy.float64'>'
 	with 26090084 stored elements in Compressed Sparse Row format>)

In [10]:
# Short aliases for the training and test feature matrices.
x, test_x = trn_term_doc, test_term_doc

In [11]:
# Label names: every column of `train` after the first two.
categories = list(train.columns[2:])
# One sub-frame per label, holding the rows where that label equals 1.
toxic_dataframes = [train[train[label] == 1] for label in categories]
print(categories)

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [12]:
# Prediction table: starts with the test ids; the loop below adds one column
# per label.
preds = pd.DataFrame({'id': test['id']})

# Fit a fresh classifier for each label on the shared training matrix and
# store its output for the test matrix. The loop iterates the labels directly:
# the previous enumerate() index was never used.
for col in categories:
    print('fit', col)
    m = NbSvmClassifier.NbSvmClassifier()
    m.fit(x, train[col])
    # NOTE(review): column 1 of predict_proba is presumably the positive-class
    # probability — confirm against NbSvmClassifier.
    preds[col] = m.predict_proba(test_x)[:, 1]

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


In [13]:
# Sanity check: dimensions of the prediction table, then its first five rows.
print('preds shape', preds.shape)
preds.head(n=5)

preds shape (226998, 7)


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,0.012396,0.001535,0.005828,0.000462,0.005548,0.001601
1,6102620,0.008652,0.000812,0.00617,0.000401,0.004417,0.001327
2,14563293,0.00381,0.001084,0.003675,0.000427,0.003168,0.00093
3,21086297,0.037713,0.002595,0.011549,0.00054,0.011352,0.001166
4,22982444,0.010104,0.00205,0.00567,0.000501,0.003938,0.001761


In [14]:
# Print, for each label, the sum of that label's column in `preds`.
# NOTE(review): the columns hold predict_proba outputs, so each total is a sum
# of scores rather than a count of thresholded predictions, despite the
# wording of the heading below.
print('amount marked true in each category')
for col in categories:
    total = preds[col].sum()
    print(col, total)
# Summary statistics for every column of the prediction table.
preds.describe()

amount marked true in each category
toxic 9477.54735283
severe_toxic 834.583553981
obscene 4695.04493976
threat 220.199829049
insult 4324.13935953
identity_hate 690.398633533


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,226998.0,226998.0,226998.0,226998.0,226998.0,226998.0,226998.0
mean,500497900000.0,0.041752,0.003677,0.020683,0.00097,0.019049,0.003041
std,288650000000.0,0.13627,0.034399,0.105617,0.015028,0.09093,0.026685
min,6044863.0,4.2e-05,5.1e-05,0.000417,6.8e-05,3.4e-05,0.000207
25%,250776700000.0,0.005847,0.000912,0.003764,0.00044,0.003399,0.001162
50%,500919800000.0,0.012657,0.001185,0.005729,0.000502,0.005811,0.001437
75%,750354400000.0,0.025499,0.001731,0.008606,0.000578,0.009748,0.001789
max,999997800000.0,1.0,0.99996,1.0,0.999953,1.0,0.999999


In [18]:
# Write the prediction table to the submission CSV, leaving out the index.
preds.to_csv(path_or_buf='submissions/submission_nbsvm.csv', index=False)

In [16]:
# Persist the prediction table as a pickle. The context manager closes the
# file handle (flushing the data) even if pickling raises; the previous bare
# open() call never closed it.
with open('pickles/predictions_word_grams.p', 'wb') as pickle_file:
    pickle.dump(preds, pickle_file)