In [1]:
import joblib
import re
import string

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
# categories = [
#     "alt.atheism",
#     "misc.forsale",
#     "sci.space",
#     "soc.religion.christian",
#     "talk.politics.guns",
# ]

categories = [ 
    "hasBadWords.True",
    "hasBadWords.False",
             ]

news_group_data = pd.read_json('datasets/dataset.json') # dataset.json
news_group_data['target'] = news_group_data.hasBadWords.apply(lambda x: "hasBadWords.True" if x == True else "hasBadWords.False")
# news_group_data.drop(['violation'], axis=1, inplace=True)
news_group_data.shape

# news_group_data = fetch_20newsgroups(
#     subset="all", remove=("headers", "footers", "quotes"), categories=categories
# )

# df = pd.DataFrame(
#     dict(
#         text=news_group_data["text"],
#         target=news_group_data["target"]
#     )
# )
# df["target"] = df.target.map(lambda x: categories[x])

(86439, 4)

In [3]:
df = pd.DataFrame(
    dict(
        text=news_group_data["text"],
        target=news_group_data["target"]
    )
)

In [4]:
df.head()

Unnamed: 0,text,target
0,My Favorite Slut,hasBadWords.False
1,girlfriends sit on each other's faces with the...,hasBadWords.False
2,bound beauty kisses her girlfriend,hasBadWords.False
3,MORGAN - Anytime - Nail Painting On The Slave'...,hasBadWords.False
4,TRANSGENDER COACHING (wmv) PART 1,hasBadWords.False


In [5]:
def process_text(text):
    text = str(text).lower()
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", " ", text
    )
    text = " ".join(text.split())
    return text

df["clean_text"] = df.text.map(process_text)

In [6]:

df = df[['clean_text', 'target']]
df.head()

Unnamed: 0,text,target,clean_text
0,My Favorite Slut,hasBadWords.False,my favorite slut
1,girlfriends sit on each other's faces with the...,hasBadWords.False,girlfriends sit on each other s faces with the...
2,bound beauty kisses her girlfriend,hasBadWords.False,bound beauty kisses her girlfriend
3,MORGAN - Anytime - Nail Painting On The Slave'...,hasBadWords.False,morgan anytime nail painting on the slave s face
4,TRANSGENDER COACHING (wmv) PART 1,hasBadWords.False,transgender coaching wmv part 1


In [7]:
df_train, df_test = train_test_split(df, test_size=0.20, stratify=df.target)#, shuffle=True)

In [8]:
vec = CountVectorizer(
    ngram_range=(1, 3), 
    stop_words="english",
)

X_train = vec.fit_transform(df_train.clean_text)
X_test = vec.transform(df_test.clean_text)

y_train = df_train.target
y_test = df_test.target

In [34]:
from sklearn.model_selection import GridSearchCV
param={'alpha': [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000]}

nb = MultinomialNB()
clf=GridSearchCV(nb, param, scoring='f1_macro', cv=10, return_train_score=True) 

clf.fit(X_train, y_train)
print('Best estimator:', clf.best_estimator_)
      
preds = clf.predict(X_test)
print('Каппа-коэффициент Коэна: ', cohen_kappa_score(y_test, preds))
print(classification_report(y_test, preds))

Best estimator: MultinomialNB(alpha=1e-05)
Каппа-коэффициент Коэна:  0.5641204994146402
                   precision    recall  f1-score   support

hasBadWords.False       0.99      0.97      0.98     16632
 hasBadWords.True       0.49      0.71      0.58       656

         accuracy                           0.96     17288
        macro avg       0.74      0.84      0.78     17288
     weighted avg       0.97      0.96      0.96     17288



In [22]:
new_df = pd.DataFrame(df_test.clean_text)

In [23]:
new_df['predict'] = new_df.apply(lambda x: preds)

In [24]:
new_df.tail()

Unnamed: 0,clean_text,predict
52388,9037 clip store,hasBadWords.False
35836,lotion oil fetish asmr redheads hairy bush sol...,hasBadWords.False
77116,cum in handy,hasBadWords.False
46912,3161 clip store,hasBadWords.False
3072,old man watches tempest play with herself,hasBadWords.False


In [25]:
# new_df.to_csv('datasets/cat_pred.csv', index=False)

In [26]:
# joblib.dump(nb, "nb.joblib")
joblib.dump(clf, "nb.joblib")
joblib.dump(vec, "vec.joblib")

['vec.joblib']

In [27]:
nb_saved = joblib.load("nb.joblib")
vec_saved = joblib.load("vec.joblib")

In [28]:
%time
# sample_text = ["Space, Stars, Planets and Astronomy!"]
sample_text = ["strap on dildo fucking male strap on amateur forced feminization strap on bondage whipping caning dildo female domination cross dressing spanking humiliation sissy slut big tits MILF blonde BDSM i sissy training dildo blow job        "]
clean_sample_text = process_text(sample_text)
sample_vec = vec_saved.transform(sample_text)
nb_saved.predict(sample_vec)

CPU times: user 6 µs, sys: 1 µs, total: 7 µs
Wall time: 13.1 µs


array(['hasBadWords.True'], dtype='<U17')