In [13]:
import joblib
import re
import string
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import PorterStemmer,LancasterStemmer  # using the Porter Stemmer and Lancaster Stemmer and others
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.naive_bayes import MultinomialNB

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

import warnings
warnings.filterwarnings("ignore") #, category=np.VisibleDeprecationWarning)

In [14]:
# categories = [
#     "alt.atheism",
#     "misc.forsale",
#     "sci.space",
#     "soc.religion.christian",
#     "talk.politics.guns",
# ]

categories = [ 
    "hasBadWords.True",
    "hasBadWords.False",
             ]

news_group_data = pd.read_json('../datasets/dataset.json') # dataset.json
news_group_data['target'] = news_group_data.hasBadWords.apply(lambda x: "hasBadWords.True" if x == True else "hasBadWords.False")
# news_group_data.drop(['violation'], axis=1, inplace=True)
news_group_data.shape

# news_group_data = fetch_20newsgroups(
#     subset="all", remove=("headers", "footers", "quotes"), categories=categories
# )

# df = pd.DataFrame(
#     dict(
#         text=news_group_data["text"],
#         target=news_group_data["target"]
#     )
# )
# df["target"] = df.target.map(lambda x: categories[x])

(86439, 4)

In [15]:
df = pd.DataFrame(
    dict(
        text=news_group_data["text"],
        target=news_group_data["target"]
    )
)

In [16]:
df.head()

Unnamed: 0,text,target
0,My Favorite Slut,hasBadWords.False
1,girlfriends sit on each other's faces with the...,hasBadWords.False
2,bound beauty kisses her girlfriend,hasBadWords.False
3,MORGAN - Anytime - Nail Painting On The Slave'...,hasBadWords.False
4,TRANSGENDER COACHING (wmv) PART 1,hasBadWords.False


In [17]:
# function to clean and pre-process the text.
def clean_text(text):  
    
    # 1. Removing html tags
    text = bs(text,"lxml").get_text()
    
    # 2. Retaining only alphabets.
    text = re.sub("[^a-zA-Z]"," ", text)
    
    # 3. Converting to lower case and splitting
    word_tokens = text.lower().split()
    
    # 4. Remove stopwords
    le = WordNetLemmatizer()
    stop_words = set(stopwords.words("english")+ ['14000kbps', 'november', '1080p', 'email', 
                                                 '4k', 'mp4', 'error', '404', '2022', 'hd'])     
    word_tokens = [le.lemmatize(w) for w in word_tokens if not w in stop_words]
    
    cleaned_review = " ".join(word_tokens)
    return cleaned_review

df["clean_text"] = df.text.map(clean_text)

In [18]:
df = df[['clean_text', 'target']]
df.head()

Unnamed: 0,clean_text,target
0,favorite slut,hasBadWords.False
1,girlfriend sit face ass,hasBadWords.False
2,bound beauty kiss girlfriend,hasBadWords.False
3,morgan anytime nail painting slave face,hasBadWords.False
4,transgender coaching wmv part,hasBadWords.False


In [19]:
df_train, df_test = train_test_split(df, test_size=0.20, stratify=df.target)#, shuffle=True)

In [20]:
vec = CountVectorizer(
    ngram_range=(1, 3), 
    stop_words="english",
)

X_train = vec.fit_transform(df_train.clean_text)
X_test = vec.transform(df_test.clean_text)

y_train = df_train.target
y_test = df_test.target

In [21]:
from sklearn.model_selection import GridSearchCV
param={'alpha': [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000]}

nb = MultinomialNB()
clf=GridSearchCV(nb, param, scoring='f1_macro', cv=10, return_train_score=True) 

clf.fit(X_train, y_train)
print('Best estimator:', clf.best_estimator_)
      
preds = clf.predict(X_test)
print('Каппа-коэффициент Коэна: ', cohen_kappa_score(y_test, preds))
print(classification_report(y_test, preds))

Best estimator: MultinomialNB(alpha=1)
Каппа-коэффициент Коэна:  0.5895560923783691
                   precision    recall  f1-score   support

hasBadWords.False       0.98      0.99      0.98     16632
 hasBadWords.True       0.61      0.60      0.60       656

         accuracy                           0.97     17288
        macro avg       0.80      0.79      0.79     17288
     weighted avg       0.97      0.97      0.97     17288



In [22]:
new_df = pd.DataFrame(df_test.clean_text)

In [23]:
new_df['predict'] = new_df.apply(lambda x: preds)

In [24]:
new_df.tail()

Unnamed: 0,clean_text,predict
76802,programmerswife,hasBadWords.False
61571,mature bbw squirt anal masturbation milf as pu...,hasBadWords.False
7279,marcia wood wearing black top want show new fa...,hasBadWords.False
65815,custom clip question anything u alkatrazentert...,hasBadWords.False
56334,best tape gagged damsel,hasBadWords.False


In [25]:
# new_df.to_csv('datasets/cat_pred.csv', index=False)

In [26]:
# joblib.dump(nb, "nb.joblib")
joblib.dump(clf, "nb.joblib")
joblib.dump(vec, "vec.joblib")

['vec.joblib']

In [27]:
nb_saved = joblib.load("nb.joblib")
vec_saved = joblib.load("vec.joblib")

In [37]:
%time
# sample_text = ["Space, Stars, Planets and Astronomy!"]
sample_text = ["""start	end	text
0	6000	Mom's stuff is so pretty. She has so many cute things.
6000	9000	Which one's the most? Which one's the most?
9000	12000	What do you want?
12000	15000	I got so pretty.
15000	17000	You like the pink one.
17000	19000	You look so cute in the pink one.
19000	21000	Do you like this like weird shoulder thing though?
21000	23000	How does it look?
23000	24000	I don't know.
24000	25000	Wait, this is the dress.
25000	26000	It's a dress.
26000	32000	Do you like it?
32000	33000	I don't know.
33000	34000	You look so pretty.
34000	37000	You don't see?
37000	38000	Yeah.
38000	39000	Cute.
39000	41000	You look so pretty though.
41000	43000	No, you don't.
43000	45000	You want to check on me yourself?
45000	46000	Yeah.
46000	47000	What's in this?
47000	49000	I want to see you in the back maybe.
49000	50000	Yeah?
50000	53000	You can't take it off.
53000	61000	I'm thinking of this.
61000	63000	I'm not so cute.
63000	67000	You are.
67000	71000	Wait, now maybe I want to check it out.
71000	72000	Okay.
72000	73000	How do we get this?
73000	74000	Yeah.
74000	76000	Here's a cup.
76000	77000	Let's go on.
77000	78000	Oh, yeah.
78000	79000	Okay.
79000	80000	Okay.
80000	83000	Oh, here's some free.
83000	84000	Dolly.
84000	85000	You can't release a box.
85000	86000	You don't stop.
86000	90000	You can just tell me you don't want to stop it everywhere.
90000	91000	Okay.
91000	92000	Okay.
92000	93000	Okay.
93000	94000	Okay.
94000	95000	Okay.
95000	96000	Okay.
96000	97000	Okay.
97000	98000	Okay.
98000	99000	Okay.
99000	100000	Okay.
100000	101000	Okay.
101000	102000	Okay.
102000	103000	Okay.
103000	104000	Okay.
104000	105000	Okay.
105000	106000	Okay, lips are soft too.
106000	107000	So I go on.
107000	108000	Okay.
108000	109000	Okay.
109000	110000	Okay.
110000	111000	Okay.
111000	112000	Alright.
112000	113000	There's going on here.
113000	114000	Please.
114000	115000	I'm trying.
115000	118000	I can't believe what I'm saying.
118000	119000	You have really pretty lingerie.
119000	121000	That's not what I focused on first.
121000	123000	I focused on two naked bodies.
123000	125000	That's what I focused on.
125000	130000	Do you want us to put them over our clothes?
130000	140080	if you're not inspect enough this week okay here's your step sisters this is
140080	146520	bizarre I don't see you trying anything on I see the two of you naked embrace
146520	152720	kissing each other and touching each other let me deploy to that it's it's you
152720	159320	keep telling us it's my luxury what are you doing those are my clothes you
159320	166680	try them all on and obviously took them all off but we were trying more on to
166680	172280	be fair you don't even make sense I don't see them on right now if you had
172280	176680	come like two minutes sooner really wouldn't make any difference you shouldn't
176680	182720	be going through my drawers picking up my private undergarments and wearing them
182720	187680	we want to be sharing things that's what you said I'm telling her just like
187680	192360	her with the ridiculous answers sharing things yeah you're gonna share
192360	200100	something right now you know what you're gonna share you know the business
200100	207200	don't you my chair says what do we call that chair the spanking chair welcome
207200	212560	to the household new step daughter can you do it like separately she doesn't have
212560	219080	to see this oh yes she does I think she's seen enough I've seen enough to your
219080	224480	first you're gonna show her how we do things around here not that it seems to
224480	245400	make much difference for me look at your bottom I have to spend you all week for
245400	274440	things you realize the gravity of what you've done
274440	282280	this is multi-level misbehavior it's not fun going through some of
282280	310360	those private things I don't buy that for a second I think you were having fun
310360	327280	it was it was it was it was it was it was disrespectful it invades my
327280	350360	privacy this is your new step sister should invite her into our proper home
350360	379320	not going to battery you're such a bad influence
379320	400040	you see a brush there yeah hand it to her and you get back on the bed and you
400040	404520	watch what's about to happen to you in a few minutes
404520	410160	no too bad you want to go off you're the ones who took them off don't ask to put
410160	416360	clothes on now just don't know what you want okay now you know what to do
416360	422040	don't you hand me the brush and you ask me for the spanking you deserve with that
422040	425560	hair brush
425560	440760	oh please give me the spanking I deserve with that hair brush
440760	452240	I hope this stays in mind like that horrible image of the
452240	471960	talking heads home a bad image stay in mind
471960	473560	I'm very so bad.
473560	474880	I don't want to know.
474880	476360	I'm going to know.
476360	478000	I'm going to know.
478000	482000	I'm going to know.
482000	483000	Please.
483000	485800	Is this what it's?
485800	486800	Answer.
486800	489640	It's very cool, people.
489640	490960	I need help.
490960	491960	Help.
491960	492960	Help.
492960	493960	Help.
493960	494960	Help.
494960	495960	Help.
495960	496960	Help.
496960	498960	Help.
498960	500960	You lift up a little.
500960	503960	Help me.
503960	505160	Easy,osta.
505160	508160	Like this.
508160	512160	Help.
512160	514160	Help me.
514160	516160	Help.
516160	519160	Help.
519160	519760	Help me.
519760	520760	Help me.
520760	521760	Help me.
521760	522760	Help me.
522760	524760	Help me.
524760	527600	Help.
527600	530400	Help me.
530400	530900	Help!
532600	533480	Help!
534740	536540	Help!
544480	545240	Hey.
555160	556460	So, were you unless?
556540	557180	Yeah!
557280	558780	Were ever going to find the two of you here
558780	560380	Anywhere like that in the house?
560380	561380	No, I promise.
561380	562180	I promise.
562180	563180	Good.
563180	564180	Good.
564180	565180	Chase positions.
571180	573980	You're doing a little work.
573980	574980	I'm going to get over my lap.
574980	584980	This is how we do things in this house.
594980	596980	I don't know what's in store for you.
596980	603980	Don't you?
603980	626780	I hope you're going to learn how to behave in this house.
626780	629980	And you're not going to be walking around with the red bottom leg.
629980	631980	She knows all the time, right?
631980	633980	I miss my real mom.
633980	635980	I'm your mom now.
637980	639980	You live here now.
639980	641980	You're stuck here.
641980	665980	I can't do it here.
665980	671980	Is that enough of the spiking I've given her?
671980	673980	Is that enough of the spiking I've given her?
673980	675980	She doesn't deserve it.
675980	677980	That's what you do.
677980	679980	That's what you do.
679980	707980	Sit down for a moment.
707980	709980	Four deer.
709980	711980	Four deer.
711980	713980	We're not even halfway down.
713980	715980	No way.
715980	719980	Don't get too out of the breath.
727980	731980	Where's that brush going?
731980	739980	I need to hurt you.
739980	745980	And you tell her what to ask me since you heard that phrase so many times.
745980	749980	Ask her to give you the spiking.
749980	755980	Is that enough of the spiking I've given her?
755980	761980	Please, boys, give me the spiking to decide what that brush is.
761980	763980	Certainly what that is.
763980	767980	Oh, my God.
767980	769980	Oh!
769980	771980	Oh!
771980	773980	Oh!
773980	775980	Oh!
775980	777980	Oh!
777980	779980	There you see it all.
779980	781980	Her first word's about.
781980	783980	This one's stuck in it.
783980	791980	What the two of you were doing was really quite shocked.
791980	793980	I'm sorry.
793980	795980	I'm actually getting off lightly.
795980	797980	No!
797980	799980	No!
799980	801980	No!
801980	803980	No!
803980	805980	No!
805980	807980	No!
807980	809980	No!
809980	811980	No!
811980	813980	I see you were scoring the chip.
813980	815980	You're getting the light treatment off of her.
815980	817980	Over here.
817980	819980	There we go.
819980	821980	There we go.
821980	823980	No, it's funny yet.
823980	825980	No!
825980	827980	Oh!
827980	829980	No!
829980	831980	No!
831980	833980	No!
833980	835980	No!
835980	837980	No!
837980	839980	No!
839980	841980	No!
841980	843980	No!
843980	845980	No!
845980	847980	No!
847980	849980	No!
849980	851980	No!
851980	853980	No!
853980	855980	No!
855980	857980	No!
857980	859980	You're never going to poke if anybody else is drawing again.
859980	861980	I'm so sorry.
861980	865980	You're ever going to try anybody's laserion again.
865980	869980	And you're certainly never going to get naked with your skepticism again.
869980	871980	Are you?
871980	873980	Shock it.
873980	875980	No!
875980	877980	No!
877980	879980	No!
879980	881980	No!
881980	883980	No!
883980	885980	No!
885980	887980	No!
887980	889980	No!
889980	891980	No!
891980	893980	No!
893980	895980	No!
895980	897980	No!
897980	899980	No!
899980	901980	No!
901980	903980	No!
903980	905980	Yes.
905980	909980	I don't have my again.
909980	913980	And you get, stand up there with her.
913980	915980	You get some closer sounds.
915980	921980	You can rub those red bottles of yours and then you're going to get.
921980	923980	No!
923980	925980	No!
925980	927980	No!
927980	929980	No!
929980	931980	I'm really sorry.
931980	933980	I didn't think this would happen.
933980	935980	I was going to force you.
935980	937980	I'm going to force you.
937980	941980	You're pretty scared.
941980	943980	It hurts.
943980	945980	You know.
945980	947980	I mean, you're a bully.
947980	949980	It's got so many marks on you already.
949980	951980	She's evil.
951980	953980	She's a horrible mother.
953980	955980	Literally an evil stepmother.
955980	957980	I thought your mom was your like.
957980	983980	So nice."""]
# sample_text = ["strap on dildo fucking male strap on amateur forced feminization strap on bondage whipping caning dildo female domination cross dressing spanking humiliation sissy slut big tits MILF blonde BDSM i sissy training dildo blow job        "]
clean_sample_text = clean_text(sample_text[0])
sample_vec = vec_saved.transform(sample_text)
nb_saved.predict(sample_vec)

CPU times: user 1e+03 ns, sys: 1 µs, total: 2 µs
Wall time: 2.86 µs


array(['hasBadWords.False'], dtype='<U17')