In [None]:
# All imports
import re
import string
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd
import joblib

import nltk
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import PorterStemmer,LancasterStemmer  # using the Porter Stemmer and Lancaster Stemmer and others
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet
from sklearn.utils import shuffle
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, \
cohen_kappa_score, f1_score, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw JSON dataset (the original note also lists test_data.json as an
# alternative input file).
data = pd.read_json('../datasets/dataset.json')

# Binary target: 1 when `hasBadWords` equals True, 0 for every other value.
data['target'] = data['hasBadWords'].map(lambda flag: 1 if flag == True else 0)

# Working frame holding only the raw text and its label.
df = pd.DataFrame({'raw_text': data['text'], 'labels': data['target']})

# Earlier experiments (disabled) read '../datasets/current_train_data.csv' instead,
# shuffled it with RANDOM_STATE = 42 and kept only the first 1000 rows.

df.shape

In [None]:
# Preview the first rows of the frame built above (raw text plus binary label).
df.head()

In [None]:
# Text cleaning / pre-processing helpers.

# Corpus-specific noise tokens that contain digits. They have to be removed BEFORE
# the "letters only" step: once digits are stripped, '1080p' becomes 'p' and
# '4k' becomes 'k', so a stop-word lookup could never match them.
_NOISE_TOKEN_RE = re.compile(r"\b(?:14000kbps|1080p|4k|mp4|404|2022)\b", re.IGNORECASE)

# Purely alphabetic corpus-specific stop words, filtered together with the
# NLTK English stop-word list.
_EXTRA_STOP_WORDS = ("november", "email", "error", "hd")

# Lazily filled cache so the stop-word corpus is read and the lemmatizer is
# created once, not once per row.
_CLEANING_RESOURCES = {}


def _get_cleaning_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, building it on first use."""
    if not _CLEANING_RESOURCES:
        _CLEANING_RESOURCES["lemmatizer"] = WordNetLemmatizer()
        _CLEANING_RESOURCES["stop_words"] = frozenset(
            stopwords.words("english")
        ).union(_EXTRA_STOP_WORDS)
    return _CLEANING_RESOURCES["lemmatizer"], _CLEANING_RESOURCES["stop_words"]


def clean_text(text):
    """Clean and normalise one raw document for vectorisation.

    Steps, in order:
      1. strip HTML markup with BeautifulSoup (lxml parser);
      2. drop the digit-bearing noise tokens (e.g. '1080p', 'mp4', '404');
      3. replace every non-ASCII-letter character with a space;
      4. lower-case and split on whitespace;
      5. remove English and corpus-specific stop words;
      6. lemmatize the remaining tokens with WordNet.

    Parameters
    ----------
    text : str
        Raw document, possibly containing HTML. Any non-string value
        (e.g. None or NaN from a missing field) is treated as empty.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" when nothing is left).
    """
    # Missing values would otherwise crash inside BeautifulSoup.
    if not isinstance(text, str):
        return ""

    # 1. Removing html tags
    text = bs(text, "lxml").get_text()

    # 2. Removing digit-bearing noise tokens while the digits are still present
    text = _NOISE_TOKEN_RE.sub(" ", text)

    # 3. Retaining only alphabets.
    text = re.sub("[^a-zA-Z]", " ", text)

    # 4. Converting to lower case and splitting
    word_tokens = text.lower().split()

    # 5./6. Remove stopwords, then lemmatize what is left
    lemmatizer, stop_words = _get_cleaning_resources()
    return " ".join(
        lemmatizer.lemmatize(w) for w in word_tokens if w not in stop_words
    )

In [None]:
# Run every raw document through clean_text and store the result as `text`.
df["text"] = df["raw_text"].apply(clean_text)

In [None]:
# Keep only the cleaned text and its label. Selecting the columns already
# discards `raw_text`, and unlike an in-place drop it can be re-run safely.
df = df[['text', 'labels']]
df.head()

In [None]:
# Stratified 80/20 hold-out split with a fixed seed, so the label ratio is the
# same in both parts and the split is reproducible.
df_train, df_test = train_test_split(
    df,
    test_size=0.20,
    stratify=df['labels'],
    shuffle=True,
    random_state=42,
)

In [None]:
# Bag-of-n-grams features covering unigrams, bigrams and trigrams.
vec = CountVectorizer(ngram_range=(1, 3))

# The vocabulary is learned from the training texts only; the test texts are
# projected onto that same vocabulary.
X_train = vec.fit_transform(df_train['text'])
X_test = vec.transform(df_test['text'])

y_train, y_test = df_train['labels'], df_test['labels']

# Results with 30k rows (6,000-row test split)
Каппа-коэффициент Коэна:  0.8765533700929965
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5946
           1       0.98      0.80      0.88        54

    accuracy                           1.00      6000
   macro avg       0.99      0.90      0.94      6000
weighted avg       1.00      1.00      1.00      6000

# Results with 50k rows (10,000-row test split)
Каппа-коэффициент Коэна:  0.938671923783697
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9907
           1       0.97      0.91      0.94        93

    accuracy                           1.00     10000
   macro avg       0.98      0.96      0.97     10000
weighted avg       1.00      1.00      1.00     10000

# Using MLflow

In [None]:
import mlflow
import os

In [None]:
def eval_metrics(actual, pred):
    """Score predictions against the true labels.

    Parameters
    ----------
    actual : array-like
        Ground-truth values.
    pred : array-like
        Predicted values, aligned with ``actual``.

    Returns
    -------
    tuple
        ``(rmse, mae, acc)``: root mean squared error, mean absolute error
        and accuracy, in that order.
    """
    return (
        np.sqrt(mean_squared_error(actual, pred)),
        mean_absolute_error(actual, pred),
        accuracy_score(actual, pred),
    )

In [None]:
# Report the installed MLflow version, point the client at the local tracking
# server, then echo the URI actually in effect.
print(f'MLflow Version: {mlflow.version.VERSION}')
mlflow.set_tracking_uri('http://localhost:8990')
print(f'Tracking URI: {mlflow.tracking.get_tracking_uri()}')

In [None]:
# Resolve the experiment id: reuse the experiment when it already exists,
# create it otherwise. Looking it up first (instead of catching every exception
# raised by create_experiment) keeps genuine failures, such as an unreachable
# tracking server, visible instead of turning them into an AttributeError on None.
experiment_name = 'LogReg_Hyperparameters_Search'
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    exp_id = mlflow.create_experiment(name=experiment_name)
else:
    exp_id = experiment.experiment_id

In [None]:
# if 'BW' not in os.listdir():
#     os.mkdir('BW')
    
# solvers = ['liblinear', 'newton-cg', 'lbfgs','sag', 'saga']
# penalties = ['l1', 'l2']
# c_values = [5.0, 10.0, 100.0]
# # max_iter = [100, 1000, 2000, 10000]
# l1_ratios = [1.0, 0.2, 0.1, 0.01, 0.001]

# for c in c_values:
#     for l1_ratio in l1_ratios:
#         for penalty in penalties:
#             for solver in solvers:
#                 if solver == 'newton-cg':
#                     penalty = 'l2'
#                 with mlflow.start_run(experiment_id=exp_id):
#                     mlflow.log_artifacts('BW')
#                     lr = RidgeClassifier(
#                         C=c, 
#                         l1_ratio=l1_ratio, 
#                         penalty=penalty, 
#                         solver=solver,
#                         random_state=42, 
#                         fit_intercept=True
#                     )

#                     lr.fit(X_train, y_train)
#                     y_pred = lr.predict(X_test)

#                     rmse, mae, acc = eval_metrics(y_test, y_pred)
#                     cohen_kappa = cohen_kappa_score(y_test, y_pred)

#                     mlflow.log_param('C', c)
#                     mlflow.log_param('l1_ratio', l1_ratio)
#                     mlflow.log_param('solvers', solver)
#                     mlflow.log_param('penalty', penalty)

#                     mlflow.log_metric('rmse', rmse)
#                     mlflow.log_metric('mae', mae)
#                     mlflow.log_metric('acc', acc)
#                     mlflow.log_metric('Cohen Kappa', cohen_kappa)
#                 #     mlflow.log_metrics({'rmse': rmse, 'mae': mae, 'Cohen Kappa': cohen_kappa})

#                     mlflow.sklearn.log_model(lr, 'LogisticRegression_Model')

In [None]:
# Grid search over RidgeClassifier hyper-parameters. Every (alpha, solver) pair
# is trained on the training split, scored on the hold-out split and recorded
# as its own MLflow run (parameters, metrics and the fitted model).

# Directory whose contents are attached to every run as artifacts.
os.makedirs('BW', exist_ok=True)

solvers = ['cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
alphas = [0.1, 0.5, 10.0, 100.0, 1000.0]

for alpha in alphas:
    for solver in solvers:
        with mlflow.start_run(experiment_id=exp_id):
            mlflow.log_artifacts('BW')

            lr = RidgeClassifier(
                alpha=alpha,
                solver=solver,
                random_state=42,
                positive=False,
                fit_intercept=False,
            )
            lr.fit(X_train, y_train)
            y_pred = lr.predict(X_test)

            rmse, mae, acc = eval_metrics(y_test, y_pred)
            cohen_kappa = cohen_kappa_score(y_test, y_pred)

            mlflow.log_params({'alpha': alpha, 'solvers': solver})
            mlflow.log_metrics(
                {'rmse': rmse, 'mae': mae, 'acc': acc, 'Cohen Kappa': cohen_kappa}
            )

            # The artifact name is kept as-is so it matches the runs already
            # recorded, even though the estimator is a RidgeClassifier.
            mlflow.sklearn.log_model(lr, 'LogisticRegression_Model')

# NOTE(review): after the loop `lr` is the model from the LAST combination
# (alpha=1000.0, solver='saga'), not the best-scoring one. Later cells that call
# `lr.predict` therefore use that model; confirm this is intended.

# Searching runs

In [None]:
# List every run recorded for this experiment as a DataFrame.
mlflow.search_runs(experiment_ids=[exp_id])

In [None]:
# Keep only the runs whose logged accuracy is above 0.95.
search_query = 'metrics.acc > .95'
df_mlflow = mlflow.search_runs(
    experiment_ids=[exp_id],
    filter_string=search_query,
)
df_mlflow.head()

In [None]:
# Print the id of every filtered run that ties for the lowest MAE.
print('Run IDs with least MAE \n')

lowest_mae = df_mlflow['metrics.mae'].min()
has_lowest_mae = df_mlflow['metrics.mae'] == lowest_mae

for run in df_mlflow.loc[has_lowest_mae, 'run_id'].values:
    print(run, end=', ')

# Getting details of a run

In [None]:
# Fetch the full record of a single run through the low-level tracking client.
# NOTE(review): the run id is hard-coded, so this lookup only resolves against a
# tracking server that holds that run. Replace it with an id from the search
# results when running elsewhere.
client = mlflow.tracking.MlflowClient()
client.get_run(run_id='a415049b6f5e463c9266e37b6632a71b')

# Serving the model

In [None]:
# mlflow models serve --model-uri mlflow-artifacts:/161047726410345551/92b54a97fc064b7daf2c2bb48593b99a/artifacts/LogisticRegression_Model/model.pkl -p 5600

In [None]:
%time
# sample_text = ["""trampling footfetish crushing footjob feet femdom extreme smoking facestanding facesitting mistress heels boots worship domination bdsm whip torture girls women slave bondage dangling humiliation submission ballbusting spanking facetrampling whipping sandals shoes stiletto pain sadism sadistic princess queen facestanting facebusting spitting nipple torture barefeet licking supremacy skin leather pvc latex marks tight high metal jumps footjobs slapping spanking handcuff mistress slave daemonia sapphiria the balck whip lady juliette lilith babalon  deabolika vanesia hypnos candy sadik angels sadik girls womenweight mellyfeet mellydom italian padrone lucignolo bad badcloster badchoice mysapce erotismo erotic nude"""]
# sample_text = ["""jerk instruction asian goddess masturbation encouragement gag talk asian goddess asian princess femdom female domination female supremacy gag talk cleave gag joi jerk instruction masturbation instruction jerk encouragement astrodomina damsel distress rope bondage tape bondage"""]
# sample_text = ["""Well now, buxom powerhouse beauty, Charlie, must have been eating cheese or something equally ‘mind-influencing' shortly before bedtime, ‘cause she's having some mondo creepyass dreamage! All to do with absolute captivity, you know, tight bondage and even tighter gags? As she fitfully starts and stutters, gagged and bound alongside her best friend, blonde saucepot, Hannah, for some bizarre reason, her dreams shift and suddenly she's alone and facedown on the bed, struggling intensely, a mega tight black cloth cleave gag keeping her completely muzzled, and yet still she gag-enthuses passionately, wriggling hard like a fish out of the bowl and totally tied up and helpless... then the dream shifts back again and she's coming around alongside her equally scantily clad friend, who is gagged firmly with tape. As the pair - is this still a dream, or what? - come around and realise they are bound and gagged; cue yet another wondrous moment of sexy wriggling, gag talking and energised attempts to escape the stringent ropes. But, of course, they are too tight! And now all these hapless stunners can do is writhe and moan in their underwear on the bed, pondering who, in Charlie's dream, has done this to them! When their twilit captor finally shows himself, the furious babes protest miraculously into their gags, but he ignores them. In Charlie's dreams he is a dark burglar, intent on molesting them, and relieving them of all their homely possessions. Nice. And, as if to demonstrate, he forces a  new gag around the indignant yapper of Charlie as Hannah struggles on, and unleashes her truly magnificent boobies, bouncing them around for his total pleasure, Then the newly bandanna cleave gagged babe is left again to struggle and strain next to her similarly now boob-revealed chum. Both girls are first tickled and spanked and then shut away to continue their tied up endeavours. 
After a while of very tasty squirming, they suddenly hatch the utterly unintelligible (and this is precisely why it's so damned sexy) plot to wriggle to their feet and hop away. This they ultimately acheive in one incredibly potent scene but are soon recaptured and both carried jiggling and protesting HARD over the bad guy's shoulder. The ever-resisting hotties are shoved rudely against the bedroom wall and left again to twist and buck, shaking their massive tits all over the place. Next up, as a punishment for their wholly unacceptable escape attempts, the pair are firmly trussed in a hallway and massively gagged with hardcore microfoam tape wraparounds. FEEL them cheeks BULGE, Peeps! Wow, this dream is rapidly descending into the stuff of nightmare - for them. For US, it's the ultimate thrill, right? And, as the amazingly barefoot honnies struggle and emote with total sexiness, we must pinch ourselves every once in a while, surely, to see if we really are awake ourselves.... It doesn't get much more exciting that this, as these two Herculean beauties smash it again and again as the ultimate pair of feisty damsels, and for our complete pleasure.****
# Wow, this incredibly kinky dream of Young Charlie's seems to have been going on a long time, and ratcheting up in intensity as it progresses to boot! Now, the phantom burglar has left the babes bound and gagged in an upstairs hallway where they pluckily communicate with quite scintillating vigour to one another through their massively suppressive microfoam tape wrap muzzles. They cannot say a WORD. Perfecto! What they CAN do is mmmmpppphhhh like crazy and jiggle their massive boobs around while also flexing their sexy bare feet as they struggle and strain, trying urgently to loosen those damned ropes What they actually succeed in doing is tightening them even more, and now things are getting more nightmarish as the ethereal (get the picture?) crook materialises once more and ropes both hotties one by one into complete and ruthlessly close hogties, but not before manhandling and abusing them both with pervy feel ups and spankings, even wrapping his rope tightly around Charlie's massive boobs, squeezing them ultra tightly as she protests indignity. Poor Hannah, roped into her hogtie already, can't even turn around to see what the creep is doing to her friend. The totally incapacitated duo are soon left to struggle passionately, facedown on the rug, wriggling and grunting with the pure exertion of bouncing around on their boobs, their sexy legs drawn up uncomfortably behind them in these wicked-close hogties! No escape, just an urgent wigglefest as the babes try repeatedly - and furiously - to communicate with one another through their overwhelming tape wrap gags. Later, the bad guy is back, and to add to their current woes, repeatedly winds black electricians' tape over the top of their already mondo-gags. This guy must be nuts, but, we are most grateful for his creativity, no doubt. Now the utterly helpless stunners must struggle and roll and twist their way through the final phases of Charlie's incredibly hot bondage dream. 
We wonder when she'll ever wake up, but would keenly encourage her to continue napping if these are the kind of night-time fancies she experiences. Good for you, Charlie, and thank you, you saucy little beauts for putting on one hell of a lucid (for a dream) action show as you galvanically buck and moan your respective ways to bondage supremacy. Our collective pulse is now through the roof, thanks to you, gals! Happy Effing New Year, Folks. Just  you WAIT to see what thrills 2014 holds here at Borderland Bound!!"""]
sample_text = ["strap on dildo fucking male strap on amateur forced feminization strap on bondage whipping caning dildo female domination cross dressing spanking humiliation sissy slut big tits MILF blonde BDSM i sissy training dildo blow job        "]
# Apply the same pre-processing that the training texts went through.
clean_sample_text = clean_text(sample_text[0])

# Vectorise the CLEANED text, not the raw sample: the vectorizer's vocabulary
# was learned from clean_text output, so raw text would not match it.
sample_vec = vec.transform([clean_sample_text])
pred = lr.predict(sample_vec)

print("Has bad word" if pred[0] == 1 else "Clear text")


In [None]:
# Take the first 100 rows of the CSV as a small sample to score.
# NOTE(review): the file is named current_train_data.csv. Confirm these rows
# were not part of the data the model was trained on.
df_val = pd.read_csv('../datasets/current_train_data.csv').head(100)

In [None]:
# Clean the raw validation texts and keep only the cleaned text and the label.
df_val = df_val.assign(text=df_val['raw_text'].map(clean_text))[['text', 'labels']]
df_val.head()

In [None]:
# Score the validation texts with the model trained above.
# `vec` must stay the CountVectorizer that was fitted on the training split:
# creating a new one, or calling fit_transform here, would learn a different
# vocabulary, and the resulting columns would not match the features `lr`
# was trained on. So the texts are only transformed.
str_to_vec = df_val['text']
vectors = vec.transform(str_to_vec)
df_val['predict'] = lr.predict(vectors)
df_val.head()