In [1]:
import pandas as pd
import numpy as np
import multiprocessing
from bs4 import BeautifulSoup as bs
from selectolax.parser import HTMLParser
import re
import pymorphy2
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import string

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from time import time  # To time our operations
from collections import defaultdict, Counter  # For word frequency

# Load the TensorBoard notebook extension
%load_ext tensorboard

# import logging  # Setting up the loggings to monitor gensim
# logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

import warnings
warnings.filterwarnings('ignore')

In [None]:
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Data loading and preparation
data = pd.read_json('../datasets/dataset.json')
mapping = {False: 0, True: 1}
data.replace({'hasBadWords': mapping}, inplace=True)
# data.hasBadWords = data.hasBadWords.apply(lambda x: 1 if x == True else 0)
data.rename(columns={"hasBadWords": "labels"}, inplace=True)
data.drop(['violation'], axis=1, inplace=True)
data.shape

In [None]:
data.head()

# Preprocessing function

In [None]:
def preprocess(text, stop_words, punctuation_marks):
    """Lower-case and tokenize `text`, then drop unwanted tokens.

    Args:
        text: Raw string to clean.
        stop_words: Iterable of tokens to discard.
        punctuation_marks: Iterable of tokens to discard.

    Returns:
        The surviving tokens, in their original order, joined by single spaces.

    A token is kept only when it appears in neither collection, so the result
    does not depend on which of the two filter arguments a given entry is in.
    """
    # Merge both filters into one set: a single hash lookup per token instead of
    # two linear scans over lists.
    excluded = set(stop_words) | set(punctuation_marks)
    # Lemmatization is currently disabled, so tokens are compared as-is.
    return ' '.join(token for token in word_tokenize(text.lower()) if token not in excluded)

# Tokens that `preprocess` removes as punctuation / noise.
punctuation_marks = [
    '!', ',', ';', ':', '(', ')', '-', '--', '?', '@', '....', '~',
    '.', '..', '...', '....................', '<', '>', '=', '»', '|', '’', '`', '+', '$',
    '&', '#', '+++', '*', '``', '%', '[', ']', '{', '}', '√©',
]

# NLTK English stop words followed by extra noise tokens.
stop_words = [
    *stopwords.words('english'),
    '14000kbps', 'november', '1080p', '4k', 'mp4', 'error', '404', '2022',
]
# pymorphy2 analyzer configured with lang='uk'; in the visible cells it is only
# referenced by commented-out lemmatization code.
morph = pymorphy2.MorphAnalyzer(lang='uk')

In [None]:
def normalize(s):
    """
    Lower-case a text and rewrite it into a more uniform form.

    Steps, in order: IPv4-looking number groups become ` _ip_ `; selected
    punctuation is padded with spaces; a few separator characters become a
    space; `&`, `@` and digits are spelled out as space-padded words.
    """
    ip_pattern = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
    padded_punctuation = r'([.\(\)\!\?\-\\\/\,])'
    dropped_characters = r'([\;\:\|•«\n])'
    # Each source is a single character and no replacement text contains another
    # source character, so one translate pass gives the same result as replacing
    # them one after another.
    spelled_out = str.maketrans({
        '&': ' and ',
        '@': ' at ',
        '0': ' zero ',
        '1': ' one ',
        '2': ' two ',
        '3': ' three ',
        '4': ' four ',
        '5': ' five ',
        '6': ' six ',
        '7': ' seven ',
        '8': ' eight ',
        '9': ' nine ',
    })

    text = re.sub(ip_pattern, ' _ip_ ', s.lower())
    text = re.sub(padded_punctuation, r' \1 ', text)
    text = re.sub(dropped_characters, ' ', text)
    return text.translate(spelled_out)

In [None]:
data[:1000].apply(lambda row: str(row.text), axis=1)

In [None]:
# data[:100].apply(lambda row: bs(row['text']).get_text().replace('\n',' '),axis=1)

In [None]:
# Preview the preprocessing on the first 100 rows. The filter lists are passed by
# keyword so each one reaches the `preprocess` parameter of the same name.
data[:100].apply(
    lambda row: preprocess(row.text, stop_words=stop_words, punctuation_marks=punctuation_marks),
    axis=1,
)

In [None]:
# Drop every row whose text contains one of these fragments (markup remnants,
# long runs of spaces, mis-encoded characters, links).
discard = ["<div ", "<p ", "<span ", "<p>", "<div>", "<h", "<input ", "center>", "<a ", 
           "<td>", "<", ">", r"              ", "Ø", '√ú', 'http://']

# Escape each fragment so it is always matched literally, even if a future entry
# contains a regex metacharacter such as "(" or "+".
discard_pattern = '|'.join(re.escape(fragment) for fragment in discard)
# na=True treats rows with a missing text as matches, so they are dropped as well
# instead of leaving NaN in the boolean mask.
# .copy() makes the result an independent frame for the column assignments that follow.
data = data[~data.text.str.contains(discard_pattern, na=True)].copy()

In [None]:
# Extract the plain text with BeautifulSoup, then turn Windows line breaks and
# slashes into spaces. The former `.replace('"', '\"')` step was removed: '\"' is
# the same one-character string as '"', so it never changed anything.
data['text'] = data.text.apply(
    lambda raw: bs(raw, 'lxml').get_text().replace('\r\n', ' ').replace('/', ' ')
)
# data['text'] = data.apply(lambda row: HTMLParser(row.text).body.text(separator=' ').replace('\r\n',' '),axis=1)

In [None]:
# Clean every row. The filter lists are passed by keyword so each one reaches the
# `preprocess` parameter of the same name.
data['text_preprocessed'] = data.text.apply(
    lambda text: preprocess(text, stop_words=stop_words, punctuation_marks=punctuation_marks)
)

In [None]:
data

In [None]:
data = data[['text_preprocessed', 'labels', 'text']] # columns reorder

In [None]:
data.head()

In [None]:
# data.to_csv('../datasets/clear_text.csv')

# Tensorflow

In [None]:
# Count unique words
def counter_word(text_col):
    """Return a Counter of whitespace-separated words across all texts in `text_col`.

    `text_col` is expected to expose `.values` yielding strings (e.g. a pandas Series).
    """
    return Counter(word for text in text_col.values for word in text.split())

In [None]:
counter = counter_word(data.text_preprocessed)  

In [None]:
data.text_preprocessed

In [None]:
num_unique_words = len(counter)
oov_token = '<UNK>'

In [None]:
num_unique_words

In [None]:
counter

In [None]:
counter.most_common(150)

In [None]:
# Word cloud of the 50 most frequent tokens.
from wordcloud import WordCloud  # explicit name instead of a wildcard import

# most_common() already returns (word, count) pairs, so no extra copy is needed.
word_freq = counter.most_common(50)
wd = WordCloud(background_color='white')
wd.generate_from_frequencies(dict(word_freq))
plt.figure()
plt.imshow(wd, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Sequential 80/20 split: the first 80% of rows train the model, the rest validate it.
# NOTE(review): rows are not shuffled — confirm the dataset is not ordered by label.
train_size = int(data.shape[0] * .8)

# .copy() gives each split its own frame, so adding a column to `train` later does
# not write into a slice of `data`.
train = data.iloc[:train_size].copy()
val = data.iloc[train_size:].copy()

# Features and labels of the train and validation splits as NumPy arrays

X_train = train.text_preprocessed.to_numpy()
y_train = train.labels.to_numpy()

X_val = val.text_preprocessed.to_numpy()
y_val = val.labels.to_numpy()

In [None]:
train.text_preprocessed.to_numpy()

In [None]:
X_train,  y_train

In [None]:
X_train.shape, X_val.shape

# Tokenize

In [None]:
# vectorize a text corpus by turning each text into sentence of integers

tokenizer2 = Tokenizer(num_words=num_unique_words, oov_token=oov_token)
tokenizer2.fit_on_texts(X_train)

In [None]:
import pickle
from pathlib import Path

tokenizer_path = Path('../models/tokenizer.pickle')
# Create the target directory first so the dump cannot fail on a fresh checkout.
tokenizer_path.parent.mkdir(parents=True, exist_ok=True)

# saving
with open(tokenizer_path, 'wb') as handle:
    pickle.dump(tokenizer2, handle, protocol=pickle.HIGHEST_PROTOCOL)

# loading (round trip of the file written just above; unpickling can execute code,
# so only load pickle files from a trusted source)
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
# each word have a unique index
word_index = tokenizer.word_index
word_index

In [None]:
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_val_sequences = tokenizer.texts_to_sequences(X_val)

In [None]:
print(X_train[1])
print(X_train_sequences[1])

In [None]:
# Bring every sequence to one common length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Number of tokens in the longest training sequence
max_length = max(map(len, X_train_sequences)) #256 #20

# Both splits are padded and truncated at the end, to the same length.
pad_options = dict(maxlen=max_length, padding="post", truncating="post")
X_train_padded = pad_sequences(X_train_sequences, **pad_options)
X_val_padded = pad_sequences(X_val_sequences, **pad_options)

X_train_padded.shape, X_val_padded.shape


In [None]:
X_train_padded[10]

In [None]:
print(X_train[10])
print(X_train_sequences[10])
print(X_train_padded[10])

In [None]:
# Check reversing the indices

# Invert the tokenizer's word -> index mapping into index -> word
reverse_word_index = {idx: word for word, idx in word_index.items()}

In [None]:
reverse_word_index

In [None]:
def decode(sequence):
    """Turn a sequence of word indices back into text; unknown indices become "?"."""
    words = (reverse_word_index.get(idx, "?") for idx in sequence)
    return " ".join(words)

decoded_text = decode(X_train_sequences[10])

print(X_train_sequences[10])
print(decoded_text)

In [None]:
# Create LSTM model
from tensorflow.keras import layers

# tf.debugging.set_log_device_placement(True)

# Layer stack, in order:
#   1. Embedding - maps each word index to a dense vector of 32 floating point values.
#      Word embeddings are an efficient, dense representation in which similar words
#      have a similar encoding, and the encoding does not have to be specified by hand.
#      The layer takes an integer matrix of size (batch, input_length); the largest
#      integer (i.e. word index) in the input should be no larger than the vocabulary
#      size. Its output shape is (None, input_length, 32), where `None` is the batch
#      dimension.
#   2. LSTM - 64 units, dropout=.1.
#   3. Dense - one sigmoid unit.
model = keras.models.Sequential([
    layers.Embedding(num_unique_words, 32, input_length=max_length),
    layers.LSTM(64, dropout=.1),
    layers.Dense(1, activation='sigmoid'),
])

model.summary()

In [None]:
from pathlib import Path
# Directory that receives the checkpoint files.
path = Path('../models/bad_words.model')
# parents=True also creates ../models when it does not exist yet; without it the
# call raises FileNotFoundError on a fresh checkout.
path.mkdir(parents=True, exist_ok=True)
# The {epoch} and {val_loss} placeholders are filled in for each saved file.
cpt_filename = '{epoch:02d}_checkpoint_{val_loss:.2f}.hdf5'
cpt_path = str(path / cpt_filename)

# Monitor validation loss and keep only checkpoints that improve on it (lower is better).
checkpoint = tf.keras.callbacks.ModelCheckpoint(cpt_path, monitor='val_loss', verbose=1, 
                                                save_best_only=True, mode='min')

In [None]:
loss = keras.losses.BinaryCrossentropy(from_logits=False)
optimizer = keras.optimizers.legacy.Adam(learning_rate=.001)
metrics = ['accuracy']

model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

In [None]:
model.fit(X_train_padded, y_train, epochs=15, validation_data=(X_val_padded, y_val), verbose=1, 
          callbacks=[checkpoint])

In [None]:
# model.get_weights()

In [None]:
history_df = pd.DataFrame(model.history.history)
history_df.head(15)

In [None]:
import matplotlib.pyplot as plt
plt.plot(history_df.loss);

In [None]:
# NOTE(review): this checkpoint name is hard-coded, while checkpoint file names are
# generated from the epoch number and val_loss of each training run — confirm the
# file exists before running this cell.
model.load_weights('../models/bad_words.model/04_checkpoint_0.04.hdf5') # loading weights into the model created earlier
# Rebinds `loss`, which until here held the loss object passed to compile().
loss, acc = model.evaluate(X_val_padded, y_val)
print(f'Accuracy of restored model {acc*100:.2f}%')

In [None]:
model = tf.keras.models.load_model('../models/bad_words.model/04_checkpoint_0.04.hdf5')
loss, acc = model.evaluate(X_val_padded, y_val)
print(f'Accuracy of restored model {acc*100:.2f}%')

In [None]:
!ls '../models/bad_words.model'

In [None]:
# Scores predicted for the training set.
# assumes one score per row (single sigmoid output) — TODO confirm for the loaded model
probabilities = model.predict(X_train_padded)
# Threshold all scores in one vectorized step instead of a Python loop over
# one-element arrays; the result is still a plain list of 0/1 ints.
predictions = (probabilities.ravel() > .5).astype(int).tolist()

In [None]:
X_train_padded[0]

In [None]:
print(X_train[2000:2010])
print(y_train[2000:2010])
print(predictions[2000:2010])

In [None]:
# assign() returns a new frame, so the predictions are never written into a frame
# that was sliced out of `data`; the column selection then fixes the column order.
train = train.assign(predictions=predictions)[['text_preprocessed', 'labels', 'predictions', 'text']]

In [None]:
train.head()

In [None]:
train.to_csv('../datasets/wo_html.csv')

In [None]:
model.save('../models/modelSequential_wo_HTML.h5')

In [None]:
# model_new = keras.models.load_model('../models/modelSequential_wo_HTML.h5')
model_new = keras.models.load_model('../models/bad_words.model/04_checkpoint_0.04.hdf5')

In [None]:
predictions = model_new.predict(X_train_padded)
predictions = [1 if p > .5 else 0 for p in predictions]

# Check sentence

In [None]:
raw_text = "Welcome to Jandro The Tickler. What \
          you're seeing here is completely real! So here's the premise: Husbands, \
          boyfriends, friends, etc, hire me, \"The Tickler\", to show \
          up at a specified location at a specific time with one mission: Find \
          the girl, tie her up, surprise her, and tickle the hell out of her! \
          Sometimes the girls are in the know, and sometimes they're not:) The \
          bonds are real, the reactions are certainly real, and the tickle tools \
          are 100% real. The end result is usually a surprised, shocked, tortured, \
          turned on, worn out girl, with hardly the strength to wave at the camera \
          before I exit:) I basically wanted to combine Tickle Torture with Foot, \
          Sleepy, Voyeur, Light Bondage, and even Forced O. The premise allows \
          all of these to take place per vid."
# Strip markup from the sample, clean it with `preprocess`, and convert it to token ids.
test_text_preprocessed = bs(raw_text, 'lxml').get_text().replace('\r\n',' ')
# The filter lists are passed by keyword so each one reaches the parameter of the same name.
test_text_preprocessed = preprocess(
    test_text_preprocessed, stop_words=stop_words, punctuation_marks=punctuation_marks
)
# texts_to_sequences expects a collection of texts, hence the one-element array.
test_text_np = np.array([test_text_preprocessed])
test_sequences = tokenizer.texts_to_sequences(test_text_np)

In [None]:
print(test_text_preprocessed)
print(test_sequences[0])

In [None]:
# Pad the sample to the training sequence length (`max_length`), not to the length
# of this one sentence, so the model receives inputs of the same width it was
# trained on.
max_len = max_length

test_sequences_padded = pad_sequences(test_sequences, maxlen=max_len, padding="post", truncating="post")

In [None]:
print(test_text_preprocessed)
print(test_sequences_padded)

In [None]:
%%time
predictions = model_new.predict(test_sequences_padded)
predictions = [1 if p > .5 else 0 for p in predictions]

In [None]:
predictions

In [None]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
import pickle
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from time import time  # To time our operations
from collections import defaultdict, Counter  # For word frequency

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

import warnings
warnings.filterwarnings('ignore')

from time import time  # To time our operations
from collections import defaultdict, Counter  # For word frequency

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

import warnings
warnings.filterwarnings('ignore')

def preprocess(text, stop_words, punctuation_marks):
    """Lower-case and tokenize `text`, then drop unwanted tokens.

    Args:
        text: Raw string to clean.
        stop_words: Iterable of tokens to discard.
        punctuation_marks: Iterable of tokens to discard.

    Returns:
        The surviving tokens, in their original order, joined by single spaces.

    A token is kept only when it appears in neither collection, so the result
    does not depend on which of the two filter arguments a given entry is in.
    """
    # Merge both filters into one set: a single hash lookup per token instead of
    # two linear scans over lists.
    excluded = set(stop_words) | set(punctuation_marks)
    # Lemmatization is currently disabled, so tokens are compared as-is.
    return ' '.join(token for token in word_tokenize(text.lower()) if token not in excluded)

# Tokens removed as punctuation / noise. Every entry needs its own trailing comma:
# Python concatenates adjacent string literals, and a missing comma after '¬ß' used
# to merge it with '.', which dropped '.' from the list.
punctuation_marks = ['!', ',', ';', ':', '(', ')', '-', '--', '?', '@', '....', '~', '¬ß',
                     '.', '..', '...', '....................', '<', '>', '=', '»', '|', '’', '`', '+', '$',
                     '&', '#', '+++', '*', '``', '%', '[', ']', '{', '}', '√©', '√™', '¬†', '√¢']


# NLTK English stop words followed by extra tokens to discard.
# NOTE(review): `preprocess` lower-cases the text before tokenizing, so the mixed-case
# entry 'Error 404' can never equal a token — confirm whether it is still needed.
stop_words = stopwords.words('english') + ['14000kbps', 'https', "'s", "'m", 'http', 'mp4', 'error', '404',
                                          'Error 404', '404error']

# loading
with open('../models/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
    
model = keras.models.load_model('../models/modelSequential_wo_HTML.h5')

raw_text = 'nigger'


def predict(sequences, maxlen=None):
    """Classify already-tokenized texts with the notebook-level `model`.

    Args:
        sequences: Integer token-id sequences, e.g. from `tokenizer.texts_to_sequences`.
        maxlen: Length to pad/truncate every sequence to. When omitted, the
            notebook-level `max_length` is used, which must be defined before the call.

    Returns:
        A list with one label per input sequence: 1 when the score is above 0.5, else 0.
    """
    if maxlen is None:
        maxlen = max_length
    sequences_padded = pad_sequences(sequences, maxlen=maxlen, padding="post", truncating="post")
    scores = model.predict(sequences_padded)
    # Threshold all scores at once; assumes one score per row — TODO confirm.
    return (scores.ravel() > .5).astype(int).tolist()

# Data loading and preparation
data = pd.read_json('../datasets/neil_ProducerClipSite_rand.json')
mapping = {False: 0, True: 1}
# data.replace({'hasBadWords': mapping}, inplace=True)
# data.hasBadWords = data.hasBadWords.apply(lambda x: 1 if x == True else 0)
# data.rename(columns={"hasBadWords": "labels"}, inplace=True)
data.drop(['SiteID', 'Title', 'Description', 'Keywords', 'Bottom'], axis=1, inplace=True)
data.head()



In [None]:
data.Top

In [None]:
%%time
# Extract plain text from the `Top` field, then clean it with `preprocess`. The filter
# lists are passed by keyword so each one reaches the parameter of the same name.
data['text'] = data.apply(lambda row: bs(row.Top, 'lxml').get_text().replace('\r\n',' ').replace('.', ''), axis=1)
data['text_preprocessed'] = data.apply(
    lambda row: preprocess(row.text, stop_words=stop_words, punctuation_marks=punctuation_marks),
    axis=1,
)

# Tokenize first: the sequences must exist before anything is derived from them.
X_val = data.text_preprocessed.to_numpy()
X_val_sequences = tokenizer.texts_to_sequences(X_val)

# Pad to the sequence length recorded in the loaded model rather than to a length
# derived from this dataset. Assumes the saved model stores a fixed input length —
# when it does not (None), the previously hard-coded 256 is used. TODO confirm.
max_length = model.input_shape[1] or 256
X_val_padded = pad_sequences(X_val_sequences, maxlen=max_length, padding="post", truncating="post")

# Threshold all scores at once; the result is a plain list of 0/1 ints.
predictions = (model.predict(X_val_padded).ravel() > .5).astype(int).tolist()


In [None]:

X_val_padded[100:102]


In [None]:
data['predictions'] = predictions

In [None]:
data.head()

In [None]:
data = data[['Top', 'predictions']]

In [None]:
data.to_csv('../datasets/last_one.csv')

In [None]:
data.to_json('../datasets/last_one.json')