In [1]:
%matplotlib inline
from __future__ import print_function
import collections
import math
import numpy as np
from bs4 import BeautifulSoup as bs
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from sklearn.manifold import TSNE
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pymorphy2
import pandas as pd 
import six

import warnings
warnings.filterwarnings('ignore')

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/adwiz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/adwiz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from subprocess import check_output
print(check_output(["ls", "../datasets/"]).decode("utf8"))

App
Machine-Learning-Book-Ratings
anna_words.txt
bad_words.txt
bank-full.csv
bw.csv
cat_pred.csv
check_false_sentenses.csv
check_sentenses.csv
clear_text.csv
coin_prices.db
data.txt
dataset.json
dependent_t.csv
df.parquet
df4.xlsx
df5.csv
df6.json
df7.csv
df_clean.csv
df_out.csv
df_types.csv
df_types_upd.csv
form_LP001014.json
form_LP001024.json
iTunes_api.csv
iTunes_api.xlsx
independent_t_student.csv
insurance.csv
last_one.csv
last_one.json
loan_train.csv
main.py
mann_whitney.csv
neil_ProducerClipSite_rand.json
other_partner_data.csv
partner_data.csv
partner_data_next_part.csv
partner_data_records.json
partner_data_records_cp1251.json
partner_data_semicolon.csv
pics
pima-indians-diabetes.data.csv
pima-indians-diabetes.data.csv.1
pima-indians-diabetes.data.csv.2
practice4.csv
practice_5.1.csv
practice_5.2.csv
prance-8.jpg
recdemo.csv
signed_wilcoxon.csv
stroke_data.csv
stroke_data_encoded.csv
submission.csv
test.csv
test_data.json
transfusion_main.csv
transfusion_oot.csv
vehicles_datas

In [4]:
def read_train_data():
    df_train = pd.read_json('../datasets/dataset.json')
    mapping = {False: 0, True: 1}
    df_train.replace({'hasBadWords': mapping}, inplace=True)
    df_train.rename(columns={"hasBadWords": "labels"}, inplace=True)
    df_train.drop(['violation'], axis=1, inplace=True)
    print('Data size %d' % len(df_train))
    print('Data headers %s' % df_train.columns.values)
    return df_train

In [5]:
df_train = read_train_data()

Data size 86439
Data headers ['text' 'labels']


# Preprocessing function

In [6]:
def preprocess(text, stop_words, punctuation_marks): #, morph):
    tokens = word_tokenize(text.lower())
    preprocessed_text = []
    for token in tokens:
        if token not in punctuation_marks:
            lemma = token #morph.parse(token)[0].normal_form
            if lemma not in stop_words:
                preprocessed_text.append(lemma)
    return ' '.join(preprocessed_text)

punctuation_marks = ['!', ',', ';', ':', '(', ')', '-', '--', '?', '@', '....', '~',
                     '.', '..', '...', '....................', '<', '>', '=', '»', '|', '’', '`', '+', '$',
                     '&', '#', '+++', '*', '``', '%', '[', ']', '{', '}', '√©']

stop_words = stopwords.words('english') + ['14000kbps', 'november', '1080p', '4k', 'mp4', 'error', '404', '2022']
morph = pymorphy2.MorphAnalyzer(lang='uk')

In [7]:
# Dropping the rows with "<div"
# identify partial string
discard = ["<div ", "<p ", "<span ", "<p>", "<div>", "<h", "<input ", "center>", "<a ", 
           "<td>", "<", ">", r"              ", "Ø", '√ú', 'http://']
  
df_train = df_train[~df_train.text.str.contains('|'.join(discard))]

In [8]:
df_train['text'] = df_train.apply(lambda row: bs(row.text, 'lxml').get_text().replace('\r\n', ' ').replace('/', ' ').replace('"', '\"'), axis=1)

In [9]:
df_train['text'] = df_train.apply(lambda row: preprocess(row.text, punctuation_marks, stop_words), axis=1)

In [10]:
df_train.head()

Unnamed: 0,text,labels
0,favorite slut,0
1,girlfriends sit 's faces asses,0
2,bound beauty kisses girlfriend,0
3,morgan anytime nail painting slave 's face,0
4,transgender coaching wmv part 1,0


In [11]:
def extract_words(df_train):
    words = list()
    for index, row in df_train.iterrows():
        q1 = row['text']
        if not q1 or not isinstance(q1, six.string_types):
            continue
        q_words = q1.split()
        for word in q_words:
            words.append(word)
    return words
vocabulary_size = 500000
words = extract_words(df_train)
print('Number of words: %d' % len(words))

Number of words: 941214


In [12]:
def build_dataset(words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    print("size of count")
    print(len(count))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count = unk_count + 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])
del words  # Hint to reduce memory.

size of count
46058
Most common words (+UNK) [['UNK', 0], ('fetish', 21782), ('foot', 16165), ('feet', 13306), ('domination', 9319)]
Sample data [2229, 110, 1845, 709, 39, 887, 761, 132, 550, 1353]


In [13]:
data_index = 0

In [14]:
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1 # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [ skip_window ]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels

In [15]:
print('data:', [reverse_dictionary[di] for di in data[:8]])

data: ['favorite', 'slut', 'girlfriends', 'sit', "'s", 'faces', 'asses', 'bound']


In [16]:
for num_skips, skip_window in [(2, 1), (4, 2)]:
    data_index = 0
    batch, labels = generate_batch(batch_size=8, num_skips=num_skips, skip_window=skip_window)
    print('\nwith num_skips = %d and skip_window = %d:' % (num_skips, skip_window))
    print('    batch:', [reverse_dictionary[bi] for bi in batch])
    print('    labels:', [reverse_dictionary[li] for li in labels.reshape(8)])


with num_skips = 2 and skip_window = 1:
    batch: ['slut', 'slut', 'girlfriends', 'girlfriends', 'sit', 'sit', "'s", "'s"]
    labels: ['girlfriends', 'favorite', 'sit', 'slut', "'s", 'girlfriends', 'faces', 'sit']

with num_skips = 4 and skip_window = 2:
    batch: ['girlfriends', 'girlfriends', 'girlfriends', 'girlfriends', 'sit', 'sit', 'sit', 'sit']
    labels: ['sit', 'favorite', 'slut', "'s", 'girlfriends', 'slut', 'faces', "'s"]


In [17]:
batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 4  # How many words to consider left and right.
num_skips = 2  # How many times to reuse an input to generate a label.
# We pick a random validation set to sample nearest neighbors. here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 64  # Number of negative examples to sample.

graph = tf.Graph()

In [24]:
with graph.as_default(), tf.device('/cpu:0'):
    # Input data.
    train_dataset = tf.compat.v1.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.compat.v1.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Variables.
    embeddings = tf.Variable(
        tf.random.uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    softmax_weights = tf.Variable(
        tf.random.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Model.
    # Look up embeddings for inputs.
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)
    # Compute the softmax loss, using a sample of the negative labels each time.
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=embed,
                                   labels=train_labels, num_sampled=num_sampled, num_classes=vocabulary_size))

    # Optimizer.
    # Note: The optimizer will optimize the softmax_weights AND the embeddings.
    # This is because the embeddings are defined as a variable quantity and the
    # optimizer's `minimize` method will by default modify all variable quantities
    # that contribute to the tensor it is passed.
    # See docs on `tf.train.Optimizer.minimize()` for more details.
    optimizer = tf.keras.optimizers.legacy.Adagrad()

    # Compute the similarity between minibatch examples and all embeddings.
    # We use the cosine distance:
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))
    print("Shapes of tensors similarity, embeddings, norm, normalized_embeddings, valid_embeddings")
    print(similarity)
    print(embeddings)
    print(norm)
    print(normalized_embeddings)
    print(valid_embeddings)

Shapes of tensors similarity, embeddings, norm, normalized_embeddings, valid_embeddings
Tensor("MatMul_2:0", shape=(16, 500000), dtype=float32, device=/device:CPU:0)
<tf.Variable 'Variable_6:0' shape=(500000, 128) dtype=float32>
Tensor("Sqrt_2:0", shape=(500000, 1), dtype=float32, device=/device:CPU:0)
Tensor("truediv_2:0", shape=(500000, 128), dtype=float32, device=/device:CPU:0)
Tensor("embedding_lookup_5/Identity:0", shape=(16, 128), dtype=float32, device=/device:CPU:0)


In [25]:
num_steps = 100001

In [26]:
with tf.compat.v1.Session(graph=graph) as session:
    tf.compat.v1.global_variables_initializer().run()
    print('Initialized')
    average_loss = 0
    for step in range(num_steps):
        batch_data, batch_labels = generate_batch(
              batch_size, num_skips, skip_window)
        feed_dict = {train_dataset : batch_data, train_labels : batch_labels}
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += l
        if step % 2000 == 0:
            if step > 0:
                average_loss = average_loss / 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
                print('Average loss at step %d: %f' % (step, average_loss))
                average_loss = 0
        # note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8 # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k+1]
                log = 'Nearest to %s:' % valid_word
                print(nearest)
                print(len(reverse_dictionary))
                for k in range(top_k):
                    a = nearest[k]
                    print(a)
                    close_word = reverse_dictionary[nearest[k]]
                    print(close_word)
                    log = '%s %s,' % (log, close_word)
                    print(log)
        final_embeddings = normalized_embeddings.eval()

Initialized


TypeError: Argument `fetch` = <keras.optimizers.legacy.adagrad.Adagrad object at 0x16fa31ac0> has invalid type "Adagrad" must be a string or Tensor. (Can not convert a Adagrad into a Tensor or Operation.)