In [None]:
import pandas as pd
import numpy as np
import pymorphy2
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [None]:
nltk.download('punkt')

In [None]:
# Load the dataset and turn the boolean `hasBadWords` flag into a 0/1 `labels`
# column; the `violation` column is dropped.
data = pd.read_json('../datasets/dataset.json')
data['hasBadWords'] = data['hasBadWords'].map(lambda flag: int(flag == True))
data = data.rename(columns={"hasBadWords": "labels"}).drop(columns=['violation'])
data.shape

In [None]:
data.head()

# Tokenization

In [None]:
text = data.iloc[10]['text']

In [None]:
text

In [None]:
tokens = word_tokenize(text.lower())

In [None]:
tokens

# Punctuation filtering

In [None]:
punctuation_marks = ['!', ',', ';', '(', ')', ':', '-', '?', 
                     '.', '..', '...', '<', '>', '=', '\'\'',
                     '\"\"', '</', '&', '/', '#', '\'']

In [None]:
# Keep only the tokens that are not listed as punctuation marks.
only_words = [tok for tok in tokens if tok not in punctuation_marks]

In [None]:
only_words

# Word normalization (lemmatization)

In [None]:
morph = pymorphy2.MorphAnalyzer(lang='uk')

In [None]:
# Replace every remaining token with the normal form of its first parse.
lemmas = [morph.parse(word)[0].normal_form for word in only_words]

In [None]:
lemmas

# Remove stop words

In [None]:
nltk.download('stopwords')

In [None]:
stop_words = stopwords.words('english')
# stop_words

In [None]:
# Drop the lemmas that appear in the stop-word list.
filtered_words = [lemma for lemma in lemmas if lemma not in stop_words]

# Preprocessing function

In [None]:
def preprocess(text, stop_words, punctuation_marks, morph):
    """Tokenize, filter and lemmatize a single text.

    The text is lower-cased and split with ``word_tokenize``. Tokens listed in
    ``punctuation_marks`` are skipped; every other token is replaced by the
    ``normal_form`` of its first ``morph.parse`` result, and that lemma is kept
    unless it is listed in ``stop_words``.

    Args:
        text: raw input string.
        stop_words: iterable of lemmas to discard (compared after lemmatization).
        punctuation_marks: iterable of tokens to discard (compared before
            lemmatization).
        morph: analyzer exposing ``parse(token)``, whose first result has a
            ``normal_form`` attribute.

    Returns:
        list of lemmas in their original order.
    """
    # Hash-based containers make each membership test O(1) instead of a linear
    # scan of the list for every token; frozenset() of a frozenset is free, so
    # callers may pass one in to skip the conversion entirely.
    stop_set = frozenset(stop_words)
    punctuation_set = frozenset(punctuation_marks)

    preprocessed_text = []
    for token in word_tokenize(text.lower()):
        if token in punctuation_set:
            continue
        lemma = morph.parse(token)[0].normal_form
        if lemma not in stop_set:
            preprocessed_text.append(lemma)
    return preprocessed_text

# Tokens treated as punctuation and removed before lemmatization.
punctuation_marks = [
    '!', ',', ';', '(', ')', ':', '-', '--', '', '?',
    '.', '..', '...', '<', '>', '=', "''",
    '""', '</', '&', '/', '#', "'", '*', '``', '%', '[', ']', '{', '}',
]

# English stop words plus HTML/markup leftovers found in the texts.
stop_words = stopwords.words('english')
stop_words.extend([
    'div', '/div', '/p', 'p', 'a', '/a', 'td', 'target=',
    'nbsp', 'img', 'src=', 'align=', "'s", '/center',
    'http', 'br', 'center', 'href=', '/td', 'width=',
    'style=', 'font', '/font', 'border=', 'class=', 'span',
    '/span', '_blank', 'height=', '0', 'b', '/b', '/tr', 'tr',
    'size=', '/strong', "class='link", 'font-size',
    'name=', 'helvetica', '/table', '//www.clips4sale.com/',
    "src='https", 'id=', 'bgcolor=', 'text/javascript',
    "align='center", 'data-placeholder=', 'clips4sale',
    "'https", '80', 'http-equiv=', 'https', 'face', 'color=',
])

# Morphological analyzer used to obtain the normal form of each token.
morph = pymorphy2.MorphAnalyzer(lang='uk')

In [None]:
# NOTE(review): `remove_html_tags` is not defined in any cell shown in this
# notebook, so on a fresh kernel this line raises NameError unless the helper
# is defined or imported elsewhere — confirm before relying on this cell.
data[:1000].apply(lambda row: remove_html_tags(row.text), axis=1)

In [None]:
# Preview the preprocessing on the first 1000 rows. The filter lists are passed
# by keyword because preprocess() expects (text, stop_words, punctuation_marks,
# morph); passing them positionally in the other order swaps their roles.
data[:1000].apply(
    lambda row: preprocess(
        row.text,
        stop_words=stop_words,
        punctuation_marks=punctuation_marks,
        morph=morph,
    ),
    axis=1,
)

In [None]:
# NOTE(review): `remove_html_tags` is not defined in any cell shown in this
# notebook (NameError on a fresh kernel unless it comes from elsewhere).
# The next cell assigns `data['text_preprocessed']` again from the raw
# `row.text`, so whatever is stored here gets overwritten — confirm intent.
data['text_preprocessed'] = data.apply(lambda row: remove_html_tags(row.text), axis=1)

In [None]:
# Store the token list of every row. The filter lists are passed by keyword
# because preprocess() expects (text, stop_words, punctuation_marks, morph);
# passing them positionally in the other order swaps their roles.
data['text_preprocessed'] = data.apply(
    lambda row: preprocess(
        row.text,
        stop_words=stop_words,
        punctuation_marks=punctuation_marks,
        morph=morph,
    ),
    axis=1,
)

In [None]:
data = data[['text_preprocessed', 'labels', 'text']] # columns reorder

In [None]:
data.head()

# Simple vectorization

In [None]:
from collections import Counter

In [None]:
test_counter = Counter(data['text_preprocessed'][10])
test_counter

In [None]:
test_counter.most_common(10)

Count word frequency

In [None]:
words = Counter()

In [None]:
for text in data['text_preprocessed']:
    words.update(text)

In [None]:
len(words)

In [None]:
words.most_common(10)

In [None]:
# Import only the class that is used; a star import hides where names come from
# and can silently shadow existing notebook variables.
from wordcloud import WordCloud

# most_common() already returns a list of (word, count) pairs.
word_freq = words.most_common(100)

# Draw the 100 most frequent words, sized by their frequency.
wd = WordCloud(background_color='white')
wd.generate_from_frequencies(dict(word_freq))
plt.figure()
plt.imshow(wd, interpolation='bilinear')
plt.axis('off')
plt.show()

# Create dictionary ordered by frequency

Indices start from 2 because the first two codes are reserved:

* 0 - filler (padding) code
* 1 - unknown word

In [None]:
word_to_index = dict()
index_to_word = dict()

In [None]:
max_words = 1000

In [None]:
# Number the most frequent words starting at 2 (codes 0 and 1 are reserved)
# and fill both lookup directions.
for position, (term, _count) in enumerate(words.most_common(max_words - 2), start=2):
    word_to_index[term] = position
    index_to_word[position] = term

In [None]:
word_to_index

In [None]:
index_to_word

In [None]:
def text_to_sequence(txt, word_to_index):
    """Encode a tokenized text as a list of vocabulary indices.

    Each token of ``txt`` is looked up in ``word_to_index``. Tokens missing
    from the mapping resolve to 1 (the unknown-word code) and are left out of
    the result, as is any token explicitly mapped to 1.
    """
    unknown = 1  # code of an out-of-vocabulary word
    codes = (word_to_index.get(token, unknown) for token in txt)
    # Unknown words are not included in the output sequence.
    return [code for code in codes if code != unknown]

In [None]:
txt = data['text_preprocessed'][10]

In [None]:
seq = [word_to_index.get(word, 1) for word in txt]

In [None]:
seq

In [None]:
index_to_word[16]

In [None]:
# Encode every preprocessed token list as a sequence of vocabulary indices.
data['sequences'] = data['text_preprocessed'].apply(
    lambda token_list: text_to_sequence(token_list, word_to_index)
)
# columns reorder
column_order = ['text_preprocessed', 'sequences', 'labels', 'text']
data = data[column_order]

In [None]:
data.head()

# Make the bag of words

In [None]:
def vectorize_sequences(sequences, dimension=10000):
    """Build a bag-of-words count matrix from index sequences.

    Returns a float array of shape ``(len(sequences), dimension)`` in which
    cell ``[i, j]`` holds how many times index ``j`` occurs in the i-th
    sequence.
    """
    bag = np.zeros((len(sequences), dimension))
    # Each `row` is a view into `bag`, so incrementing it updates the matrix.
    for row, sequence in zip(bag, sequences):
        for word_index in sequence:
            row[word_index] += 1.
    return bag

In [None]:
vactors = vectorize_sequences(data['sequences'], max_words)

In [None]:
vactors[0][:100]

In [None]:
vactors[0]

# Word2Vec

In [None]:
import gensim.downloader

In [None]:
word2vec_eng = gensim.downloader.load('word2vec-google-news-300')

In [None]:
word2vec_eng['fuck']

In [None]:
word2vec_eng['sex']

# Models list in Gensim

In [None]:
list(gensim.downloader.info()['models'].keys())

# FastText

In [None]:
import fasttext.util

In [None]:
fasttext.FastText.eprint = lambda x: None

In [None]:
# fasttext.util.download_model('en', if_exists='ignore')

In [None]:
# fasttext.util.download_model('ru', if_exists='ignore')

In [None]:
ft = fasttext.load_model('../datasets/crawl-300d-2M-subword/crawl-300d-2M-subword.bin')

In [None]:
ft.get_word_vector('rip')

In [None]:
%%time
ft.get_nearest_neighbors('dog')

In [None]:
# Train unsupervised fastText embeddings on the text file using the skip-gram
# model (the original note names 'cbow' as the alternative model).
model = fasttext.train_unsupervised('../datasets/data.txt',  model='skipgram')

In [None]:
model.get_word_vector("ki11er")

In [None]:
model.get_nearest_neighbors('ki11er')

In [None]:
from gensim.models import FastText
# Toy corpus: two already-tokenized sentences.
sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
# NOTE(review): this rebinds `model`, which an earlier cell bound to the model
# returned by fasttext.train_unsupervised; re-running those earlier cells out
# of order afterwards would hit this gensim object instead.
model = FastText(sentences, min_count=1)
say_vector = model.wv['say']  # get vector for word
of_vector = model.wv['of']  # get vector for out-of-vocab word

In [None]:
say_vector

In [None]:
of_vector