In [1]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

2023-03-31 13:50:04.689492: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df = pd.read_json('../datasets/dataset.json')
df.rename(columns={"hasBadWords": "labels"}, inplace=True)
mapping = {False: 0.0, True: 1.0}
df.replace({'labels': mapping}, inplace=True)
df.drop(['violation', 'labels'], axis=1, inplace=True)

In [3]:
df.shape

(86439, 1)

In [4]:
df.head()

Unnamed: 0,text
0,My Favorite Slut
1,girlfriends sit on each other's faces with the...
2,bound beauty kisses her girlfriend
3,MORGAN - Anytime - Nail Painting On The Slave'...
4,TRANSGENDER COACHING (wmv) PART 1


In [5]:
df = df[df['text'].str.contains('<div' or '<p' or 'p>' or '<b'or '<br>' or '&nbsp' or '=') == False]

In [6]:
df

Unnamed: 0,text
0,My Favorite Slut
1,girlfriends sit on each other's faces with the...
2,bound beauty kisses her girlfriend
3,MORGAN - Anytime - Nail Painting On The Slave'...
4,TRANSGENDER COACHING (wmv) PART 1
...,...
83594,"ebony,hotwife,wife,swinger,cuckold,bigass"
83595,"ssbhm, bhm, ffa, female fat admire, fat admire..."
83596,"Feet in heels, sexy shoes, high heels, high he..."
83597,"foot fetish, breeding, kinky, fetish porn, bon..."


In [7]:
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

In [8]:
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['text'])

In [9]:
t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 3.93 mins


In [10]:
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(52201, 1)

In [11]:
from gensim.models.phrases import Phrases, Phraser

In [12]:
sent = [row.split() for row in df_clean['clean']]

In [13]:
phrases = Phrases(sent, min_count=1, progress_per=10000)

INFO - 13:54:22: collecting all words and their counts
INFO - 13:54:22: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 13:54:22: PROGRESS: at sentence #10000, processed 328890 words and 156603 word types
INFO - 13:54:23: PROGRESS: at sentence #20000, processed 465581 words and 211672 word types
INFO - 13:54:23: PROGRESS: at sentence #30000, processed 954008 words and 373780 word types
INFO - 13:54:23: PROGRESS: at sentence #40000, processed 1064330 words and 401603 word types
INFO - 13:54:24: PROGRESS: at sentence #50000, processed 1665344 words and 534786 word types
INFO - 13:54:24: collected 542508 token types (unigram + bigrams) from a corpus of 1687878 words and 52201 sentences
INFO - 13:54:24: merged Phrases<542508 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000>
INFO - 13:54:24: Phrases lifecycle event {'msg': 'built Phrases<542508 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000> in 2.05s', 'datetime': '2023-03-31T13:54:24.494204', 'ge

In [14]:
bigram = Phraser(phrases)

INFO - 13:54:24: exporting phrases from Phrases<542508 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000>
INFO - 13:54:25: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<62104 phrases, min_count=1, threshold=10.0> from Phrases<542508 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000> in 1.06s', 'datetime': '2023-03-31T13:54:25.572638', 'gensim': '4.3.0', 'python': '3.10.10 (main, Feb 16 2023, 02:55:02) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-13.2.1-x86_64-i386-64bit', 'event': 'created'}


In [15]:
sentences = bigram[sent]

In [16]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

84423

In [17]:
sorted(word_freq, key=word_freq.get, reverse=True)[:100]

['p',
 'foot',
 'fetish',
 'ass',
 'girl',
 'nbsp',
 'strong',
 'clip',
 'worship',
 'video',
 'big',
 'br',
 'cum',
 'span',
 'cock',
 'bondage',
 'sexy',
 'crush',
 'domination',
 'sale_com',
 'slave',
 'pussy',
 'href_http',
 'femdom',
 'www_clip',
 'face',
 'humiliation',
 'fuck',
 'toe',
 'leg',
 'sex',
 'fart',
 'want',
 'woman',
 'sol',
 'shoe',
 'black',
 'img_src',
 'clip_sale',
 'high_heel',
 'lick',
 'boot',
 'time',
 'suck',
 'amateur',
 'pov',
 'play',
 'love',
 'href_https',
 'like',
 'span_style',
 'com',
 'center',
 'hot',
 'milf',
 'female_domination',
 'mistress',
 'pantyhose',
 'tease',
 'get',
 'align_center',
 'mouth',
 'hard',
 'balloon',
 'heel',
 'tit',
 'studio',
 'hand',
 'know',
 'br_br',
 'target_blank',
 'go',
 'tickle',
 'anal',
 'gag',
 'sock',
 'bdsm',
 'fucking',
 'lesbian',
 'goddess',
 'trample',
 'store',
 'b',
 'good',
 'http_www',
 'smell',
 'spank',
 'watch',
 'dildo',
 'man',
 'body',
 'bbw',
 'masturbation',
 'long',
 'mp',
 'female',
 'start',


In [18]:
sentences

<gensim.interfaces.TransformedCorpus at 0x140254970>

In [19]:
# from wordcloud import *
# word_freq = [i for i in counter.most_common(50)]
# wd = WordCloud(background_color='white')
# wd.generate_from_frequencies(dict(word_freq))
# plt.figure()
# plt.imshow(wd, interpolation = 'bilinear')
# plt.axis('off')
# plt.show()

In [20]:
import multiprocessing

from gensim.models import Word2Vec

In [21]:
cores = multiprocessing.cpu_count()

In [22]:
w2v_model = Word2Vec(min_count=1,
                     window=2,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

INFO - 13:54:27: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=100, alpha=0.03>', 'datetime': '2023-03-31T13:54:27.231352', 'gensim': '4.3.0', 'python': '3.10.10 (main, Feb 16 2023, 02:55:02) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-13.2.1-x86_64-i386-64bit', 'event': 'created'}


In [23]:
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 13:54:27: collecting all words and their counts
INFO - 13:54:27: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 13:54:27: PROGRESS: at sentence #10000, processed 259578 words, keeping 27856 word types
INFO - 13:54:27: PROGRESS: at sentence #20000, processed 368880 words, keeping 36517 word types
INFO - 13:54:28: PROGRESS: at sentence #30000, processed 757011 words, keeping 54853 word types
INFO - 13:54:28: PROGRESS: at sentence #40000, processed 845252 words, keeping 58508 word types
INFO - 13:54:28: PROGRESS: at sentence #50000, processed 1276701 words, keeping 83664 word types
INFO - 13:54:28: collected 84423 word types from a corpus of 1296037 raw words and 52201 sentences
INFO - 13:54:28: Creating a fresh vocabulary
INFO - 13:54:29: Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 84423 unique words (100.00% of original 84423, drops 0)', 'datetime': '2023-03-31T13:54:29.149409', 'gensim': '4.3.0', 'python': '3.10.10 (main, Feb 16 2023

Time to build vocab: 0.05 mins


In [24]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 13:54:30: Word2Vec lifecycle event {'msg': 'training model with 3 workers on 84423 vocabulary and 100 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2 shrink_windows=True', 'datetime': '2023-03-31T13:54:30.350702', 'gensim': '4.3.0', 'python': '3.10.10 (main, Feb 16 2023, 02:55:02) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-13.2.1-x86_64-i386-64bit', 'event': 'train'}
INFO - 13:54:31: EPOCH 0 - PROGRESS: at 39.97% examples, 225196 words/s, in_qsize 5, out_qsize 0
INFO - 13:54:32: EPOCH 0 - PROGRESS: at 82.52% examples, 244225 words/s, in_qsize 5, out_qsize 0
INFO - 13:54:33: EPOCH 0: training on 1296037 raw words (757404 effective words) took 2.9s, 265205 effective words/s
INFO - 13:54:34: EPOCH 1 - PROGRESS: at 37.29% examples, 201541 words/s, in_qsize 5, out_qsize 0
INFO - 13:54:35: EPOCH 1 - PROGRESS: at 54.55% examples, 212176 words/s, in_qsize 4, out_qsize 2
INFO - 13:54:36: EPOCH 1 - PROGRESS: at 91.47% examples, 236452 words/s, in_qsize 5, out_qs

INFO - 13:55:39: EPOCH 22: training on 1296037 raw words (757715 effective words) took 2.9s, 262731 effective words/s
INFO - 13:55:40: EPOCH 23 - PROGRESS: at 39.97% examples, 240237 words/s, in_qsize 6, out_qsize 0
INFO - 13:55:41: EPOCH 23 - PROGRESS: at 81.22% examples, 246397 words/s, in_qsize 5, out_qsize 0
INFO - 13:55:42: EPOCH 23: training on 1296037 raw words (757788 effective words) took 2.9s, 261475 effective words/s
INFO - 13:55:43: EPOCH 24 - PROGRESS: at 38.09% examples, 208130 words/s, in_qsize 0, out_qsize 1
INFO - 13:55:45: EPOCH 24 - PROGRESS: at 58.26% examples, 201975 words/s, in_qsize 4, out_qsize 0
INFO - 13:55:46: EPOCH 24 - PROGRESS: at 91.55% examples, 221387 words/s, in_qsize 5, out_qsize 0
INFO - 13:55:46: EPOCH 24: training on 1296037 raw words (757714 effective words) took 3.3s, 228148 effective words/s
INFO - 13:55:47: EPOCH 25 - PROGRESS: at 38.54% examples, 215204 words/s, in_qsize 4, out_qsize 1
INFO - 13:55:48: EPOCH 25 - PROGRESS: at 60.62% examples, 

Time to train the model: 1.52 mins


In [25]:
w2v_model.init_sims(replace=True)

  w2v_model.init_sims(replace=True)


In [26]:
w2v_model.wv.most_similar(positive=["slut"])

[('whore', 0.7729179859161377),
 ('couple_maledom', 0.7516284584999084),
 ('wife_hitchhiker', 0.7387457489967346),
 ('escort', 0.7235120534896851),
 ('pimp', 0.714429497718811),
 ('cum_dumpster', 0.7086385488510132),
 ('cocksucke', 0.7047537565231323),
 ('cum_dump', 0.6967712044715881),
 ('gurl', 0.6931356191635132),
 ('hotwife', 0.691525399684906)]

In [27]:
w2v_model.wv.similarity("slut", 'prostitute')

0.578677

In [33]:
w2v_model.wv.doesnt_match(['cocksucke', 'couple_maledom', 'whore'])

'whore'