In [None]:
import os
from itertools import islice

import joblib
import numpy as np
from sklearn.metrics import recall_score, precision_score

In [2]:
import pyLDAvis
import pyLDAvis.sklearn
# Enable automatic display of pyLDAvis visualisations inside the notebook.
# NOTE(review): newer pyLDAvis releases (3.4+) renamed `pyLDAvis.sklearn` to
# `pyLDAvis.lda_model` — confirm against the installed version.
pyLDAvis.enable_notebook()

In [1]:
%load_ext autoreload
%autoreload 2
from utils.streams import document_stream, stream_cleaned_texts, reservoir_sample, get_porn_domains, stream_all_records
from utils.topic import create_topic_model, PornClassifier

Code running in a notebook, loading display tools


In [None]:
# Directory the cleaned corpus is read from (passed to the streaming helpers and get_porn_domains).
DATA_PATH = "/work/netarkivet-cleaned/"
# Directory the fitted topic model and its vectorizer are written to with joblib.
SAVE_PATH = "/work/topic_model/"

In [None]:
# Model family handed to create_topic_model; also part of the saved file names.
MODEL_TYPE = "nmf"
# Number of topics handed to create_topic_model; also part of the saved file names.
N_TOPICS = 100

In [3]:
# Stream of cleaned texts read from DATA_PATH, consumed by document_stream in the next step.
# NOTE(review): filter_porn=False presumably keeps texts from porn domains in the
# stream — confirm in utils.streams.
texts = stream_cleaned_texts(data_path=DATA_PATH, filter_porn=False)

In [None]:
# Draw a random sample of 100_000 documents from the first 5 million streamed ones.
# Sampling from a large window (rather than just taking the head of the stream)
# is meant to make sure all topics are represented, while keeping memory usage
# far below what holding every document would need.
# The optimal number of workers is unknown; 2 tends to be okay, so experiment.
document_window = islice(document_stream(texts, workers=2), 5_000_000)
documents = reservoir_sample(document_window, 100_000)

In [None]:
# Build the topic model from the sampled documents. The call yields three objects:
# the model, the document matrix and the vectorizer, which are later passed to
# pyLDAvis and saved with joblib.
# NOTE(review): max_freq / max_vocab presumably cap document frequency and
# vocabulary size in the vectorizer — confirm in utils.topic.
model, matrix, vectorizer = create_topic_model(
    documents,
    model_type=MODEL_TYPE,
    n_topics=N_TOPICS,
    max_freq=0.3,
    max_vocab=15_000
)

In [None]:
#Display topic model
pyLDAvis.sklearn.prepare(model, matrix, vectorizer, sort_topics=False)
#sort_topics=False is important: otherwise pyLDAvis displays the topics in a
#reordered sequence that doesn't correspond to the model's actual features at all

In [None]:
# Persist the fitted topic model and its vectorizer under SAVE_PATH.
# Create the directory first: joblib.dump does not create missing parent
# directories, and failing here would throw away the (expensive) fit.
os.makedirs(SAVE_PATH, exist_ok=True)
# Shared stem of both file names: "<MODEL_TYPE>_<N_TOPICS>".
model_name = f"{MODEL_TYPE}_{N_TOPICS}"
joblib.dump(model, os.path.join(SAVE_PATH, f"{model_name}.joblib"))
joblib.dump(vectorizer, os.path.join(SAVE_PATH, f"tf-idf_{model_name}.joblib"))

Testing the topic model for porn classification

In [None]:
# Number of records the classifier is evaluated on.
SAMPLE_SIZE = 200_000
# Domains used as the ground truth when labelling the sampled records.
porn_domains = get_porn_domains(DATA_PATH)
# Cap the record stream at its first 5 million entries, then draw the sample from that window.
records = islice(stream_all_records(DATA_PATH), 5_000_000)
sample = reservoir_sample(records, SAMPLE_SIZE)

In [None]:
# Build the evaluation inputs from the sampled records.
# - Iterate over `sample`, not `records`: `records` is the iterator that was
#   already fed through reservoir_sample, so looping over it again would not
#   walk the sampled records.
# - Collect the texts in a list: a float array created with np.zeros cannot
#   hold strings (assigning one raises ValueError).
# - Sizes follow the actual sample, so a sample shorter than SAMPLE_SIZE does
#   not leave zero-padded entries behind.
texts, labels = [], []
for record in sample:
    texts.append(record["text"])
    # Ground truth: a record counts as porn when its domain is in porn_domains.
    labels.append(record["domain_key"] in porn_domains)
# 0/1 label vector consumed by recall_score / precision_score.
is_porn = np.array(labels, dtype=int)

In [None]:
# Load the classifier by the same "<MODEL_TYPE>_<N_TOPICS>" name used for the saved model files.
# NOTE(review): presumably PornClassifier.load resolves this name to the joblib
# files under the topic-model directory — confirm in utils.topic.
classifier = PornClassifier.load(f"{MODEL_TYPE}_{N_TOPICS}")
# Predictions for the sampled texts; scored against is_porn with recall and precision.
predictions = classifier.predict(texts)

In [None]:
# Print recall first, then precision, of the predictions against the domain-based labels.
for metric in (recall_score, precision_score):
    print(metric(is_porn, predictions))