Skip to content

Commit

Permalink
Added blocklist extractor
Browse files Browse the repository at this point in the history
  • Loading branch information
André Mourão committed Sep 23, 2020
1 parent 061f2fc commit 47b0859
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 8 deletions.
23 changes: 16 additions & 7 deletions extractall_base64_mt.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
from classifier_nsfw import ClassifierNSFW
from classifier_base import ClassifierBase

from metaclassifier_blocked import MetaClassifierBlocked

import logging

# get TF logger
Expand All @@ -27,7 +29,7 @@

batch_queue = queue.Queue()

def my_service(image_path, model, batch_size):
def my_service(image_path, model, metamodels, batch_size):
image_paths = []
image_ids = []
stored_lines = OrderedDict()
Expand Down Expand Up @@ -57,17 +59,19 @@ def my_service(image_path, model, batch_size):
j += 1
if not image_id in stored_lines:
stored_lines[image_id] = []
for metamodel in metamodels:
line = metamodel.classify(line)
stored_lines[image_id].append(line)
if len(stored_lines) > 0:
processed_images, failed, duplicates = model.load_images(image_paths,True)
batch_queue.put( (processed_images, failed, duplicates, image_ids, stored_lines) )
batch_queue.put( None )

def parse_file(image_path, model, batch_size):
def parse_file(image_path, model, metamodels, batch_size):
t0 = time.time()
count = 0
j = 0
t = threading.Thread(name='my_service', target=my_service, args=(image_path, model, batch_size))
t = threading.Thread(name='my_service', target=my_service, args=(image_path, model, metamodels, batch_size))
t.start()
with open(image_path + "_pages.jsonl", "w") as outP:
with open(image_path + "_images.jsonl", "w") as outI:
Expand Down Expand Up @@ -97,11 +101,11 @@ def parse_file(image_path, model, batch_size):
t.join()


def run_batched_images(models, images, batch_size):
def run_batched_images(models, metamodels, images, batch_size):
model = models[0]
for image_path in images:
if image_path.endswith(".jsonl"):
parse_file(image_path, model, batch_size)
parse_file(image_path, model, metamodels, batch_size)


def main(args=None):
Expand All @@ -116,6 +120,9 @@ def main(args=None):
submain.add_argument('--image_source', dest='image_source', type=str, required=True,
help='A directory of json images')

submain.add_argument('--image_block_list', dest='image_block_list', type=str, required=True,
help='URL to CSV with block list')

submain.add_argument('--batch_size', dest='batch_size', type=int, required=True,
help='Keras batch size')

Expand All @@ -124,11 +131,13 @@ def main(args=None):
else:
config = vars(parser.parse_args())

if config['image_source'] is None or not exists(config['image_source']):
# Reject a missing or non-existent source path before any model is loaded.
# exists() must be given the path to test; calling it with no argument
# raises TypeError instead of performing the check.
if config['image_source'] is None or not exists(config['image_source']):
    raise ValueError("image_source must be a valid directory with images or a single image to classify.")

models = [ClassifierNSFW("/mobilenet_v2_140_224")]
image_preds = run_batched_images(models, [os.path.join(config['image_source'], f) for f in os.listdir(config['image_source'])], config['batch_size'])

metamodels = [MetaClassifierBlocked(config['image_block_list'])]
image_preds = run_batched_images(models, metamodels, [os.path.join(config['image_source'], f) for f in os.listdir(config['image_source'])], config['batch_size'])


if __name__ == "__main__":
Expand Down
25 changes: 25 additions & 0 deletions metaclassifier_blocked.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import csv
import requests
import re


class MetaClassifierBlocked():
    """Mark records whose URL matches an entry of a downloaded block list.

    The block list is a CSV document fetched over HTTP. The first column of
    every row is compiled as a regular expression; any further columns are
    ignored.
    """

    # Seconds to wait for the block-list server before giving up, so a
    # stalled connection cannot hang start-up forever.
    DOWNLOAD_TIMEOUT = 30

    def __init__(self, block_list):
        """Download the CSV at URL *block_list* and compile its patterns.

        Raises:
            requests.RequestException: the download failed, timed out, or
                returned an HTTP error status.
            re.error: a row holds an invalid regular expression.
        """
        download = requests.get(block_list, timeout=self.DOWNLOAD_TIMEOUT)
        # Fail loudly instead of parsing an HTTP error page as a block list.
        download.raise_for_status()
        decoded_content = download.content.decode('utf-8')
        cr = csv.reader(decoded_content.splitlines(), delimiter=',')
        self.regexes = self._compile_rows(cr)

    @staticmethod
    def _compile_rows(rows):
        """Compile the first cell of each CSV row into a regex, skipping blanks."""
        regexes = []
        for row in rows:
            # csv.reader yields [] for blank lines; there is no cell to read.
            if not row:
                continue
            pattern = row[0].strip()
            # Drop a single trailing slash so "site.com/" also matches
            # "site.com" followed by anything else.
            if pattern.endswith("/"):
                pattern = pattern[:-1]
            # An empty pattern matches every string and would therefore
            # block every record, so it is never compiled.
            if not pattern:
                continue
            regexes.append(re.compile(pattern))
        return regexes

    def classify(self, jsonline):
        """Set ``jsonline['blocked'] = 1`` when its URL matches the block list.

        The record is modified in place and also returned. Records with a
        missing or empty ``'url'`` are returned unchanged.
        """
        url = jsonline.get('url')
        if not url:
            return jsonline
        if any(regex.search(url) for regex in self.regexes):
            jsonline['blocked'] = 1
        return jsonline
4 changes: 3 additions & 1 deletion service_extractall.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pika

from classifier_nsfw import ClassifierNSFW
from metaclassifier_blocked import MetaClassifierBlocked
import extractall_base64_mt


Expand All @@ -18,6 +19,7 @@
HOST_PATH="/data/images/pipe"

model = ClassifierNSFW("/mobilenet_v2_140_224")
metamodels = [MetaClassifierBlocked("https://docs.google.com/spreadsheets/d/1PM4evPp8_v46N_Rd0Klsv8uFiKZGC5cxu1NCJxFhKFI/export?format=csv&id=1PM4evPp8_v46N_Rd0Klsv8uFiKZGC5cxu1NCJxFhKFI&gid=0")]

def on_message(ch, method, properties, body):
print(" [x] Received %r" % body)
Expand All @@ -33,7 +35,7 @@ def on_message(ch, method, properties, body):
p = subprocess.run(HDFS_COMMAND.format(body, HADOOP_PATH, FILENAME).split(" "))

image_path = "{}/{}".format(HADOOP_PATH, FILENAME)
extractall_base64_mt.parse_file(image_path, model, BATCH_SIZE)
extractall_base64_mt.parse_file(image_path, model, metamodels, BATCH_SIZE)
nsfw_image_path = "{}/{}_pages.jsonl".format(HOST_PATH, FILENAME)

#result = "{},{},{}".format(nsfw_image_path)
Expand Down

0 comments on commit 47b0859

Please sign in to comment.