### This script is part of the FLUX Toolchain project.
### Copyright (C) Adrien Barbaresi, 2012.
### The FLUX Toolchain is freely available under the GNU GPL v3 license.
from __future__ import print_function
from __future__ import division
from urlparse import urlparse
import re
import optparse
import sys
## split lines of the kind '.htmlhttp://'
## more banned hostnames (Alexa list)
## english link text
#### spamdict problem
#### check options
# clean <> and {} ?
# Parse arguments and options
parser = optparse.OptionParser(usage='usage: %prog [options] arguments')
parser.add_option("-i", "--input-file", dest="inputfile", help="input file name", metavar="FILE")
parser.add_option("-o", "--output-file", dest="outputfile", help="output file name", metavar="FILE")
parser.add_option("-l", "--spamlist-file", dest="spamlistfile", help="name of the spamlist file (containing domain names)", metavar="FILE")
parser.add_option("-s", "--spam-urls-file", dest="spamurls", help="name of the file to write the spam urls", metavar="FILE")
parser.add_option("--adult-filter", dest="adultfilter", default=False, action="store_true", help="basic adult filter (not always useful)")
parser.add_option("-p", "--path", dest="path", help="path to the files")
options, args = parser.parse_args()
if options.inputfile is None or options.outputfile is None:
    parser.error('input AND output file mandatory (-h or --help for more information).')
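# Example invocation (sketch only: the script name and file names below are
# hypothetical, adjust them to the actual checkout):
#   python clean_urls.py -i raw-urls.txt -o filtered-urls.txt \
#       -l spam-domains.txt -s spam-urls.txt --adult-filter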
# Main regexes : media filters
# avoid getting trapped
protocol = re.compile(r'^http', re.IGNORECASE)
# obvious extensions
extensions = re.compile(r'\.atom$|\.json$|\.css$|\.xml$|\.js$|\.jpg$|\.jpeg$|\.png$|\.gif$|\.tiff$|\.pdf$|\.ogg$|\.mp3$|\.m4a$|\.aac$|\.avi$|\.mp4$|\.mov$|\.webm$|\.flv$|\.ico$|\.pls$|\.zip$|\.tar$|\.gz$|\.iso$|\.swf$', re.IGNORECASE)
# frequent media query schemes, just in case
mediaquery = re.compile(r'\.jpg[&?]|\.jpeg[&?]|\.png[&?]|\.gif[&?]|\.pdf[&?]|\.ogg[&?]|\.mp3[&?]|\.avi[&?]|\.mp4[&?]', re.IGNORECASE)
# avoid these websites
hostnames_filter = re.compile(r'last\.fm|soundcloud\.com|youtube\.com|youtu\.be|vimeo\.com|instagr\.am|instagram\.com|imgur\.com|flickr\.com|google\.|twitter\.com|twitpic\.com|gravatar\.com|akamai\.net|amazon\.com|cloudfront\.com', re.IGNORECASE)
notsuited = re.compile(r'^http://add?s?\.|^http://banner\.|doubleclick|tradedoubler\.com|livestream|live\.|videos?\.|feed$|rss$', re.IGNORECASE)
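# Rough illustration of what the filters above catch (hypothetical URLs):
#   http://example.org/cover.jpg        -> dropped by 'extensions'
#   http://example.org/cover.jpg?w=100  -> dropped by 'mediaquery'
#   http://www.youtube.com/watch?v=xyz  -> dropped by 'hostnames_filter'
#   http://ads.example.org/banner.html  -> dropped by 'notsuited'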
# Open and load spam-list file, if there is one
if options.spamlistfile is not None:
    try:
        filename = options.spamlistfile
        if options.path is not None:
            filename = options.path + filename
        spamlistfile = open(filename, 'r')
        spamset = set()
        # there should be domain names in the file
        for domain in spamlistfile:
            domain = domain.rstrip()
            spamset.add(domain)
        spamlistfile.close()
        # '{} format' not supported before Python 2.7
        try:
            print('Length of the spam list: {:,}'.format(len(spamset)))
        except ValueError:
            print('Length of the spam list:', len(spamset))
    except IOError:
        print('Could not open the file containing the spam reference list:', options.spamlistfile, '\nThe URLs will not be checked for spam.')
else:
    print('No spam reference list given, the URLs will not be checked for spam.')
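# The spam list is expected to hold one domain name per line, each matching
# the 'netloc' part returned by urlparse, e.g. (hypothetical entries):
#   spam-domain.example
#   tracker.spam-domain.example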
# Open source and destination files
try:
    filename = options.inputfile
    if options.path is not None:
        filename = options.path + filename
    sourcefile = open(filename, 'r')
except IOError:
    sys.exit("Could not open the input file.")
# fall-back if there is nowhere to write the urls seen as spam
if options.spamurls is None:
    options.spamurls = options.inputfile + '_spam-detected-urls'
    print('No file name given for the urls classified as spam, defaulting to', options.spamurls)
# write/append to files
def append_to_file(filename, listname):
    if options.path:
        filename = options.path + filename
    try:
        out = open(filename, 'a')
    except IOError:
        sys.exit('Could not open output file: ' + filename)
    for link in listname:
        out.write(str(link) + "\n")
    out.close()
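# Usage sketch (hypothetical file name): append_to_file('filtered-urls.txt', nonspam)
# appends each URL on its own line; the --path prefix is applied if given.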
total_urls = 0
dropped_urls = 0
nonspam, spamurls = (list() for i in range(2))
for line in sourcefile:
    total_urls += 1
    line = line.rstrip()
    candidates = list()
    # clean the input string (str.replace does not take regexes, re.sub does)
    line = re.sub(r'[ \t]+', '', line)
    # split lines of the kind '.htmlhttp://': keep both URLs as candidates
    match ='^http.+?(https?://.+?$)', line)
    if match:
        candidates.append(line[:match.start(1)])
        # the glued-on part may itself hide a third URL
        match2 ='^http.+?(https?://.+?$)',
        if match2:
            candidates.append([:match2.start(1)])
            candidates.append(
        else:
            candidates.append(
    else:
        candidates.append(line)
    for candidate in candidates:
        passing_test = True
        # regexes tests : a bit heavy...
        ## check HTTP and length
        if not or len(candidate) < 11:
            passing_test = False
        # lower case
        candidate = candidate.lower()
        ## compiled filters
        if or or or
            passing_test = False
        # https
        candidate = re.sub(r'^https', 'http', candidate)
        # domain spam check
        if 'spamset' in globals():
            try:
                domain = urlparse(candidate).netloc
                if domain in spamset:
                    passing_test = False
            except ValueError:
                passing_test = False
        ## (basic) adult spam filter
        if options.adultfilter is True:
            #if'[\./]sex|[\./-](adult|porno?|cash|xxx|fuck)', candidate) or'(sex|adult|porno?|cams|cash|xxx|fuck)[\./-]', candidate) or'gangbang|incest', candidate) or'[\./-](ass|sex)[\./-]', candidate):
            if'[\./_-](porno?|xxx)', candidate, re.IGNORECASE) or'(cams|cash|porno?|sex|xxx)[\./_-]', candidate, re.IGNORECASE) or'gangbang|incest', candidate, re.IGNORECASE) or'[\./_-](adult|ass|sex)[\./_-]', candidate, re.IGNORECASE):
                passing_test = False
        if passing_test is True:
            nonspam.append(candidate)
        else:
            spamurls.append(candidate)
            dropped_urls += 1
    # regularly check if the lists don't become too long
    if total_urls % 1000 == 0:
        if len(nonspam) > 10000 or len(spamurls) > 10000:
            append_to_file(options.outputfile, nonspam)
            append_to_file(options.spamurls, spamurls)
            nonspam, spamurls = (list() for i in range(2))
# print the rest
append_to_file(options.outputfile, nonspam)
append_to_file(options.spamurls, spamurls)
# print final results
# '{} format' not supported before Python 2.7
try:
    print('Total URLs seen: {:,}'.format(total_urls))
    print('Total URLs dropped: {:,}'.format(dropped_urls))
    print('Ratio: {0:.2f}'.format((dropped_urls / total_urls) * 100), '%')
except ValueError:
    print('Total URLs seen:', total_urls)
    print('Total URLs dropped:', dropped_urls)
    print('Ratio:', (dropped_urls / total_urls) * 100, '%')