Skip to content

Commit

Permalink
Revamp !!/allspam command (Charcoal-SE#3)
Browse files Browse the repository at this point in the history
* Outline tasks

* Preliminary post sanity checks

* Handle allspam calls with SE user links

* Handle allspam

* Move allspam command to be near report

* Fix flake issues

* Add rate-limiting

* Remove old allspam handling code

* Fix parsing error

* Fix flake error

* Update to use new command syntax

* Fix minor bugs, add messageData.p to gitignore

* drop the arity kwarg
  • Loading branch information
angussidney authored and quartata committed Oct 15, 2017
1 parent 20df735 commit e3e5e8b
Show file tree
Hide file tree
Showing 8 changed files with 141 additions and 88 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ apiCalls.pickle
bodyfetcherQueue.p
bodyfetcherQueueTimings.p
bodyfetcherMaxIds.p
messageData.p

# Settings files
.settings/
Expand Down
155 changes: 120 additions & 35 deletions chatcommands.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,20 @@
# noinspection PyUnresolvedReferences
from datetime import datetime
from utcdate import UtcDate
from apigetpost import api_get_post
from apigetpost import api_get_post, PostData
from datahandling import *
from blacklists import load_blacklists
from metasmoke import Metasmoke
from parsing import *
from spamhandling import handle_spam
from spamhandling import handle_user_with_all_spam
from gitmanager import GitManager
import threading
from threading import Thread
import random
import requests
import os
import time
from html import unescape
# noinspection PyCompatibility
import regex
from helpers import only_blacklists_changed
Expand All @@ -28,7 +28,7 @@
# TODO: pull out code block to get user_id, chat_site, room_id into function
# TODO: Return result for all functions should be similar (tuple/named tuple?)
# TODO: Do we need uid == -2 check? Turn into "is_user_valid" check
# TODO: Consistant return structure
# TODO: Consistent return structure
# if return...else return vs if return...return


Expand Down Expand Up @@ -534,23 +534,6 @@ def alive():
'Kinda sorta'])


# noinspection PyIncorrectDocstring
@command(str, whole_msg=True, privileged=True, aliases=["reportuser"])
def allspam(msg, url):
    """
    Report a user, identified by profile URL, as having posted nothing but spam
    :param msg: The chat message that invoked the command; the names of its owner and
                room are recorded in the report reason
    :param url: A user profile URL
    :return: None
    """
    reported_user = get_user_from_url(url)
    if reported_user is None:
        raise Exception("That doesn't look like a valid user URL.")

    # Record who asked for the report and where, so it can be retrieved later as "why" data
    reporter_name = msg.owner.name
    room_name = msg.room.name
    handle_user_with_all_spam(
        reported_user,
        u"User manually reported by *{}* in room *{}*.\n".format(reporter_name, room_name))


# noinspection PyIncorrectDocstring
@command(int, privileged=True, arity=(0, 1))
def errorlogs(count):
Expand Down Expand Up @@ -1045,10 +1028,8 @@ def report(msg, urls):
raise Exception("You can execute the !!/report command again in {} seconds. "
"To avoid one user sending lots of reports in a few commands and "
"slowing SmokeDetector down due to rate-limiting, you have to "
"wait 30 seconds after you've reported multiple posts using "
"!!/report, even if your current command just has one URL. (Note "
"that this timeout won't be applied if you only used !!/report "
"for one post)".format(wait))
"wait 30 seconds after you've reported multiple posts in "
"one go.".format(wait))

output = []
urls = list(set(urls.split()))
Expand All @@ -1059,7 +1040,7 @@ def report(msg, urls):
"SmokeDetector's chat messages getting rate-limited too much, "
"which would slow down reports.")

for index, url in enumerate(urls):
for index, url in enumerate(urls, start=1):
post_data = api_get_post(url)

if post_data is None:
Expand Down Expand Up @@ -1102,6 +1083,115 @@ def report(msg, urls):
return os.linesep.join(output)


def _allspam_api_get(request_url):
    """
    Perform a single Stack Exchange API GET request while holding the shared API lock
    :param request_url: The complete API URL to fetch
    :return: The decoded JSON response
    """
    GlobalVars.api_request_lock.acquire()
    try:
        # Respect backoffs etc
        if GlobalVars.api_backoff_time > time.time():
            time.sleep(GlobalVars.api_backoff_time - time.time() + 2)
        res = requests.get(request_url).json()
        # Only ever push the backoff deadline further into the future
        if "backoff" in res:
            if GlobalVars.api_backoff_time < time.time() + res["backoff"]:
                GlobalVars.api_backoff_time = time.time() + res["backoff"]
        return res
    finally:
        # Always release, even if the request or JSON decoding raised; otherwise every
        # later API user would block forever on the lock.
        GlobalVars.api_request_lock.release()


# noinspection PyIncorrectDocstring,PyUnusedLocal
@command(str, whole_msg=True, privileged=True, aliases=['reportuser'])
def allspam(msg, url):
    """
    Reports all of a user's posts as spam
    :param msg: The chat message that invoked the command
    :param url: A user profile URL (either a site profile or a stackexchange.com network profile)
    :return: None
    :raises Exception: if the caller is rate-limited, the URL is not a user URL, the user has no
                       posts, or the user has too many accounts, too much reputation or too many posts
    """
    crn, wait = can_report_now(msg.owner.id, msg._client.host)
    if not crn:
        raise Exception("You can execute the !!/report command again in {} seconds. "
                        "To avoid one user sending lots of reports in a few commands and "
                        "slowing SmokeDetector down due to rate-limiting, you have to "
                        "wait 30 seconds after you've reported multiple posts in "
                        "one go.".format(wait))
    user = get_user_from_url(url)
    if user is None:
        raise Exception("That doesn't look like a valid user URL.")
    user_sites = []
    user_posts = []
    # Detect whether link is to network profile or site profile
    if user[1] == 'stackexchange.com':
        # Fetch sites
        api_filter = "!6Pbp)--cWmv(1"
        request_url = "http://api.stackexchange.com/2.2/users/{}/associated?filter={}&key=IAkbitmze4B8KpacUfLqkw((" \
            .format(user[0], api_filter)
        res = _allspam_api_get(request_url)
        if not res.get('items'):
            raise Exception("The specified user does not appear to exist.")
        if res['has_more']:
            raise Exception("The specified user has an abnormally high number of accounts. Please consider flagging for"
                            " moderator attention, otherwise use !!/report on the user's posts individually.")
        # Add accounts with posts
        for site in res['items']:
            if site['question_count'] > 0 or site['answer_count'] > 0:
                user_sites.append((site['user_id'], get_api_sitename_from_url(site['site_url'])))
    else:
        user_sites.append((user[0], get_api_sitename_from_url(user[1])))
    # Fetch posts
    for u_id, u_site in user_sites:
        api_filter = "!)Q4RrMH0DC96Y4g9yVzuwUrW"
        request_url = "http://api.stackexchange.com/2.2/users/{}/posts?site={}&filter={}&key=IAkbitmze4B8KpacUfLqkw((" \
            .format(u_id, u_site, api_filter)
        res = _allspam_api_get(request_url)
        if not res.get('items'):
            raise Exception("The specified user has no posts on this site.")
        posts = res['items']
        if posts[0]['owner']['reputation'] > 100:
            raise Exception("The specified user's reputation is abnormally high. Please consider flagging for moderator"
                            " attention, otherwise use !!/report on the posts individually.")
        # Add blacklisted user - use most downvoted post as post URL
        # NOTE(review): the value passed is the owner's profile link taken from the lowest-scored
        # post, not that post's own link - confirm which one add_blacklisted_user expects.
        message_url = "https://chat.{}/transcript/{}?m={}".format(msg._client.host, msg.room.id, msg.id)
        lowest_scored_post = min(posts, key=lambda p: p['score'])
        add_blacklisted_user(user, message_url, lowest_scored_post['owner']['link'])
        # TODO: Postdata refactor, figure out a better way to use apigetpost
        for post in posts:
            shortlink = url_to_shortlink(post['link'])
            post_data = PostData()
            post_data.post_id = post['post_id']
            post_data.post_url = shortlink
            # Only the trailing (site, post type) pair of the parsed shortlink is needed here
            *_, post_data.site, post_data.post_type = fetch_post_id_and_site_from_url(shortlink)
            post_data.title = unescape(post['title'])
            post_data.owner_name = unescape(post['owner']['display_name'])
            post_data.owner_url = post['owner']['link']
            post_data.owner_rep = post['owner']['reputation']
            post_data.body = post['body']
            post_data.score = post['score']
            post_data.up_vote_count = post['up_vote_count']
            post_data.down_vote_count = post['down_vote_count']
            if post_data.post_type == "answer":
                post_data.question_id = post['question_id']
                post_data.is_answer = True
            user_posts.append(post_data)
    if not user_posts:
        raise Exception("The specified user hasn't posted anything.")
    if len(user_posts) > 15:
        raise Exception("The specified user has an abnormally high number of spam posts. Please consider flagging for "
                        "moderator attention, otherwise use !!/report on the posts individually.")
    why_info = u"User manually reported by *{}* in room *{}*.\n".format(msg.owner.name, msg.room.name)
    # Handle all posts
    for index, post in enumerate(user_posts, start=1):
        batch = ""
        if len(user_posts) > 1:
            batch = " (batch report: post {} out of {})".format(index, len(user_posts))
        handle_spam(post=Post(api_response=post.as_dict),
                    reasons=["Manually reported " + post.post_type + batch],
                    why=why_info)
        time.sleep(1)  # Should this be implemented differently?
    # Reporting more than two posts in one go counts towards the shared multiple-report rate limit
    if len(user_posts) > 2:
        add_or_update_multiple_reporter(msg.owner.id, msg._client.host, time.time())


#
#
# Subcommands go below here
Expand Down Expand Up @@ -1310,19 +1400,14 @@ def why(msg):
"""
post_data = get_report_data(msg)
if not post_data:
post_data = fetch_user_from_allspam_report(msg.content)

if not post_data:
raise Exception("That's not a report.")

why = get_why_allspam(post_data)
return why if why else ""
raise Exception("That's not a report.")
else:
*post, _ = fetch_post_id_and_site_from_url(post_data[0])
why = get_why(post[1], post[0])
return why if why else ""

raise Exception("There is no `why` data for that user (anymore).")
if why:
return why
else:
raise Exception("There is no `why` data for that user (anymore).")


# noinspection PyIncorrectDocstring,PyUnusedLocal
Expand Down
20 changes: 0 additions & 20 deletions datahandling.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,6 @@ def load_files():
GlobalVars.notifications = _load_pickle("notifications.p", encoding='utf-8')
if os.path.isfile("whyData.p"):
GlobalVars.why_data = _load_pickle("whyData.p", encoding='utf-8')
if os.path.isfile("whyDataAllspam.p"):
GlobalVars.why_data_allspam = _load_pickle("whyDataAllspam.p", encoding='utf-8')
if os.path.isfile("apiCalls.p"):
GlobalVars.api_calls_per_site = _load_pickle("apiCalls.p", encoding='utf-8')
if os.path.isfile("bodyfetcherQueue.p"):
Expand Down Expand Up @@ -212,20 +210,6 @@ def filter_why(max_size=50):
GlobalVars.why_data = GlobalVars.why_data[-max_size:]


def add_why_allspam(user, why):
    """
    Remember the reason a user was reported as all-spam and persist the list to disk
    :param user: The reported user, stored as the lookup key of the entry
    :param why: The reason text to store for that user
    :return: None
    """
    entry = (user, why)
    GlobalVars.why_data_allspam.append(entry)
    # Trim the in-memory list before it is written out
    filter_why_allspam()
    with open("whyDataAllspam.p", "wb") as pickle_file:
        pickle.dump(GlobalVars.why_data_allspam, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)


def get_why_allspam(user):
    """
    Look up the stored reason for an all-spam user report
    :param user: The user to look up; compared for equality against the first item of each entry
    :return: The second item of the first matching entry, or None if no entry matches
    """
    matching_reasons = (entry[1] for entry in GlobalVars.why_data_allspam if entry[0] == user)
    return next(matching_reasons, None)


def add_post_site_id_link(post_site_id, question_id):
GlobalVars.post_site_id_to_question[post_site_id] = question_id

Expand All @@ -236,10 +220,6 @@ def get_post_site_id_link(post_site_id):
return None


def filter_why_allspam(max_size=50):
    """
    Drop the oldest all-spam "why" entries, keeping only the tail of the list
    :param max_size: Number of entries to keep, counted from the end of the list
    :return: None
    """
    most_recent = GlobalVars.why_data_allspam[-max_size:]
    GlobalVars.why_data_allspam = most_recent


def add_latest_smokedetector_message(room, message_id):
GlobalVars.latest_smokedetector_messages[room].append(message_id)
# Keep the last 100 messages
Expand Down
1 change: 0 additions & 1 deletion globalvars.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ class GlobalVars:
bodyfetcher = None
se_sites = []
why_data = []
why_data_allspam = []
notifications = []
listen_to_these_if_edited = []
multiple_reporters = []
Expand Down
28 changes: 14 additions & 14 deletions parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,20 @@ def get_user_from_url(url):
return None


def get_api_sitename_from_url(url):
    """
    Derive the site name used in Stack Exchange API requests from a site URL or bare hostname
    :param url: A site URL or hostname; the scheme and leading slashes are optional
    :return: 'mathoverflow.net' for that exact host, otherwise the first dot-separated label of
             the host; None if no hostname could be found in url
    """
    match = regex.compile(r"(?:https?:)?(?://)?([\w.]+)/?").search(url)
    if match is None:
        return None
    host = match.group(1)
    if host == 'mathoverflow.net':
        return host
    # NOTE(review): hosts with more than one leading label (e.g. es.stackoverflow.com) are
    # reduced to their first label only - confirm that this is the name the API expects.
    return host.split('.')[0]


# noinspection PyBroadException,PyMissingTypeHints
def fetch_post_url_from_msg_content(content):
search_regex = r"^\[ \[SmokeDetector\]\([^)]*\)(?: \| \[.+\]\(.+\))? \] [\w\s,:+\(\)-]+: \[.+]\(((?:http:)" \
Expand Down Expand Up @@ -103,20 +117,6 @@ def fetch_title_from_msg_content(content):
return None


# noinspection PyBroadException,PyMissingTypeHints
def fetch_user_from_allspam_report(content):
    """
    Extract the reported user from the text of an "All of this user's posts are spam" chat report
    :param content: The chat message text to parse
    :return: Whatever get_user_from_url returns for the user link in the report, or None if the
             message is not such a report or the link could not be processed
    """
    search_regex = r"^\[ \[SmokeDetector\]\([^)]*\) \] All of this user's posts are spam: \[user \d+ on " \
                   r"[\w\.]+\]\((//[\w\.]+/users/\d+\D*)\)(?: \[.+\]\(.+\))?$"
    match = regex.compile(search_regex).search(content)
    if match is None:
        return None
    try:
        return get_user_from_url(match.group(1))
    except Exception:
        # Best effort: treat any parsing failure as "not a report", but let
        # KeyboardInterrupt/SystemExit propagate instead of swallowing them.
        return None


# noinspection PyBroadException,PyMissingTypeHints
def edited_message_after_postgone_command(content):
search_regex = r"^\[ \[SmokeDetector\]\([^)]*\)(?: \| \[.+\]\(.+\))? \] [\w\s,:+\(\)-]+: (\[.+]\((?:(?:http:)" \
Expand Down
13 changes: 0 additions & 13 deletions spamhandling.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,16 +152,3 @@ def handle_spam(post, reasons, why):
except:
exc_type, exc_obj, exc_tb = sys.exc_info()
excepthook.uncaught_exception(exc_type, exc_obj, exc_tb)


def handle_user_with_all_spam(user, why):
    """
    Announce in chat that all of a user's posts are spam and store the reason for the report
    :param user: Indexable pair whose first item is the user ID and second item is the site
    :param why: The reason text to store for this user
    :return: None
    """
    user_id, site = user[0], user[1]
    # Network-wide profiles use a different profile tab name than per-site profiles
    if site == "stackexchange.com":
        tab = "activity"
    else:
        tab = "topactivity"
    message = "[ [SmokeDetector](//git.io/vgx7b) ] All of this user's posts are spam:" \
              " [user {} on {}](//{}/users/{}?tab={})".format(user_id, site, site, user_id, tab)

    log('debug', GlobalVars.parser.unescape(message).encode('ascii', errors='replace'))
    datahandling.add_why_allspam(user, why)

    target_rooms = ("debug", "site-" + site)
    chatcommunicate.tell_rooms(target_rooms, (), message)
1 change: 0 additions & 1 deletion test/data_test_parsing.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,4 @@
[ [SmokeDetector](https://github.com/Charcoal-SE/SmokeDetector) ] Title has only one unique char: [-----------------------------](//ru.stackoverflow.com/q/458053/) by [Ni55aN](//ru.stackoverflow.com/users/20555/ni55an) on `ru.stackoverflow.com`
[ [SmokeDetector](https://github.com/Charcoal-SE/SmokeDetector) ] Repeating characters in body: [Why I can't insert data in a model from a custom controller?](//stackoverflow.com/questions/27954020) by [user3754535](//stackoverflow.com/users/3754535/user3754535) on `stackoverflow.com` (@user1 @someuser)
[ [SmokeDetector](https://github.com/Charcoal-SE/SmokeDetector) ] Manually reported question (batch report: post 2 out of 3): [Why I can't insert data in a model from a custom controller?](//stackoverflow.com/questions/27954020) by [user3754535](//stackoverflow.com/users/3754535/user3754535) on `stackoverflow.com`
[ [SmokeDetector](https://github.com/Charcoal-SE/SmokeDetector) ] All of this user's posts are spam: [user 5733779 on stackoverflow.com](//stackoverflow.com/users/5733779)
[ [SmokeDetector](//goo.gl/eLDYqh) | [MS](//m.erwaysoftware.com/posts/by-url?url=//drupal.stackexchange.com/questions/230329) ] Bad keyword in title, blacklisted username, blacklisted website in body, link at end of body, pattern-matching website in body, +1 more: [spammy spammy title](//drupal.stackexchange.com/questions/230329) by [dford juriyam](//drupal.stackexchange.com/u/73447) on `drupal.SE`
10 changes: 6 additions & 4 deletions test/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@
# ('sd 2 tpu', preprocess_shortcut_command, 'sd tpu tpu'),
# ('sd 10 tpu', preprocess_shortcut_command, 'sd tpu tpu tpu tpu tpu tpu tpu tpu tpu tpu'),
# ('sd fp 3 tpu', preprocess_shortcut_command, 'sd fp tpu tpu tpu'),
('stackoverflow.com', get_api_sitename_from_url, 'stackoverflow'),
('http://gaming.stackexchange.com', get_api_sitename_from_url, 'gaming'),
('https://mathoverflow.net/', get_api_sitename_from_url, 'mathoverflow.net'),
(test_data_inputs[0], fetch_post_id_and_site_from_msg_content, ('246651', 'meta.stackexchange.com', 'question')),
(test_data_inputs[0], fetch_owner_url_from_msg_content, 'http://meta.stackexchange.com/users/279263/lisa-usher'),
(test_data_inputs[0], fetch_title_from_msg_content, 'Best Weight Loss Tips For Fast Results'),
Expand Down Expand Up @@ -103,10 +106,9 @@
(test_data_inputs[15], fetch_owner_url_from_msg_content, '//stackoverflow.com/users/3754535/user3754535'),
(test_data_inputs[15], fetch_title_from_msg_content, "Why I can't insert data in a model from a custom controller?"),
(test_data_inputs[15], edited_message_after_postgone_command, "[ [SmokeDetector](https://github.com/Charcoal-SE/SmokeDetector) ] Manually reported question (batch report: post 2 out of 3): *(gone)* by [user3754535](//stackoverflow.com/users/3754535/user3754535) on `stackoverflow.com`"),
(test_data_inputs[16], fetch_user_from_allspam_report, ('5733779', 'stackoverflow.com')),
(test_data_inputs[17], fetch_post_id_and_site_from_msg_content, ('230329', 'drupal.stackexchange.com', 'question')),
(test_data_inputs[17], fetch_title_from_msg_content, 'spammy spammy title'),
(test_data_inputs[17], fetch_owner_url_from_msg_content, '//drupal.stackexchange.com/u/73447')
(test_data_inputs[16], fetch_post_id_and_site_from_msg_content, ('230329', 'drupal.stackexchange.com', 'question')),
(test_data_inputs[16], fetch_title_from_msg_content, 'spammy spammy title'),
(test_data_inputs[16], fetch_owner_url_from_msg_content, '//drupal.stackexchange.com/u/73447')
])
def test_parsing(input_data, parse_method, expected):
assert parse_method(input_data.strip()) == expected
Expand Down

0 comments on commit e3e5e8b

Please sign in to comment.