[HAProxy] new haproxy config and resources

aptos-labs · Sep 28, 2022 · 74b8581 · 74b8581
1 parent f87b98d
commit 74b8581
Show file tree

Hide file tree

Showing 2 changed files with 149 additions and 30 deletions.
diff --git a/terraform/helm/aptos-node/files/haproxy.cfg b/terraform/helm/aptos-node/files/haproxy.cfg
@@ -1,47 +1,161 @@
 global
     log stdout len 10240 format raw local0
-    maxconn 500000
-    nbthread 16
+
+    # Config manual: https://cbonte.github.io/haproxy-dconv/2.5/configuration.html
+    # magic values : terraform/helm/aptos-node/values.yaml
+
+    maxconn 1024
+    # This limits the whole HA Proxy impacting both validators and other frontends
+    # maxconnrate 128
+    nbthread 4
+
+    # Client-facing sndbuf/rcvbuf: 4MB covers 100Mb/s at 300 ms latency (e.g., us-asia). NOTE: the default tcpBufSize in values.yaml is 524288 (512 KiB).
+    #tune.sndbuf.client {{ $.Values.haproxy.limits.validator.tcpBufSize }}
+    tune.rcvbuf.client {{ $.Values.haproxy.limits.validator.tcpBufSize }}
+
     user nobody
 
+## TCP port defaults
 defaults
     log global
-    option tcplog
-    maxconn 500000
-    timeout queue 1s
-    timeout connect 10s
-    timeout server 60s
-    timeout client 60s
-    timeout client-fin 5s
-
-frontend validator
+    mode tcp
+    #option tcplog
+    option dontlog-normal
+    log-format "%ci:%cp - %sp[%rt] [%t] %ft %Tw/%Tc/%Tt %B [%ts] %ac/%fc/%bc/%sc/%rc %sq/%bq"
+    maxconn 1024		#Validator network mesh + FN x2
+    retries 3
+    timeout queue 5s  #limits num of concurrent connections. Not clear if t/o connect is needed. #https://www.papertrail.com/solution/tips/haproxy-logging-how-to-tune-timeouts-for-performance/
+    timeout connect 5s
+    # enough for 1 successful + 5 unsuccessful HB (10 sec interval) + 20 sec timeout
+    timeout server 80s
+    timeout client 80s
+
+    timeout client-fin 3s #How long to hold an interrupted client connection.
+    timeout server-fin 1s
+
+frontend fe-{{ include "aptos-validator.fullname" $ }}-validator
     bind :6180
-    default_backend validator
+    default_backend {{ include "aptos-validator.fullname" $ }}-validator
 
     # Deny requests from blocked IPs
-    tcp-request connection reject if { src -n -f /usr/local/etc/haproxy/blocked.ips }
+    tcp-request connection silent-drop if { src -n -f /usr/local/etc/haproxy/blocked.ips }
+
+    acl ip_high_conn_rate sc0_conn_rate gt {{ $.Values.haproxy.limits.validator.connectionsPerIPPerMin }}
+
+    stick-table type ip size 10m expire 30m store gpc0,gpc1,conn_rate(1m),bytes_out_rate(10s),bytes_out_cnt	##about 500MB of memory
+    tcp-request connection track-sc0 src 						   #update table with src ip as key, store in sc0
+
+    # We count the rate limit manually -- this is more CPU intensive, but it lets whitelisted IPs in, plus up to rateLimitSession non-blacklisted IPs.
+    tcp-request connection track-sc1 int(1) table CONN_RATE
+
+    #tcp-request connection sc-set-gpt0(0) int(...) if ip_high_conn_rate is better but dies with:
+    #parsing [/usr/local/etc/haproxy/haproxy.cfg:53] : internal error, unexpected rule->from=0, please report this bug!
+    #<1> Mark Blacklist
+    tcp-request connection sc-inc-gpc0(0) if ip_high_conn_rate
+
+    #This connection is silently dropped no reason to count it for rateLimitSession
+    tcp-request connection sc-inc-gpc1(1) unless { sc0_get_gpc0() ge 1 }
+
+    # an IP that was blacklisted due to too many unsuccessful tcp attempts
+    #-1- Enforce Blacklist
+    tcp-request connection silent-drop if { sc0_get_gpc0() ge 1 }
 
-    # Limit to N TCP connections per minute per source IP
-    stick-table type ip size 500k expire 1m store gpc0_rate(1m)
-    tcp-request connection track-sc0 src
-    # TODO: Reject at content phase for now so we get logs, but this should be
-    # done at connection phase for higher efficiency
-    tcp-request content reject if { sc_gpc0_rate(0) ge {{ $.Values.haproxy.limits.validator.connectionsPerIPPerMin }} }
-    tcp-request content sc-inc-gpc0(0) unless { nbsrv(validator) eq 0 }
+    # an IP that had a successful connection.
+    #-2- Allow Whitelist
+    tcp-request connection accept if { sc0_get_gpc1() ge 1 }
 
-backend validator
+    #-3- Enforce RateLimit
+    tcp-request connection reject if { sc1_gpc1_rate(CONN_RATE) gt  {{ $.Values.haproxy.limits.validator.rateLimitSession }} }
+
+    # This is a successful connection, i.e., it was sent more than 4K bytes in the last 30 min
+    #tcp-request session sc-set-gpt0(0) int(...)  if { sc0_kbytes_out gt 16 }
+    #<2> Mark Whitelist
+    tcp-request session sc-inc-gpc1(0) if { sc0_kbytes_out gt 4 }
+
+    # -4- Break a long high rate connection
+    tcp-request session reject if { sc0_bytes_out_rate gt  {{ $.Values.haproxy.limits.validator.maxBytesOutRate10sec }} }
+
+backend {{ include "aptos-validator.fullname" $ }}-validator
     default-server maxconn 1024 {{ if $.Values.haproxy.config.send_proxy_protocol }}send-proxy-v2{{ end }}
     server {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-validator {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-validator:6180
 
+frontend fe-{{ include "aptos-validator.fullname" $ }}-validator-fn
+    bind :6181
+    default_backend {{ include "aptos-validator.fullname" $ }}-validator-fn
+
+    # Deny requests from blocked IPs
+    tcp-request connection silent-drop if { src -n -f /usr/local/etc/haproxy/blocked.ips }
+
+    acl ip_high_conn_rate sc0_conn_rate gt {{ $.Values.haproxy.limits.validator.connectionsPerIPPerMin }}
+
+    stick-table type ip size 10m expire 30m store gpc0,gpc1,conn_rate(1m),bytes_out_rate(10s),bytes_out_cnt	##about 500MB of memory
+    tcp-request connection track-sc0 src 						   #update table with src ip as key, store in sc0
+
+    # We count the rate limit manually -- this is more CPU intensive, but it lets whitelisted IPs in, plus up to rateLimitSession non-blacklisted IPs.
+    tcp-request connection track-sc1 int(1) table CONN_RATE
+
+    #tcp-request connection sc-set-gpt0(0) int(...) if ip_high_conn_rate is better but dies with:
+    #parsing [/usr/local/etc/haproxy/haproxy.cfg:53] : internal error, unexpected rule->from=0, please report this bug!
+    #<1> Mark Blacklist
+    tcp-request connection sc-inc-gpc0(0) if ip_high_conn_rate
+
+    #This connection is silently dropped no reason to count it for rateLimitSession
+    tcp-request connection sc-inc-gpc1(1) unless { sc0_get_gpc0() ge 1 }
+
+    # an IP that was blacklisted due to too many unsuccessful tcp attempts
+    #-1- Enforce Blacklist
+    tcp-request connection silent-drop if { sc0_get_gpc0() ge 1 }
+
+    # an IP that had a successful connection.
+    #-2- Allow Whitelist
+    tcp-request connection accept if { sc0_get_gpc1() ge 1 }
+
+    #-3- Enforce RateLimit
+    tcp-request connection reject if { sc1_gpc1_rate(CONN_RATE) gt  {{ $.Values.haproxy.limits.validator.rateLimitSession }} }
+
+    # This is a successful connection, i.e., it was sent more than 4K bytes in the last 30 min
+    #tcp-request session sc-set-gpt0(0) int(...)  if { sc0_kbytes_out gt 16 }
+    #<2> Mark Whitelist
+    tcp-request session sc-inc-gpc1(0) if { sc0_kbytes_out gt 4 }
+
+    # -4- Break a long high rate connection
+    tcp-request session reject if { sc0_bytes_out_rate gt  {{ $.Values.haproxy.limits.validator.maxBytesOutRate10sec }} }
+
+backend {{ include "aptos-validator.fullname" $ }}-validator-fn
+    default-server maxconn 16
+    server {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-validator {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-validator:6181
+
+
+#CONN_RATE holds only one entry, with key 1: used for determining the global conn rate
+backend CONN_RATE
+    stick-table type integer size 1 expire 10m store gpc1,gpc1_rate(1s)
+
+##################  HTTP: metrics & API
+defaults
+	mode http
+        retries 3
+        timeout queue 5s  #limits num of concurrent connections. Not clear if t/o connect is needed. #https://www.papertrail.com/solution/tips/haproxy-logging-how-to-tune-timeouts-for-performance/
+        timeout connect 5s
+        timeout server 60s #what makes sense? for silence between nodes?
+        timeout client 60s
+
+        timeout client-fin 3s #How long to hold an interrupted client connection.
+        timeout server-fin 1s
+
+	timeout http-request 60s #len of http request
+	timeout http-keep-alive 2s
+
+        rate-limit sessions 256
+
 frontend validator-metrics
     mode http
     option httplog
     bind :9102
     default_backend validator-metrics
-    http-request add-header Forwarded "for=%ci"
 
     # Deny requests from blocked IPs
     tcp-request connection reject if { src -n -f /usr/local/etc/haproxy/blocked.ips }
+    http-request add-header Forwarded "for=%ci"
 
 backend validator-metrics
     mode http
@@ -55,10 +169,10 @@ frontend validator-api
     option httplog
     bind :8180
     default_backend validator-api
-    http-request add-header Forwarded "for=%ci"
 
     # Deny requests from blocked IPs
     tcp-request connection reject if { src -n -f /usr/local/etc/haproxy/blocked.ips }
+    http-request add-header Forwarded "for=%ci"
 
 backend validator-api
     mode http
@@ -87,10 +201,10 @@ frontend {{ $config.name }}-api
     default_backend {{ $config.name }}-api
     # add Forwarded header, which behaves differently than X-Forwarded-For
     # see https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Forwarded
-    http-request add-header Forwarded "for=%ci"
 
     # Deny requests from blocked IPs
     tcp-request connection reject if { src -n -f /usr/local/etc/haproxy/blocked.ips }
+    http-request add-header Forwarded "for=%ci"
 
 backend {{ $config.name }}-api
     mode http
@@ -102,10 +216,10 @@ frontend {{ $config.name }}-metrics
     option httplog
     bind :{{ add 9103 $index }}
     default_backend {{ $config.name }}-metrics
-    http-request add-header Forwarded "for=%ci"
 
     # Deny requests from blocked IPs
     tcp-request connection reject if { src -n -f /usr/local/etc/haproxy/blocked.ips }
+    http-request add-header Forwarded "for=%ci"
 
 backend {{ $config.name }}-metrics
     mode http

diff --git a/terraform/helm/aptos-node/values.yaml b/terraform/helm/aptos-node/values.yaml
@@ -31,18 +31,23 @@ haproxy:
     pullPolicy: IfNotPresent
   resources:
     limits:
-      cpu: 1.5
-      memory: 2Gi
+      cpu: 4
+      memory: 8Gi
     requests:
-      cpu: 1.5
-      memory: 2Gi
+      cpu: 4
+      memory: 8Gi
   nodeSelector: {}
   tolerations: []
   affinity: {}
   limits:
     validator:
-      # -- Limit the number of connections per IP address per minute
+      # -- Limit the number of connections per IP address per minute
       connectionsPerIPPerMin: 2
+      # Max bytes out per 10 sec window: 134217728 (128 MiB) is roughly a sustained 100 Mbit/s for 10 sec.
+      maxBytesOutRate10sec: 134217728
+      rateLimitSession: 256
+      tcpBufSize: 524288
+
   config:
     # -- Whether to send Proxy Protocol v2
     send_proxy_protocol: &send_proxy_protocol false