-
Notifications
You must be signed in to change notification settings - Fork 133
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Seeing Timeout while contacting DNS servers
with latest v2.19.1
#233
Comments
Hi @magichair, could you please show me your config file and provide more details about your settings so that I could reproduce the issue? Thanks. |
@zhonghui12 thanks for jumping in, first time contributing to an issue on this repo so thanks for replying. Redacted <> ECS Task Definition {
"ipcMode": null,
"executionRoleArn": "arn:aws:iam::<aws-account-id>:role/defaultServiceRegistryEcsTaskExecutionRole",
"containerDefinitions": [
{
"dnsSearchDomains": null,
"environmentFiles": null,
"logConfiguration": {
"logDriver": "awsfirelens",
"secretOptions": null,
"options": {
"dd_message_key": "log",
"apikey": "<redacted-datadog-api-key>",
"provider": "ecs",
"dd_service": "acorn-starter",
"Host": "http-intake.logs.datadoghq.com",
"TLS": "on",
"dd_source": "api",
"dd_tags": "environment:development,region:us-east-1,service:acorn-starter,container:api",
"Name": "datadog"
}
},
"entryPoint": null,
"portMappings": [
{
"hostPort": 8000,
"protocol": "tcp",
"containerPort": 8000
}
],
"command": null,
"linuxParameters": null,
"cpu": 256,
"environment": [],
"resourceRequirements": null,
"ulimits": [
{
"name": "nofile",
"softLimit": 1024,
"hardLimit": 4096
}
],
"dnsServers": null,
"mountPoints": [
{
"readOnly": null,
"containerPath": "/conf.d",
"sourceVolume": "confd"
}
],
"workingDirectory": null,
"secrets": null,
"dockerSecurityOptions": null,
"memory": 512,
"memoryReservation": null,
"volumesFrom": [],
"stopTimeout": null,
"image": "<private-ecr-image-for-api-server>",
"startTimeout": null,
"firelensConfiguration": null,
"dependsOn": null,
"disableNetworking": null,
"interactive": null,
"healthCheck": {
"retries": 3,
"command": [
"CMD-SHELL",
"curl http://localhost:8000/_services/starter/health || exit 1"
],
"timeout": 5,
"interval": 30,
"startPeriod": 1
},
"essential": true,
"links": null,
"hostname": null,
"extraHosts": null,
"pseudoTerminal": null,
"user": null,
"readonlyRootFilesystem": null,
"dockerLabels": {
"container": "api",
"environment": "development",
"service": "acorn-starter",
"region": "us-east-1"
},
"systemControls": null,
"privileged": null,
"name": "api"
},
{
"dnsSearchDomains": null,
"environmentFiles": null,
"logConfiguration": {
"logDriver": "awslogs",
"secretOptions": null,
"options": {
"awslogs-group": "/fargate/service/acorn-starter/api",
"awslogs-region": "us-east-1",
"awslogs-stream-prefix": "fargate"
}
},
"entryPoint": null,
"portMappings": [],
"command": null,
"linuxParameters": null,
"cpu": 10,
"environment": [
{
"name": "DD_API_KEY",
"value": "<redacted-datadog-api-key"
},
{
"name": "DD_APM_ANALYZED_SPANS",
"value": "false"
},
{
"name": "DD_APM_ENABLED",
"value": "false"
},
{
"name": "DD_APM_NON_LOCAL_TRAFFIC",
"value": "false"
},
{
"name": "DD_DOCKER_LABELS_AS_TAGS",
"value": "{\"environment\": \"environment\",\"region\": \"region\",\"service\": \"service\", \"container\": \"container\"}"
},
{
"name": "DD_DOGSTATSD_NON_LOCAL_TRAFFIC",
"value": "true"
},
{
"name": "DD_DOGSTATSD_ORIGIN_DETECTION",
"value": "true"
},
{
"name": "DD_DOGSTATSD_PORT",
"value": "8127"
},
{
"name": "DD_IGNORE_RESOURCE",
"value": "(GET|POST) /health"
},
{
"name": "DD_LOGS_ENABLED",
"value": "false"
},
{
"name": "ECS_FARGATE",
"value": "false"
}
],
"resourceRequirements": null,
"ulimits": null,
"repositoryCredentials": null,
"dnsServers": null,
"mountPoints": [
{
"readOnly": null,
"containerPath": "/conf.d",
"sourceVolume": "confd"
}
],
"workingDirectory": null,
"secrets": null,
"dockerSecurityOptions": null,
"memory": 256,
"memoryReservation": null,
"volumesFrom": [],
"stopTimeout": null,
"image": "datadog/agent:7.16.1",
"startTimeout": null,
"firelensConfiguration": null,
"dependsOn": [
{
"containerName": "api",
"condition": "HEALTHY"
}
],
"disableNetworking": null,
"interactive": null,
"healthCheck": null,
"essential": true,
"links": null,
"hostname": null,
"extraHosts": null,
"pseudoTerminal": null,
"user": null,
"readonlyRootFilesystem": null,
"dockerLabels": {
"container": "api",
"environment": "development",
"service": "acorn-starter",
"region": "us-east-1"
},
"systemControls": null,
"privileged": null,
"name": "datadog-agent"
},
{
"dnsSearchDomains": null,
"environmentFiles": null,
"logConfiguration": {
"logDriver": "awslogs",
"secretOptions": null,
"options": {
"awslogs-group": "/fargate/service/acorn-starter/api",
"awslogs-region": "us-east-1",
"awslogs-stream-prefix": "fargate"
}
},
"entryPoint": null,
"portMappings": [],
"command": null,
"linuxParameters": null,
"cpu": 10,
"environment": [],
"resourceRequirements": null,
"ulimits": null,
"dnsServers": null,
"mountPoints": [],
"workingDirectory": null,
"secrets": null,
"dockerSecurityOptions": null,
"memory": 256,
"memoryReservation": null,
"volumesFrom": [],
"stopTimeout": null,
"image": "public.ecr.aws/aws-observability/aws-for-fluent-bit:latest",
"startTimeout": null,
"firelensConfiguration": {
"type": "fluentbit",
"options": {
"enable-ecs-log-metadata": "true"
}
},
"dependsOn": null,
"disableNetworking": null,
"interactive": null,
"healthCheck": null,
"essential": true,
"links": null,
"hostname": null,
"extraHosts": null,
"pseudoTerminal": null,
"user": "0",
"readonlyRootFilesystem": null,
"dockerLabels": {
"container": "api",
"environment": "development",
"service": "acorn-starter",
"region": "us-east-1"
},
"systemControls": null,
"privileged": null,
"name": "log_router"
}
],
"placementConstraints": [],
"memory": "2048",
"taskRoleArn": "arn:aws:iam::<aws-account-id>:role/acorn_starter_task_role",
"compatibilities": [
"EC2",
"FARGATE"
],
"taskDefinitionArn": "arn:aws:ecs:us-east-1:<aws-account-id>:task-definition/acorn-starter-api:97",
"family": "acorn-starter-api",
"requiresAttributes": [
{
"targetId": null,
"targetType": null,
"value": null,
"name": "ecs.capability.execution-role-awslogs"
},
{
"targetId": null,
"targetType": null,
"value": null,
"name": "com.amazonaws.ecs.capability.ecr-auth"
},
{
"targetId": null,
"targetType": null,
"value": null,
"name": "com.amazonaws.ecs.capability.logging-driver.awsfirelens"
},
{
"targetId": null,
"targetType": null,
"value": null,
"name": "com.amazonaws.ecs.capability.task-iam-role"
},
{
"targetId": null,
"targetType": null,
"value": null,
"name": "ecs.capability.container-health-check"
},
{
"targetId": null,
"targetType": null,
"value": null,
"name": "ecs.capability.execution-role-ecr-pull"
},
{
"targetId": null,
"targetType": null,
"value": null,
"name": "com.amazonaws.ecs.capability.docker-remote-api.1.18"
},
{
"targetId": null,
"targetType": null,
"value": null,
"name": "ecs.capability.task-eni"
},
{
"targetId": null,
"targetType": null,
"value": null,
"name": "com.amazonaws.ecs.capability.docker-remote-api.1.29"
},
{
"targetId": null,
"targetType": null,
"value": null,
"name": "com.amazonaws.ecs.capability.logging-driver.awslogs"
},
{
"targetId": null,
"targetType": null,
"value": null,
"name": "com.amazonaws.ecs.capability.docker-remote-api.1.19"
},
{
"targetId": null,
"targetType": null,
"value": null,
"name": "ecs.capability.firelens.fluentbit"
},
{
"targetId": null,
"targetType": null,
"value": null,
"name": "ecs.capability.private-registry-authentication.secretsmanager"
},
{
"targetId": null,
"targetType": null,
"value": null,
"name": "ecs.capability.container-ordering"
}
],
"pidMode": null,
"requiresCompatibilities": [
"EC2",
"FARGATE"
],
"networkMode": "awsvpc",
"cpu": "512",
"revision": 97,
"status": "ACTIVE",
"inferenceAccelerators": null,
"proxyConfiguration": null,
"volumes": [
{
"fsxWindowsFileServerVolumeConfiguration": null,
"efsVolumeConfiguration": null,
"name": "logs",
"host": {
"sourcePath": null
},
"dockerVolumeConfiguration": null
},
{
"fsxWindowsFileServerVolumeConfiguration": null,
"efsVolumeConfiguration": null,
"name": "confd",
"host": {
"sourcePath": null
},
"dockerVolumeConfiguration": null
}
]
} And here are some screenshots of, hopefully relevant, sections for easier review. And finally a more full log snippet to show frequency and other log messages around it. Thanks. |
fluent/fluent-bit#3944 looks somewhat related, however they complain about a jump from 1.8.2 to 1.8.3, and my behavior here shows 1.8.3 works and 1.8.6 does not. |
I also hit this issue with latest fluentbit as well, 2.19.0 works well. |
I can confirm that 2.19.0 works fine, the issue is in 2.19.1. |
Can confirm. Same issue. DNS times out after exactly 5 seconds. Similar to this issue with fluent-bit on K8S kubernetes/kubernetes#56903 |
2.19.1 added a new DNS network setting from upstream: https://github.com/fluent/fluent-bit/blob/master/src/flb_upstream.c#L38 So in your output you can set:
The other valid value is TCP. Does trying that affect this issue? |
@chrisgray-vertex Just checking, you use 2.19.1, right? In previous versions that setting will be entirely ignored. EDIT: Sorry, it's 2.19.1 |
Yes, I first tried it with latest (2.19.1) with both UDP and TCP and still had the timeouts. Then pinned it to 2.19.0 and it failed to start with an error that |
Thanks for spending time with us @chrisgray-vertex. It seems like a DNS issue in our upstream. @magichair could you please open an issue in our upstream: https://github.com/fluent/fluent-bit to let the upstream maintainer notice this? We will also talk to them about this one. Thanks! |
We hit this issue as well. I can confirm that pinning to 2.19.0 does fix it. |
Just want to share, for folks that might just be joining on here and finding this later, this repo actually maintains a We found that it was a fair bit of work to rollout to all our infra, and we don't love having to repeat that, so we ended up choosing to switch from |
This must be a bug in the upstream code base; we are tracking it upstream in this issue: fluent/fluent-bit#4050 We'll try our best to reproduce this and engage the community to take a look. The Fluent Bit code base is huge; I'm an upstream maintainer specifically for AWS integrations- we will work with the larger community to keep fluent bit stable and fix these sorts of issues. |
If anyone is curious, here are the results of my investigation: fluent/fluent-bit#4050 (comment) Currently, the recommendation from folks on this issue has been confirmed by AWS. DNS resolution seems to have issues in 2.19.1 (Fluent Bit 1.8.6), and 2.19.0 (1.8.3) works. I was only able to reproduce DNS resolution issues with the datadog output though. If anyone has a nicely reproducible test case for other outputs, please post & send me the details. Reminder that you can map our releases to upstream releases here: https://github.com/aws/aws-for-fluent-bit/releases |
We are running the datadog integration but we are not sending the logs to datadog. We send them to an internal server. I just wanted to add that even if you don't send the logs to datadog the integration still has DNS lookup problems. |
Y'all gotta fix this ASAP! 😬 |
We discussed this with upstream yesterday. Upstream has identified the root cause - if a plugin does not implement the config_map interface, the plugin's net_setup is not initialized. A solution is provided - output: initialize network defaults for output instances. - fluent/fluent-bit#4050 (comment). Once upstream has released the patch, AWS will provide a new release. |
Fluent Bit 1.8.7 has fixed this issue: https://fluentbit.io/announcements/v1.8.7/ We need to release it in AWS for Fluent Bit. |
Hi @PettitWesley, any idea when the new version is going to be bumped? This is affecting our ability to have logs in production. |
@PettitWesley Release ASAP will be huge help. |
@PettitWesley @tai-acall you could easily fix it by using the stable docker image instead of latest. |
Confirming that fixing image to |
Hi @jagnk @tai-acall, Sorry for the wait. We were busy with another release which upgraded the golang version in our image. It was done yesterday and I will work on a new release today to include fluent bit 1.8.7. Thanks for the patience. I will let you all know when a new image is available. |
Hi all, aws-for-fluent-bit 2.20.0 is out: https://github.com/aws/aws-for-fluent-bit/releases/tag/v2.20.0. It includes fluent bit 1.8.7 and should fix this issue. Please try the latest image to see if your problem has been resolved. Thanks for the patience! |
Could anyone try the latest image and let me know if it works fine? Much appreciated. |
No DNS errors since the update to 2.20.0. I didn't notice these warnings when using cloudwatch_logs, but after switching to the cloudwatch plugin I got a lot of these warnings and still seem to have gaps in the logs. |
This means you're producing logs faster than fluent bit can read them and store them in its buffer. So you need to increase Mem_buf_limit: https://docs.fluentbit.io/manual/administration/buffering-and-storage
This is a failed retry. If it fails to retry a chunk, then some of your logs will be lost. There should be other errors in your logs, which explain why the retry failed. |
@PettitWesley already increased that Mem_buf_limit but still see the
Also, I saw the following errors: Maybe I need to increase more options of the input plugin? I've already changed from defaults
No more error messages in my fluentbit application log. |
@Funkerman1992 Please open another issue for this. This looks like a bug, somehow our calculation of the event size in the payload is wrong. If you can, please provide full details in that issue on your config, and how to repro the error. |
Closing this issue for now as it has been fixed for some people. If you still have a DNS issue in the latest version, please track it here: #253. Thanks. |
👋
I'm seeing a stream of errors like:
when my ECS tasks have picked up the latest SHA.
I can confirm by SSH'ing to the EC2 ECS docker host and
docker exec
on to the logrouter
instance that it is definitely able to both DNS resolve http-intake.logs.datadoghq.com
very quickly and can get data back from that server. Are there any published tags in public.ecr.aws/aws-observability/aws-for-fluent-bit
for previous releases I could pin to to mitigate the issue in the short-term? Ah, I see the https://github.com/aws/aws-for-fluent-bit#versioning-faq section, I'll pin to 2.19.0 for now.
Thank you!
The text was updated successfully, but these errors were encountered: