forked from openshift/origin
-
Notifications
You must be signed in to change notification settings - Fork 1
/
router.go
211 lines (182 loc) · 7.05 KB
/
router.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
package cluster
import (
"bufio"
"errors"
"fmt"
"io"
"reflect"
"regexp"
"time"
kerrs "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
kapi "k8s.io/kubernetes/pkg/api"
kclientset "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset"
authorizationapi "github.com/openshift/origin/pkg/authorization/apis/authorization"
osclient "github.com/openshift/origin/pkg/client"
deployapi "github.com/openshift/origin/pkg/deploy/apis/apps"
"github.com/openshift/origin/pkg/diagnostics/types"
)
// ClusterRouter is a Diagnostic to check that there is a working router.
type ClusterRouter struct {
	// KubeClient is used to list router pods and stream their logs.
	KubeClient kclientset.Interface
	// OsClient is used to retrieve the router DeploymentConfig.
	OsClient *osclient.Client
}
const (
	// ClusterRouterName is the identifier under which this diagnostic's
	// results are reported (see Check / NewDiagnosticResult).
	ClusterRouterName = "ClusterRouter"
	// routerName is the conventional name of the default router
	// DeploymentConfig and the DC this diagnostic looks for.
	routerName = "router"

	// clientAccessError: shown when the permission pre-check in CanRun fails.
	clientAccessError = `Client error while retrieving router records. Client retrieved records
during discovery, so this is likely to be a transient error. Try running
diagnostics again. If this message persists, there may be a permissions
problem with getting router records. The error was:
(%T) %[1]v`

	// clGetRtNone: warning when no DC named "router" exists at all.
	clGetRtNone = `
There is no "%s" DeploymentConfig. The router may have been named
something different, in which case this warning may be ignored.
A router is not strictly required; however it is needed for accessing
pods from external networks and its absence likely indicates an incomplete
installation of the cluster.
Use the 'oc adm router' command to create a router.
`
	// clGetRtFailed: error when retrieving the router DC fails for any
	// other (likely transient) reason.
	clGetRtFailed = `
Client error while retrieving "%s" DC. Client retrieved records
before, so this is likely to be a transient error. Try running
diagnostics again. If this message persists, there may be a permissions
problem with getting records. The error was:
(%[2]T) %[2]v`

	// clRtNoPods: error when the DC exists but selects no running pods.
	clRtNoPods = `
The "%s" DeploymentConfig exists but has no running pods, so it
is not available. Apps will not be externally accessible via the router.`

	// clRtPodLog: warning when a router pod's logs cannot be read.
	clRtPodLog = `
Failed to read the logs for the "%s" pod belonging to
the router deployment. This is not a problem by itself but prevents
diagnostics from looking for errors in those logs. The error encountered
was:
%s`

	// clRtPodConn: error when recent pod logs show the router failing to
	// request route information from the master.
	clRtPodConn = `
Recent pod logs for the "%s" pod belonging to
the router deployment indicated a problem requesting route information
from the master. This prevents the router from functioning, so
applications will not be externally accessible via the router.
There are many reasons for this request to fail, including invalid
credentials, DNS failures, master outages, and so on. Examine the
following error message from the router pod logs to determine the
cause of the problem:
%s
Time: %s`
)
// Name is part of the Diagnostic interface and returns this diagnostic's
// identifier. It returns the ClusterRouterName constant ("ClusterRouter");
// previously it returned the literal string "ClusterRouterName", which
// disagreed with the ID that Check passes to NewDiagnosticResult.
func (d *ClusterRouter) Name() string {
	return ClusterRouterName
}
// Description is part of the Diagnostic interface and returns a short,
// human-readable summary of what this diagnostic does.
func (d *ClusterRouter) Description() string {
	const summary = "Check there is a working router"
	return summary
}
// CanRun is part of the Diagnostic interface; it reports whether the
// prerequisites for Check are met: both clients are present and the current
// user may get the router DeploymentConfig in the default namespace.
func (d *ClusterRouter) CanRun() (bool, error) {
	if d.KubeClient == nil || d.OsClient == nil {
		return false, errors.New("must have kube and os client")
	}
	action := authorizationapi.Action{
		Namespace:    metav1.NamespaceDefault,
		Verb:         "get",
		Group:        deployapi.GroupName,
		Resource:     "deploymentconfigs",
		ResourceName: routerName,
	}
	allowed, err := userCan(d.OsClient, action)
	switch {
	case err != nil:
		return false, types.DiagnosticError{ID: "DClu2010", LogMessage: fmt.Sprintf(clientAccessError, err), Cause: err}
	case !allowed:
		return false, types.DiagnosticError{ID: "DClu2011", LogMessage: "Client does not have cluster-admin access", Cause: err}
	}
	return true, nil
}
// Check is part of the Diagnostic interface. It verifies that the router
// DeploymentConfig exists, that it has running pods, and that those pods'
// logs do not show recent failures talking to the master.
func (d *ClusterRouter) Check() types.DiagnosticResult {
	result := types.NewDiagnosticResult(ClusterRouterName)

	dc := d.getRouterDC(result)
	if dc == nil {
		return result
	}
	// Check that the DC actually has running pod(s) selected.
	pods := d.getRouterPods(dc, result)
	if pods == nil {
		return result
	}
	// Check each pod's logs for common issues (credentials, DNS resolution failure).
	for i := range pods.Items {
		d.checkRouterLogs(&pods.Items[i], result)
	}
	return result
}
// getRouterDC fetches the "router" DeploymentConfig from the default
// namespace. A *kerrs.StatusError is reported as a warning (the router may
// simply be named something else); any other error is reported as an error.
// Returns nil whenever the DC could not be retrieved.
func (d *ClusterRouter) getRouterDC(r types.DiagnosticResult) *deployapi.DeploymentConfig {
	dc, err := d.OsClient.DeploymentConfigs(metav1.NamespaceDefault).Get(routerName, metav1.GetOptions{})
	// NOTE(review): this type comparison matches ANY *kerrs.StatusError
	// (forbidden, conflict, ...), not just not-found; kerrs.IsNotFound(err)
	// would express the apparent intent more precisely — confirm and switch.
	if err != nil && reflect.TypeOf(err) == reflect.TypeOf(&kerrs.StatusError{}) {
		r.Warn("DClu2001", err, fmt.Sprintf(clGetRtNone, routerName))
		return nil
	} else if err != nil {
		r.Error("DClu2002", err, fmt.Sprintf(clGetRtFailed, routerName, err))
		return nil
	}
	// Was fmt.Sprintf with no format arguments (flagged by go vet); a plain
	// string is equivalent.
	r.Debug("DClu2003", "Found default router DC")
	return dc
}
// getRouterPods lists the pods selected by the router DC and filters them
// down to the ones in the Running phase. It returns nil (after recording an
// error) when the list call fails or when no pod is running; otherwise it
// returns the list with Items reduced to the running pods.
func (d *ClusterRouter) getRouterPods(dc *deployapi.DeploymentConfig, r types.DiagnosticResult) *kapi.PodList {
	selector := labels.SelectorFromSet(dc.Spec.Selector).String()
	pods, err := d.KubeClient.Core().Pods(metav1.NamespaceDefault).List(metav1.ListOptions{LabelSelector: selector})
	if err != nil {
		r.Error("DClu2004", err, fmt.Sprintf("Finding pods for '%s' DeploymentConfig failed. This should never happen. Error: (%[2]T) %[2]v", routerName, err))
		return nil
	}
	var running []kapi.Pod
	for _, pod := range pods.Items {
		if pod.Status.Phase == kapi.PodRunning {
			running = append(running, pod)
			r.Debug("DClu2006", fmt.Sprintf("Found running router pod with name %s", pod.ObjectMeta.Name))
		} else {
			r.Debug("DClu2005", fmt.Sprintf("router pod with name %s is not running", pod.ObjectMeta.Name))
		}
	}
	pods.Items = running
	if len(running) == 0 {
		r.Error("DClu2007", nil, fmt.Sprintf(clRtNoPods, routerName))
		return nil
	}
	return pods
}
// lineScanner is like a ReadCloser that gives back lines of text and you
// still have to Close(). It pairs a bufio.Scanner with the stream it reads
// from so the caller can both scan lines and close the underlying stream.
type lineScanner struct {
	// Scanner reads the stream line by line.
	Scanner *bufio.Scanner
	// ReadCloser is the underlying stream; the caller must Close() it.
	ReadCloser io.ReadCloser
}

// Scan advances to the next line, reporting false at EOF or on error.
func (s *lineScanner) Scan() bool { return s.Scanner.Scan() }

// Text returns the current line (without the trailing newline).
func (s *lineScanner) Text() string { return s.Scanner.Text() }

// Close closes the underlying stream.
func (s *lineScanner) Close() error { return s.ReadCloser.Close() }
// getPodLogScanner opens a log stream for the first container of the given
// pod (without following) and wraps it in a lineScanner. The caller is
// responsible for calling Close on the returned scanner.
func (d *ClusterRouter) getPodLogScanner(pod *kapi.Pod) (*lineScanner, error) {
	request := d.KubeClient.Core().RESTClient().Get().
		Namespace(pod.ObjectMeta.Namespace).
		Name(pod.ObjectMeta.Name).
		Resource("pods").SubResource("log").
		Param("follow", "false").
		Param("container", pod.Spec.Containers[0].Name)

	stream, err := request.Stream()
	if err != nil {
		return nil, err
	}
	return &lineScanner{Scanner: bufio.NewScanner(stream), ReadCloser: stream}, nil
}
// referenceTimestampLayout is the time.Parse layout for the timestamps at
// the start of router log lines (see checkRouterLogs). Per
// http://golang.org/pkg/time/#Parse the reference time is
// Mon Jan 2 15:04:05 -0700 MST 2006.
var referenceTimestampLayout = "2006-01-02T15:04:05.000000000Z"
// checkRouterLogs scans a router pod's logs for recent failures to list
// routes from the master and records an error if one occurred within the
// last 30 seconds. A failure to read the logs at all is only a warning.
func (d *ClusterRouter) checkRouterLogs(pod *kapi.Pod, r types.DiagnosticResult) {
	scanner, err := d.getPodLogScanner(pod)
	if err != nil {
		r.Warn("DClu2008", err, fmt.Sprintf(clRtPodLog, pod.ObjectMeta.Name, fmt.Sprintf("(%T) %[1]v", err)))
		return
	}
	defer scanner.Close()

	// Compile the pattern once; it was previously recompiled on every log
	// line inside the loop. Submatch 1 is the timestamp, submatch 2 the
	// error text.
	routeListFailRE := regexp.MustCompile(`^(\S+).*Failed to list \*api.Route: (.*)`)

	for scanner.Scan() {
		matches := routeListFailRE.FindStringSubmatch(scanner.Text())
		if len(matches) == 0 {
			continue
		}
		stamp, err := time.Parse(referenceTimestampLayout, matches[1])
		// The router checks every second. Error only if the failure is
		// recent — though of course we cannot always trust the local clock.
		if err == nil && time.Since(stamp).Seconds() < 30.0 {
			r.Error("DClu2009", nil, fmt.Sprintf(clRtPodConn, pod.ObjectMeta.Name, matches[2], matches[1]))
			break
		}
	}
	// bufio.Scanner swallows read errors; surface a mid-stream failure the
	// same way as failing to open the logs.
	if err := scanner.Scanner.Err(); err != nil {
		r.Warn("DClu2008", err, fmt.Sprintf(clRtPodLog, pod.ObjectMeta.Name, fmt.Sprintf("(%T) %[1]v", err)))
	}
}