Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HADOOP-18610. ABFS OAuth2 Token Provider support for Azure Workload Identity #5953

Closed
wants to merge 14 commits into from
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
import org.apache.hadoop.fs.azurebfs.oauth2.MsiTokenProvider;
import org.apache.hadoop.fs.azurebfs.oauth2.RefreshTokenBasedTokenProvider;
import org.apache.hadoop.fs.azurebfs.oauth2.UserPasswordTokenProvider;
import org.apache.hadoop.fs.azurebfs.oauth2.WorkloadIdentityTokenProvider;
import org.apache.hadoop.fs.azurebfs.security.AbfsDelegationTokenManager;
import org.apache.hadoop.fs.azurebfs.services.AuthType;
import org.apache.hadoop.fs.azurebfs.services.ExponentialRetryPolicy;
Expand Down Expand Up @@ -884,6 +885,19 @@ public AccessTokenProvider getTokenProvider() throws TokenAccessProviderExceptio
tokenProvider = new RefreshTokenBasedTokenProvider(authEndpoint,
clientId, refreshToken);
LOG.trace("RefreshTokenBasedTokenProvider initialized");
} else if (tokenProviderClass == WorkloadIdentityTokenProvider.class) {
String authority = getTrimmedPasswordString(
FS_AZURE_ACCOUNT_OAUTH_MSI_AUTHORITY,
AuthConfigurations.DEFAULT_FS_AZURE_ACCOUNT_OAUTH_MSI_AUTHORITY);
authority = appendSlashIfNeeded(authority);
String tenantId = getPasswordString(FS_AZURE_ACCOUNT_OAUTH_MSI_TENANT);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

always good to trim this so if someone splits a value with newlines it is trimmed properly

String clientId = getPasswordString(FS_AZURE_ACCOUNT_OAUTH_CLIENT_ID);
String tokenFile = getTrimmedPasswordString(
FS_AZURE_ACCOUNT_OAUTH_TOKEN_FILE,
AuthConfigurations.DEFAULT_FS_AZURE_ACCOUNT_OAUTH_TOKEN_FILE);
tokenProvider = new WorkloadIdentityTokenProvider(authority, tenantId,
clientId, tokenFile);
LOG.trace("WorkloadIdentityTokenProvider initialized");
} else {
throw new IllegalArgumentException("Failed to initialize " + tokenProviderClass);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ public final class AuthConfigurations {
public static final String
DEFAULT_FS_AZURE_ACCOUNT_OAUTH_REFRESH_TOKEN_ENDPOINT =
"https://login.microsoftonline.com/Common/oauth2/token";
/** Default OAuth token file path for the workload identity flow. */
public static final String
DEFAULT_FS_AZURE_ACCOUNT_OAUTH_TOKEN_FILE =
"/var/run/secrets/azure/tokens/azure-identity-token";

private AuthConfigurations() {
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,8 @@ public final class ConfigurationKeys {
public static final String FS_AZURE_ACCOUNT_OAUTH_REFRESH_TOKEN = "fs.azure.account.oauth2.refresh.token";
/** Key for oauth AAD refresh token endpoint: {@value}. */
public static final String FS_AZURE_ACCOUNT_OAUTH_REFRESH_TOKEN_ENDPOINT = "fs.azure.account.oauth2.refresh.token.endpoint";
/** Key for oauth AAD workload identity token file path: {@value}. */
public static final String FS_AZURE_ACCOUNT_OAUTH_TOKEN_FILE = "fs.azure.account.oauth2.token.file";
/** Key for enabling the tracking of ABFS API latency and sending the latency numbers to the ABFS API service */
public static final String FS_AZURE_ABFS_LATENCY_TRACK = "fs.azure.abfs.latency.track";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ public final class AzureADAuthenticator {
private static final Logger LOG = LoggerFactory.getLogger(AzureADAuthenticator.class);
private static final String RESOURCE_NAME = "https://storage.azure.com/";
private static final String SCOPE = "https://storage.azure.com/.default";
private static final String JWT_BEARER_ASSERTION = "urn:ietf:params:oauth:client-assertion-type:jwt-bearer";
private static final String CLIENT_CREDENTIALS = "client_credentials";
private static final int CONNECT_TIMEOUT = 30 * 1000;
private static final int READ_TIMEOUT = 30 * 1000;

Expand Down Expand Up @@ -103,14 +105,55 @@ public static AzureADToken getTokenUsingClientCreds(String authEndpoint,
} else {
qp.add("resource", RESOURCE_NAME);
}
qp.add("grant_type", "client_credentials");
qp.add("grant_type", CLIENT_CREDENTIALS);
qp.add("client_id", clientId);
qp.add("client_secret", clientSecret);
LOG.debug("AADToken: starting to fetch token using client creds for client ID " + clientId);

return getTokenCall(authEndpoint, qp.serialize(), null, null);
}

/**
* Gets Azure Active Directory token using the user ID and a JWT assertion
* generated by a federated authentication process.
*
* The federation process uses a feature from Azure Active Directory
* called workload identity. A workload identity is an identity used
* by a software workload (such as an application, service, script,
* or container) to authenticate and access other services and resources.
*
*
* @param authEndpoint the OAuth 2.0 token endpoint associated
* with the user's directory (obtain from
* Active Directory configuration)
* @param clientId the client ID (GUID) of the client web app
* obtained from Azure Active Directory configuration
* @param clientAssertion the JWT assertion token
* @return {@link AzureADToken} obtained using the creds
* @throws IOException throws IOException if there is a failure in connecting to Azure AD
*/
public static AzureADToken getTokenUsingJWTAssertion(String authEndpoint,
String clientId, String clientAssertion) throws IOException {
Preconditions.checkNotNull(authEndpoint, "authEndpoint");
Preconditions.checkNotNull(clientId, "clientId");
Preconditions.checkNotNull(clientAssertion, "clientAssertion");
boolean isVersion2AuthenticationEndpoint = authEndpoint.contains("/oauth2/v2.0/");

QueryParams qp = new QueryParams();
if (isVersion2AuthenticationEndpoint) {
qp.add("scope", SCOPE);
} else {
qp.add("resource", RESOURCE_NAME);
}
qp.add("grant_type", CLIENT_CREDENTIALS);
qp.add("client_id", clientId);
qp.add("client_assertion", clientAssertion);
qp.add("client_assertion_type", JWT_BEARER_ASSERTION);
LOG.debug("AADToken: starting to fetch token using client assertion for client ID " + clientId);

return getTokenCall(authEndpoint, qp.serialize(), null, "POST");
}

/**
* Gets AAD token from the local virtual machine's VM extension. This only works on
* an Azure VM with MSI extension
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.fs.azurebfs.oauth2;

import java.io.File;
import java.io.IOException;

import org.apache.hadoop.classification.VisibleForTesting;
import org.apache.hadoop.thirdparty.com.google.common.base.Strings;
import org.apache.hadoop.util.Preconditions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.commons.io.FileUtils;


/**
* Provides tokens based on Azure AD Workload Identity.
*/
public class WorkloadIdentityTokenProvider extends AccessTokenProvider {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can add tests around this class. Following is something around which we can write tests which can help prevent regressions in future:

  1. how refreshing the token work.
    • We can have a protected method getTokenTtl() which on production code would give ONE_HOUR, but in test, we can mock it as per the test requirement.
    • We can mock the external call, the super call.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for the guidance. I implemented several unit tests.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, we have added tests around hasEnoughTimeElapsedSinceLastRefresh. Was expecting if token refreshing could also be tested. And, also we could see refreshing on a upper layer (still an unit test):

  1. We have AccessTokenProvider object (instance of WorkloadIdentityTokenProvider ), and we could call AccessTokenProvider 's method getToken(), and then test different scenarios.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand the request, but I don't see a reasonable way to mock the code for a unit test of AccessTokenProvider. Note that AccessTokenProvider.getToken() calls WorkloadIdentityProvider.refreshToken(), which calls AzureADAuthenticator.getTokenUsingJWTAssertion(), which is a static method that eventually makes HTTP requests. That is a problem because:

  • All unit tests currently use Mockito version 2.28.2, which does not support mocking static methods.
  • The TestAzureADAuthenticator unit tests do not show how to mock the HTTP requests made by AzureADAuthenticator.

Without a way to mock the calls made by WorkloadIdentityProvider.refreshToken() or AzureADAuthenticator.getTokenUsingJWTAssertion(), all unit tests will try to make real HTTP requests which will always fail.

Do you have any ideas on how to work around the limitations of the code to implement the desired unit test?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we dont need to mock the static method. What can be done is:
we have a method: getTokenUsingJWTAssertion() which calls AzureADAuthenticator .getTokenUsingJWTAssertion(authEndpoint, clientId, clientAssertion);. Now, this new method is mockable, and in the test, we can give the required behavior.
Also, the methods which are mockable from the test, we add an annotation VisibleForTesting

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for the help. I added more unit tests per your instructions.


private static final String OAUTH2_TOKEN_PATH = "/oauth2/v2.0/token";
private final String authEndpoint;

private final String clientId;

private final String tokenFile;

private long tokenFetchTime = -1;

private static final long ONE_HOUR = 3600 * 1000;

private static final Logger LOG = LoggerFactory.getLogger(AccessTokenProvider.class);

public WorkloadIdentityTokenProvider(final String authority, final String tenantId,
final String clientId, final String tokenFile) {
Preconditions.checkNotNull(authority, "authority");
Preconditions.checkNotNull(tenantId, "tenantId");
Preconditions.checkNotNull(clientId, "clientId");
Preconditions.checkNotNull(tokenFile, "tokenFile");

this.authEndpoint = authority + tenantId + OAUTH2_TOKEN_PATH;
this.clientId = clientId;
this.tokenFile = tokenFile;
}

@Override
protected AzureADToken refreshToken() throws IOException {
LOG.debug("AADToken: refreshing token from JWT Assertion");
String clientAssertion = getClientAssertion();
AzureADToken token = getTokenUsingJWTAssertion(clientAssertion);
tokenFetchTime = System.currentTimeMillis();
return token;
}

/**
* Gets the Azure AD token from a client assertion in JWT format.
* This method exists to make unit testing possible.
*
* @param clientAssertion the client assertion.
* @return the Azure AD token.
* @throws IOException if there is a failure in connecting to Azure AD.
*/
@VisibleForTesting
AzureADToken getTokenUsingJWTAssertion(String clientAssertion) throws IOException {
return AzureADAuthenticator
.getTokenUsingJWTAssertion(authEndpoint, clientId, clientAssertion);
}

/**
* Checks if the token is about to expire as per base expiry logic.
* Otherwise try to expire if enough time has elapsed since the last refresh.
*
* @return true if the token is expiring in next 1 hour or if a token has
* never been fetched
*/
@Override
protected boolean isTokenAboutToExpire() {
return super.isTokenAboutToExpire() || hasEnoughTimeElapsedSinceLastRefresh();
}

/**
* Checks to see if enough time has elapsed since the last token refresh.
*
* @return true if the token was last refreshed more than an hour ago.
*/
protected boolean hasEnoughTimeElapsedSinceLastRefresh() {
if (getTokenFetchTime() == -1) {
return true;
}
boolean expiring = false;
long elapsedTimeSinceLastTokenRefreshInMillis =
System.currentTimeMillis() - getTokenFetchTime();
// In case token is not refreshed for 1 hr or any clock skew issues,
// refresh token.
expiring = elapsedTimeSinceLastTokenRefreshInMillis >= ONE_HOUR
|| elapsedTimeSinceLastTokenRefreshInMillis < 0;
if (expiring) {
LOG.debug("JWTToken: token renewing. Time elapsed since last token fetch:"
+ " {} milliseconds", elapsedTimeSinceLastTokenRefreshInMillis);
}
return expiring;
}

/**
* Gets the client assertion from the token file. The token in the file
* is automatically refreshed by Azure at least once every 24 hours.
* See <a href="https://azure.github.io/azure-workload-identity/docs/faq.html#does-workload-identity-work-in-disconnected-environments">
* Azure Workload Identity FAQ</a>.
*
* @return the client assertion.
* @throws IOException if the token file is empty.
*/
private String getClientAssertion()
throws IOException {
File file = new File(tokenFile);
String clientAssertion = FileUtils.readFileToString(file, "UTF-8");
if (Strings.isNullOrEmpty(clientAssertion)) {
throw new IOException("Empty token file.");
}
return clientAssertion;
}

/**
* Returns the last time the token was fetched from the token file.
* This method exists to make unit testing possible.
*
* @return the time the token was last fetched.
*/
@VisibleForTesting
long getTokenFetchTime() {
return tokenFetchTime;
}
}
52 changes: 50 additions & 2 deletions hadoop-tools/hadoop-azure/src/site/markdown/abfs.md
Original file line number Diff line number Diff line change
Expand Up @@ -322,8 +322,8 @@ The authentication mechanism is set in `fs.azure.account.auth.type` (or the
account specific variant). The possible values are SharedKey, OAuth, Custom
and SAS. For the various OAuth options use the config `fs.azure.account
.oauth.provider.type`. Following are the implementations supported
ClientCredsTokenProvider, UserPasswordTokenProvider, MsiTokenProvider and
RefreshTokenBasedTokenProvider. An IllegalArgumentException is thrown if
ClientCredsTokenProvider, UserPasswordTokenProvider, MsiTokenProvider,
RefreshTokenBasedTokenProvider and WorkloadIdentityTokenProvider. An IllegalArgumentException is thrown if
the specified provider type is not one of the supported.

All secrets can be stored in JCEKS files. These are encrypted and password
Expand Down Expand Up @@ -560,6 +560,54 @@ The Azure Portal/CLI is used to create the service identity.
</property>
```

### <a name="workload-identity"></a> Azure Workload Identity

[Azure Workload Identities](https://docs.microsoft.com/en-us/azure/active-directory/managed-identities-azure-resources/overview), formerly "Azure AD pod identity".

OAuth 2.0 tokens are written to a file that is only accessible
from the executing pod (`/var/run/secrets/azure/tokens/azure-identity-token`).
The issued credentials can be used to authenticate.

The Azure Portal/CLI is used to create the service identity.

```xml
<property>
<name>fs.azure.account.auth.type</name>
<value>OAuth</value>
<description>
Use OAuth authentication
</description>
</property>
<property>
<name>fs.azure.account.oauth.provider.type</name>
<value>org.apache.hadoop.fs.azurebfs.oauth2.WorkloadIdentityTokenProvider</value>
<description>
Use Workload Identity for issuing OAuth tokens
</description>
</property>
<property>
<name>fs.azure.account.oauth2.msi.tenant</name>
<value>${env.AZURE_TENANT_ID}</value>
<description>
Optional MSI Tenant ID
</description>
</property>
<property>
<name>fs.azure.account.oauth2.client.id</name>
<value>${env.AZURE_CLIENT_ID}</value>
<description>
Optional Client ID
</description>
</property>
<property>
<name>fs.azure.account.oauth2.token.file</name>
<value>${env.AZURE_FEDERATED_TOKEN_FILE}</value>
<description>
Token file path
</description>
</property>
```

### Custom OAuth 2.0 Token Provider

A Custom OAuth 2.0 token provider supplies the ABFS connector with an OAuth 2.0
Expand Down
36 changes: 36 additions & 0 deletions hadoop-tools/hadoop-azure/src/site/markdown/testing_azure.md
Original file line number Diff line number Diff line change
Expand Up @@ -837,6 +837,42 @@ hierarchical namespace enabled, and set the following configuration settings:
</property>
-->

<!--2.5. If "WorkloadIdentityTokenProvider" is set as key provider, uncomment below and
set tenant, client id and token file path.

All service principals must have federated identity credentials for Kubernetes.
See Azure docs: https://learn.microsoft.com/en-us/azure/active-directory/workload-identities/workload-identity-federation-create-trust?pivots=identity-wif-apps-methods-azp#kubernetes

Retrieve the Azure identity token from kubernetes:
1. Create AKS cluster with Workload Identity: https://learn.microsoft.com/en-us/azure/aks/workload-identity-deploy-cluster
2. Create the pod:
kubectl apply -f src/test/resources/workload-identity-pod.yaml
3. After the pod is running, retrieve the identity token from the pod logs:
kubectl logs pod/workload-identity
4. Save the identity token to the token file path specified below.

The Azure identity token expires after 1 hour.
-->
<!--
<property>
<name>fs.azure.account.oauth2.msi.tenant.{ABFS_ACCOUNT_NAME}</name>
<value>{tenantGuid}</value>
<description>msi tenantGuid.</description>
</property>

<property>
<name>fs.azure.account.oauth2.client.id.{ABFS_ACCOUNT_NAME}</name>
<value>{client id}</value>
<description>AAD client id.</description>
</property>

<property>
<name>fs.azure.account.oauth2.client.token.file.{ABFS_ACCOUNT_NAME}</name>
<value>{token file path}</value>
<description>Azure identity token file path.</description>
</property>
-->

<!--
<property>
<name>fs.azure.identity.transformer.enable.short.name</name>
Expand Down
Loading