-
Notifications
You must be signed in to change notification settings - Fork 13.7k
/
salesforce.py
412 lines (348 loc) · 17.7 KB
/
salesforce.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Connect to your Salesforce instance, retrieve data from it, and write that data to a file for other uses.
.. note:: this hook also relies on the simple_salesforce package:
https://github.com/simple-salesforce/simple-salesforce
"""
from __future__ import annotations
import logging
import time
from functools import cached_property
from typing import TYPE_CHECKING, Any, Iterable
from simple_salesforce import Salesforce, api
from airflow.hooks.base import BaseHook
if TYPE_CHECKING:
import pandas as pd
from requests import Session
log = logging.getLogger(__name__)
class SalesforceHook(BaseHook):
"""
Creates new connection to Salesforce and allows you to pull data out of SFDC and save it to a file.
You can then use that file with other Airflow operators to move the data into another data source.
:param conn_id: The name of the connection that has the parameters needed to connect to Salesforce.
The connection should be of type `Salesforce`.
:param session_id: The access token for a given HTTP request session.
:param session: A custom HTTP request session. This enables the use of requests Session features not
otherwise exposed by `simple_salesforce`.
.. note::
A connection to Salesforce can be created via several authentication options:
* Password: Provide Username, Password, and Security Token
* Direct Session: Provide a `session_id` and either Instance or Instance URL
* OAuth 2.0 JWT: Provide a Consumer Key and either a Private Key or Private Key File Path
* IP Filtering: Provide Username, Password, and an Organization ID
If in sandbox, enter a Domain value of 'test'.
"""
conn_name_attr = "salesforce_conn_id"
default_conn_name = "salesforce_default"
conn_type = "salesforce"
hook_name = "Salesforce"
def __init__(
self,
salesforce_conn_id: str = default_conn_name,
session_id: str | None = None,
session: Session | None = None,
) -> None:
super().__init__()
self.conn_id = salesforce_conn_id
self.session_id = session_id
self.session = session
def _get_field(self, extras: dict, field_name: str):
"""Get field from extra, first checking short name, then for backcompat we check for prefixed name."""
backcompat_prefix = "extra__salesforce__"
if field_name.startswith("extra__"):
raise ValueError(
f"Got prefixed name {field_name}; please remove the '{backcompat_prefix}' prefix "
"when using this method."
)
if field_name in extras:
return extras[field_name] or None
prefixed_name = f"{backcompat_prefix}{field_name}"
return extras.get(prefixed_name) or None
@staticmethod
def get_connection_form_widgets() -> dict[str, Any]:
"""Returns connection widgets to add to connection form."""
from flask_appbuilder.fieldwidgets import BS3PasswordFieldWidget, BS3TextFieldWidget
from flask_babel import lazy_gettext
from wtforms import PasswordField, StringField
return {
"security_token": PasswordField(lazy_gettext("Security Token"), widget=BS3PasswordFieldWidget()),
"domain": StringField(lazy_gettext("Domain"), widget=BS3TextFieldWidget()),
"consumer_key": StringField(lazy_gettext("Consumer Key"), widget=BS3TextFieldWidget()),
"private_key_file_path": PasswordField(
lazy_gettext("Private Key File Path"), widget=BS3PasswordFieldWidget()
),
"private_key": PasswordField(lazy_gettext("Private Key"), widget=BS3PasswordFieldWidget()),
"organization_id": StringField(lazy_gettext("Organization ID"), widget=BS3TextFieldWidget()),
"instance": StringField(lazy_gettext("Instance"), widget=BS3TextFieldWidget()),
"instance_url": StringField(lazy_gettext("Instance URL"), widget=BS3TextFieldWidget()),
"proxies": StringField(lazy_gettext("Proxies"), widget=BS3TextFieldWidget()),
"version": StringField(lazy_gettext("API Version"), widget=BS3TextFieldWidget()),
"client_id": StringField(lazy_gettext("Client ID"), widget=BS3TextFieldWidget()),
}
@staticmethod
def get_ui_field_behaviour() -> dict[str, Any]:
"""Returns custom field behaviour."""
return {
"hidden_fields": ["schema", "port", "extra", "host"],
"relabeling": {
"login": "Username",
},
}
@cached_property
def conn(self) -> api.Salesforce:
"""Returns a Salesforce instance. (cached)."""
connection = self.get_connection(self.conn_id)
extras = connection.extra_dejson
# all extras below (besides the version one) are explicitly defaulted to None
# because simple-salesforce has a built-in authentication-choosing method that
# relies on which arguments are None and without "or None" setting this connection
# in the UI will result in the blank extras being empty strings instead of None,
# which would break the connection if "get" was used on its own.
conn = Salesforce(
username=connection.login,
password=connection.password,
security_token=self._get_field(extras, "security_token") or None,
domain=self._get_field(extras, "domain") or None,
session_id=self.session_id,
instance=self._get_field(extras, "instance") or None,
instance_url=self._get_field(extras, "instance_url") or None,
organizationId=self._get_field(extras, "organization_id") or None,
version=self._get_field(extras, "version") or api.DEFAULT_API_VERSION,
proxies=self._get_field(extras, "proxies") or None,
session=self.session,
client_id=self._get_field(extras, "client_id") or None,
consumer_key=self._get_field(extras, "consumer_key") or None,
privatekey_file=self._get_field(extras, "private_key_file_path") or None,
privatekey=self._get_field(extras, "private_key") or None,
)
return conn
def get_conn(self) -> api.Salesforce:
"""Returns a Salesforce instance. (cached)."""
return self.conn
def make_query(self, query: str, include_deleted: bool = False, query_params: dict | None = None) -> dict:
"""
Make a query to Salesforce.
:param query: The query to make to Salesforce.
:param include_deleted: True if the query should include deleted records.
:param query_params: Additional optional arguments
:return: The query result.
"""
conn = self.get_conn()
self.log.info("Querying for all objects")
query_params = query_params or {}
query_results = conn.query_all(query, include_deleted=include_deleted, **query_params)
self.log.info(
"Received results: Total size: %s; Done: %s", query_results["totalSize"], query_results["done"]
)
return query_results
def describe_object(self, obj: str) -> dict:
"""
Get the description of an object from Salesforce.
This description is the object's schema and
some extra metadata that Salesforce stores for each object.
:param obj: The name of the Salesforce object that we are getting a description of.
:return: the description of the Salesforce object.
"""
conn = self.get_conn()
return conn.__getattr__(obj).describe()
def get_available_fields(self, obj: str) -> list[str]:
"""
Get a list of all available fields for an object.
:param obj: The name of the Salesforce object that we are getting a description of.
:return: the names of the fields.
"""
obj_description = self.describe_object(obj)
return [field["name"] for field in obj_description["fields"]]
def get_object_from_salesforce(self, obj: str, fields: Iterable[str]) -> dict:
"""
Get all instances of the `object` from Salesforce.
For each model, only get the fields specified in fields.
All we really do underneath the hood is run:
SELECT <fields> FROM <obj>;
:param obj: The object name to get from Salesforce.
:param fields: The fields to get from the object.
:return: all instances of the object from Salesforce.
"""
query = f"SELECT {','.join(fields)} FROM {obj}"
self.log.info(
"Making query to Salesforce: %s",
query if len(query) < 30 else " ... ".join([query[:15], query[-15:]]),
)
return self.make_query(query)
@classmethod
def _to_timestamp(cls, column: pd.Series) -> pd.Series:
"""
Convert a column of a dataframe to UNIX timestamps if applicable.
:param column: A Series object representing a column of a dataframe.
:return: a new series that maintains the same index as the original
"""
# try and convert the column to datetimes
# the column MUST have a four digit year somewhere in the string
# there should be a better way to do this,
# but just letting pandas try and convert every column without a format
# caused it to convert floats as well
# For example, a column of integers
# between 0 and 10 are turned into timestamps
# if the column cannot be converted,
# just return the original column untouched
import numpy as np
import pandas as pd
try:
column = pd.to_datetime(column)
except ValueError:
log.error("Could not convert field to timestamps: %s", column.name)
return column
# now convert the newly created datetimes into timestamps
# we have to be careful here
# because NaT cannot be converted to a timestamp
# so we have to return NaN
converted = []
for value in column:
try:
converted.append(value.timestamp())
except (ValueError, AttributeError):
converted.append(np.NaN)
return pd.Series(converted, index=column.index)
def write_object_to_file(
self,
query_results: list[dict],
filename: str,
fmt: str = "csv",
coerce_to_timestamp: bool = False,
record_time_added: bool = False,
) -> pd.DataFrame:
"""
Write query results to file.
Acceptable formats are:
- csv:
comma-separated-values file. This is the default format.
- json:
JSON array. Each element in the array is a different row.
- ndjson:
JSON array but each element is new-line delimited instead of comma delimited like in `json`
This requires a significant amount of cleanup.
Pandas doesn't handle output to CSV and json in a uniform way.
This is especially painful for datetime types.
Pandas wants to write them as strings in CSV, but as millisecond Unix timestamps.
By default, this function will try and leave all values as they are represented in Salesforce.
You use the `coerce_to_timestamp` flag to force all datetimes to become Unix timestamps (UTC).
This is can be greatly beneficial as it will make all of your datetime fields look the same,
and makes it easier to work with in other database environments
:param query_results: the results from a SQL query
:param filename: the name of the file where the data should be dumped to
:param fmt: the format you want the output in. Default: 'csv'
:param coerce_to_timestamp: True if you want all datetime fields to be converted into Unix timestamps.
False if you want them to be left in the same format as they were in Salesforce.
Leaving the value as False will result in datetimes being strings. Default: False
:param record_time_added: True if you want to add a Unix timestamp field
to the resulting data that marks when the data was fetched from Salesforce. Default: False
:return: the dataframe that gets written to the file.
"""
fmt = fmt.lower()
if fmt not in ["csv", "json", "ndjson"]:
raise ValueError(f"Format value is not recognized: {fmt}")
df = self.object_to_df(
query_results=query_results,
coerce_to_timestamp=coerce_to_timestamp,
record_time_added=record_time_added,
)
# write the CSV or JSON file depending on the option
# NOTE:
# datetimes here are an issue.
# There is no good way to manage the difference
# for to_json, the options are an epoch or a ISO string
# but for to_csv, it will be a string output by datetime
# For JSON we decided to output the epoch timestamp in seconds
# (as is fairly standard for JavaScript)
# And for csv, we do a string
if fmt == "csv":
# there are also a ton of newline objects that mess up our ability to write to csv
# we remove these newlines so that the output is a valid CSV format
self.log.info("Cleaning data and writing to CSV")
possible_strings = df.columns[df.dtypes == "object"]
df[possible_strings] = (
df[possible_strings]
.astype(str)
.apply(lambda x: x.str.replace("\r\n", "").str.replace("\n", ""))
)
# write the dataframe
df.to_csv(filename, index=False)
elif fmt == "json":
df.to_json(filename, "records", date_unit="s")
elif fmt == "ndjson":
df.to_json(filename, "records", lines=True, date_unit="s")
return df
def object_to_df(
self, query_results: list[dict], coerce_to_timestamp: bool = False, record_time_added: bool = False
) -> pd.DataFrame:
"""
Export query results to dataframe.
By default, this function will try and leave all values as they are represented in Salesforce.
You use the `coerce_to_timestamp` flag to force all datetimes to become Unix timestamps (UTC).
This is can be greatly beneficial as it will make all of your datetime fields look the same,
and makes it easier to work with in other database environments
:param query_results: the results from a SQL query
:param coerce_to_timestamp: True if you want all datetime fields to be converted into Unix timestamps.
False if you want them to be left in the same format as they were in Salesforce.
Leaving the value as False will result in datetimes being strings. Default: False
:param record_time_added: True if you want to add a Unix timestamp field
to the resulting data that marks when the data was fetched from Salesforce. Default: False
:return: the dataframe.
"""
import pandas as pd
# this line right here will convert all integers to floats
# if there are any None/np.nan values in the column
# that's because None/np.nan cannot exist in an integer column
# we should write all of our timestamps as FLOATS in our final schema
df = pd.DataFrame.from_records(query_results, exclude=["attributes"])
df.rename(columns=str.lower, inplace=True)
# convert columns with datetime strings to datetimes
# not all strings will be datetimes, so we ignore any errors that occur
# we get the object's definition at this point and only consider
# features that are DATE or DATETIME
if coerce_to_timestamp and df.shape[0] > 0:
# get the object name out of the query results
# it's stored in the "attributes" dictionary
# for each returned record
object_name = query_results[0]["attributes"]["type"]
self.log.info("Coercing timestamps for: %s", object_name)
schema = self.describe_object(object_name)
# possible columns that can be converted to timestamps
# are the ones that are either date or datetime types
# strings are too general and we risk unintentional conversion
possible_timestamp_cols = [
field["name"].lower()
for field in schema["fields"]
if field["type"] in ["date", "datetime"] and field["name"].lower() in df.columns
]
df[possible_timestamp_cols] = df[possible_timestamp_cols].apply(self._to_timestamp)
if record_time_added:
fetched_time = time.time()
df["time_fetched_from_salesforce"] = fetched_time
return df
def test_connection(self):
"""Test the Salesforce connectivity."""
try:
self.describe_object("Account")
status = True
message = "Connection successfully tested"
except Exception as e:
status = False
message = str(e)
return status, message