-
Notifications
You must be signed in to change notification settings - Fork 157
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
8a7f9cc
commit 0420368
Showing
297 changed files
with
73,700 additions
and
1 deletion.
There are no files selected for viewing
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added
BIN
+22.6 KB
docs/0.10.2/doctrees/dataprofiler.data_readers.filepath_or_buffer.doctree
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added
BIN
+11.7 KB
docs/0.10.2/doctrees/dataprofiler.data_readers.structured_mixins.doctree
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added
BIN
+87.4 KB
docs/0.10.2/doctrees/dataprofiler.labelers.char_load_tf_model.doctree
Binary file not shown.
Binary file added
BIN
+97.5 KB
docs/0.10.2/doctrees/dataprofiler.labelers.character_level_cnn_model.doctree
Binary file not shown.
Binary file added
BIN
+82 KB
docs/0.10.2/doctrees/dataprofiler.labelers.classification_report_utils.doctree
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added
BIN
+73.9 KB
docs/0.10.2/doctrees/dataprofiler.profilers.base_column_profilers.doctree
Binary file not shown.
Binary file added
BIN
+59.1 KB
docs/0.10.2/doctrees/dataprofiler.profilers.categorical_column_profile.doctree
Binary file not shown.
Binary file added
BIN
+173 KB
docs/0.10.2/doctrees/dataprofiler.profilers.column_profile_compilers.doctree
Binary file not shown.
Binary file added
BIN
+64 KB
docs/0.10.2/doctrees/dataprofiler.profilers.data_labeler_column_profile.doctree
Binary file not shown.
Binary file added
BIN
+46.5 KB
docs/0.10.2/doctrees/dataprofiler.profilers.datetime_column_profile.doctree
Binary file not shown.
Binary file not shown.
Binary file added
BIN
+81.2 KB
docs/0.10.2/doctrees/dataprofiler.profilers.float_column_profile.doctree
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added
BIN
+18.3 KB
docs/0.10.2/doctrees/dataprofiler.profilers.helpers.report_helpers.doctree
Binary file not shown.
Binary file not shown.
Binary file added
BIN
+78.8 KB
docs/0.10.2/doctrees/dataprofiler.profilers.int_column_profile.doctree
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added
BIN
+81.2 KB
docs/0.10.2/doctrees/dataprofiler.profilers.numerical_column_stats.doctree
Binary file not shown.
Binary file added
BIN
+45.8 KB
docs/0.10.2/doctrees/dataprofiler.profilers.order_column_profile.doctree
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added
BIN
+102 KB
docs/0.10.2/doctrees/dataprofiler.profilers.text_column_profile.doctree
Binary file not shown.
Binary file added
BIN
+29.5 KB
docs/0.10.2/doctrees/dataprofiler.profilers.unstructured_labeler_profile.doctree
Binary file not shown.
Binary file added
BIN
+27.5 KB
docs/0.10.2/doctrees/dataprofiler.profilers.unstructured_text_profile.doctree
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
438 changes: 438 additions & 0 deletions
438
docs/0.10.2/doctrees/nbsphinx/add_new_model_to_data_labeler.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
364 changes: 364 additions & 0 deletions
364
docs/0.10.2/doctrees/nbsphinx/column_name_labeler_example.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,364 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "e04c382a-7c49-452b-b9bf-e448951c64fe", | ||
"metadata": {}, | ||
"source": [ | ||
"# ColumnName Labeler Tutorial" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "6fb3ecb9-bc51-4c18-93d5-7991bbee5165", | ||
"metadata": {}, | ||
"source": [ | ||
"This notebook teaches how to use the existing `ColumnNameModel`:\n", | ||
"\n", | ||
"1. Loading and utilizing the pre-existing `ColumnNameModel`\n", | ||
"2. Run the labeler\n", | ||
"\n", | ||
"First, let's import the libraries needed for this example." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "a67c197b-d3ee-4896-a96f-cc3d043601d3", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import os\n", | ||
"import sys\n", | ||
"import json\n", | ||
"from pprint import pprint\n", | ||
"\n", | ||
"import pandas as pd\n", | ||
"\n", | ||
"try:\n", | ||
" import dataprofiler as dp\n", | ||
"except ImportError:\n", | ||
" sys.path.insert(0, '../..')\n", | ||
" import dataprofiler as dp" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "35841215", | ||
"metadata": {}, | ||
"source": [ | ||
"## Loading and predicting using a pre-existing model using `load_from_library`\n", | ||
"\n", | ||
"The easiest option for users is to `load_from_library` by specifying the name for the labeler in the `resources/` folder. Quickly import and start predicting with any model from the Data Profiler's library of models available." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "46e36dd6", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"labeler_from_library = dp.DataLabeler.load_from_library('column_name_labeler')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "dfa94868", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"labeler_from_library.predict(data=[\"ssn\"])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "c71356f4-9020-4862-a1e1-816effbb5443", | ||
"metadata": {}, | ||
"source": [ | ||
"## Loading and using the pre-existing column name labeler using `load_with_components`\n", | ||
"\n", | ||
"For example purposes here, we will import the exsting `ColumnName` labeler via the `load_with_components` command from the `dp.DataLabeler`. This shows a bit more of the details of the data labeler's flow." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "818c5b88", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"parameters = {\n", | ||
" \"true_positive_dict\": [\n", | ||
" {\"attribute\": \"ssn\", \"label\": \"ssn\"},\n", | ||
" {\"attribute\": \"suffix\", \"label\": \"name\"},\n", | ||
" {\"attribute\": \"my_home_address\", \"label\": \"address\"},\n", | ||
" ],\n", | ||
" \"false_positive_dict\": [\n", | ||
" {\n", | ||
" \"attribute\": \"contract_number\",\n", | ||
" \"label\": \"ssn\",\n", | ||
" },\n", | ||
" {\n", | ||
" \"attribute\": \"role\",\n", | ||
" \"label\": \"name\",\n", | ||
" },\n", | ||
" {\n", | ||
" \"attribute\": \"send_address\",\n", | ||
" \"label\": \"address\",\n", | ||
" },\n", | ||
" ],\n", | ||
" \"negative_threshold_config\": 50,\n", | ||
" \"positive_threshold_config\": 85,\n", | ||
" \"include_label\": True,\n", | ||
" }\n", | ||
"\n", | ||
"label_mapping = {\"ssn\": 1, \"name\": 2, \"address\": 3}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "9098329e", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# pre processor \n", | ||
"preprocessor = dp.labelers.data_processing.DirectPassPreprocessor()\n", | ||
"\n", | ||
"# model\n", | ||
"from dataprofiler.labelers.column_name_model import ColumnNameModel\n", | ||
"model = ColumnNameModel(\n", | ||
" parameters=parameters,\n", | ||
" label_mapping=label_mapping,\n", | ||
")\n", | ||
"\n", | ||
"\n", | ||
"# post processor\n", | ||
"postprocessor = dp.labelers.data_processing.ColumnNameModelPostprocessor()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "113d6655-4bca-4d8e-9e6f-b972e29d5684", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"data_labeler = dp.DataLabeler.load_with_components(\n", | ||
" preprocessor=preprocessor,\n", | ||
" model=model,\n", | ||
" postprocessor=postprocessor,\n", | ||
")\n", | ||
"data_labeler.model.help()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "0b405887-2b92-44ca-b8d7-29c384f6dd9c", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"pprint(data_labeler.label_mapping)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "11916a48-098c-4056-ac6c-b9542d85fa86", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"pprint(data_labeler.model._parameters)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "da0e97ee-8d6d-4631-9b55-78ed904d5f41", | ||
"metadata": {}, | ||
"source": [ | ||
"### Predicting with the ColumnName labeler\n", | ||
"\n", | ||
"In the prediction below, the data will be passed into to stages in the background\n", | ||
"- 1) `compare_negative`: The idea behind the `compare_negative` is to first filter out any possibility of flagging a false positive in the model prediction. In this step, the confidence value is checked and if the similarity is too close to being a false positive, that particular string in the `data` is removed and not returned to the `compare_positive`.\n", | ||
"- 2) `compare_positive`: Finally the `data` is passed to the `compare_positive` step and checked for similarity with the the `true_positive_dict` values. Again, during this stage the `positive_threshold_config` is used to filter the results to only those `data` values that are greater than or equal to the `positive_threshold_config` provided by the user." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "fe519e65-36a7-4f42-8314-5369de8635c7", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# evaluate a prediction using the default parameters\n", | ||
"data_labeler.predict(data=[\"ssn\", \"name\", \"address\"])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "b41d834d-e47b-45a6-8970-d2d2033e2ade", | ||
"metadata": {}, | ||
"source": [ | ||
"## Replacing the parameters in the existing labeler\n", | ||
"\n", | ||
"We can achieve this by:\n", | ||
"1. Setting the label mapping to the new labels\n", | ||
"2. Setting the model parameters which include: `true_positive_dict`, `false_positive_dict`, `negative_threshold_config`, `positive_threshold_config`, and `include_label`\n", | ||
"\n", | ||
"where `true_positive_dict` and `false_positive_dict` are `lists` of `dicts`, `negative_threshold_config` and `positive_threshold_config` are integer values between `0` and `100`, and `include_label` is a `boolean` value that determines if the output should include the prediction labels or only the confidence values." | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "c6bb010a-406f-4fd8-abd0-3355a5ad0ded", | ||
"metadata": {}, | ||
"source": [ | ||
"Below, we created 4 labels where `other` is the `default_label`." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "f86584cf-a7af-4bae-bf44-d87caa68833a", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"data_labeler.set_labels({'other': 0, \"funky_one\": 1, \"funky_two\": 2, \"funky_three\": 3})\n", | ||
"data_labeler.model.set_params(\n", | ||
" true_positive_dict= [\n", | ||
" {\"attribute\": \"ssn\", \"label\": \"funky_one\"},\n", | ||
" {\"attribute\": \"suffix\", \"label\": \"funky_two\"},\n", | ||
" {\"attribute\": \"my_home_address\", \"label\": \"funky_three\"},\n", | ||
" ],\n", | ||
" false_positive_dict=[\n", | ||
" {\n", | ||
" \"attribute\": \"contract_number\",\n", | ||
" \"label\": \"ssn\",\n", | ||
" },\n", | ||
" {\n", | ||
" \"attribute\": \"role\",\n", | ||
" \"label\": \"name\",\n", | ||
" },\n", | ||
" {\n", | ||
" \"attribute\": \"not_my_address\",\n", | ||
" \"label\": \"address\",\n", | ||
" },\n", | ||
" ],\n", | ||
" negative_threshold_config=50,\n", | ||
" positive_threshold_config=85,\n", | ||
" include_label=True,\n", | ||
")\n", | ||
"data_labeler.label_mapping" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "1ece1c8c-18a5-46fc-b563-6458e6e71e53", | ||
"metadata": {}, | ||
"source": [ | ||
"### Predicting with the new labels\n", | ||
"\n", | ||
"Here we are testing the `predict()` method with brand new labels for label_mapping. As we can see the new labels flow throught to the output of the data labeler." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "92842e14-2ea6-4879-b58c-c52b607dc94c", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"data_labeler.predict(data=[\"ssn\", \"suffix\"], predict_options=dict(show_confidences=True))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "261b903f-8f4c-403f-839b-ab8813f850e9", | ||
"metadata": {}, | ||
"source": [ | ||
"## Saving the Data Labeler for future use" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "b6ffbaf2-9400-486a-ba83-5fc9ba9334d7", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"if not os.path.isdir('new_column_name_labeler'):\n", | ||
" os.mkdir('new_column_name_labeler')\n", | ||
"data_labeler.save_to_disk('new_column_name_labeler')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "09e40cb6-9d89-41c4-ae28-3dca498f8c68", | ||
"metadata": {}, | ||
"source": [ | ||
"## Loading the saved Data Labeler" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "52615b25-70a6-4ebb-8a32-14aaf1e747d9", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"saved_labeler = dp.DataLabeler.load_from_disk('new_column_name_labeler')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "d1ccc0b3-1dc2-4847-95c2-d6b8769b1590", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# ensuring the parametesr are what we saved.\n", | ||
"print(\"label_mapping:\")\n", | ||
"pprint(saved_labeler.label_mapping)\n", | ||
"print(\"\\nmodel parameters:\")\n", | ||
"pprint(saved_labeler.model._parameters)\n", | ||
"print()\n", | ||
"print(\"postprocessor: \" + saved_labeler.postprocessor.__class__.__name__)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "c827f2ae-4af6-4f3f-9651-9ee9ebea9fa0", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# predicting with the loaded labeler.\n", | ||
"saved_labeler.predict([\"ssn\", \"name\", \"address\"])" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.12" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Oops, something went wrong.