## Using AI for Anomaly Detection in Data Quality
**Description**: Implement an AI-based approach to detect anomalies in data quality.

**Steps**:
1. Use an Anomaly Detection Algorithm:
    - Use sklearn's Isolation Forest for anomaly detection.

**Example data:**

data = np.array([[25, 50000], [30, 60000], [35, 75000], [40, None], [45, 100000]])

2. Integrate with Great Expectations:
    - Generate alerts if anomalies are detected:

In [None]:
# Write your code from here

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from great_expectations.data_context import FileDataContext
from great_expectations.validator.validator import Validator

# 1. Prepare the data
# Each row is [feature1, feature2]; None marks a missing feature2 value.
data = np.array([[25, 50000],
                 [30, 60000],
                 [35, 75000],
                 [40, None],
                 [45, 100000],
                 [28, 55000],
                 [60, 15000],  # Potential anomaly
                 [32, 70000],
                 [50, 120000], # Potential anomaly
                 [38, None]])

# The None entries force `data` to an object array, so cast the frame to
# float: missing values become NaN and both columns are truly numeric.
df = pd.DataFrame(data, columns=['feature1', 'feature2']).astype(float)

# Handle missing values for Isolation Forest: NaN input is rejected by the
# estimator (in most scikit-learn releases), so impute each column's median.
# `df` itself keeps its NaNs so later null checks still see the real gaps.
df_filled = df.fillna(df.median())

# 2. Train the Anomaly Detection Model (Isolation Forest)
# fit() returns the estimator itself, so construction and training chain.
model = IsolationForest(random_state=42, contamination='auto').fit(df_filled)

# Label every row (-1 = anomaly, 1 = inlier) and fetch its decision score.
anomaly_predictions = model.predict(df_filled)
anomaly_scores = model.decision_function(df_filled)

# Attach the score and a boolean anomaly flag to the DataFrame.
df['anomaly_score'] = anomaly_scores
df['is_anomaly'] = np.equal(anomaly_predictions, -1)

print("Data with Anomaly Scores and Predictions:", df, sep="\n")

# 3. Integrate with Great Expectations
# Initialize a file-backed Data Context under the given project directory.
# NOTE(review): the calls below target the fluent datasource API of
# Great Expectations 0.17/0.18 -- confirm against the installed version.
context = FileDataContext.create(
    project_root_dir='great_expectations',  # Replace with your project directory
)

# Register a Pandas Datasource.
# add_or_update_* is used instead of a `name in context.list_datasources()`
# check: list_datasources() returns config dicts, so a membership test on the
# bare name never matches and re-running the cell would try to add it again.
datasource_name = "pandas_anomaly_data"
datasource = context.sources.add_or_update_pandas(name=datasource_name)

# Expose the in-memory DataFrame as an asset and request it as a batch.
data_asset = datasource.add_dataframe_asset(name="anomaly_data")
batch_request = data_asset.build_batch_request(dataframe=df)

# Create a Validator bound to an expectation suite for this data.
suite_name = "anomaly_detection_suite"
context.add_or_update_expectation_suite(expectation_suite_name=suite_name)
batch = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=suite_name,
)

# Define Expectations for Data Quality (including anomaly detection)
batch.expect_column_values_to_not_be_null(column='feature1')

# Great Expectations has no built-in "expect no anomalies" check, so the
# model's verdict is validated through the boolean 'is_anomaly' column.
# There is no `expect_column_values_to_equal` expectation; restricting the
# column to the single-value set {False} expresses the same intent.
# 'mostly' tolerates a small share of flagged rows (here up to 5%).
batch.expect_column_values_to_be_in_set(
    column='is_anomaly',
    value_set=[False],
    mostly=0.95,
)

# Run the validation over every expectation registered above.
results = batch.validate()

print("\nGreat Expectations Validation Results:")
print(results.to_json_dict())

# 4. Generate Alerts if Anomalies are Detected (based on GE validation)
# Report the all-clear first; otherwise list each failed expectation.
if results["success"]:
    print("\nNo data quality anomalies detected based on the defined expectations.")
else:
    print("\nPotential Data Quality Anomalies Detected by AI and Reported by Great Expectations:")
    for expectation_result in results["results"]:
        # Passing expectations need no alert.
        if expectation_result["success"]:
            continue
        print(f"  - Expectation '{expectation_result['expectation_config']['expectation_type']}' failed: {expectation_result['result']}")
    print("\nReview the 'is_anomaly' column in the data for specific instances flagged by the AI model.")

ModuleNotFoundError: No module named 'great_expectations'