## Using AI for Anomalies Detection in Data Quality
**Description**: Implement an AI-based approach to detect anomalies in data quality.

**Steps**:
1. Use an Anomaly Detection Algorithm:
    - Use sklearn's Isolation Forest for anomaly detection.

**Example data:**

data = np.array([[25, 50000], [30, 60000], [35, 75000], [40, None], [45, 100000]])

2. Integrate with Great Expectations:
    - Generate alerts if anomalies are detected:

In [1]:
# Write your code from here
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.impute import KNNImputer
import great_expectations as ge
from great_expectations.dataset import PandasDataset

# Function to validate the data
def validate_data(df):
    # Check if required columns exist
    required_columns = ['Age', 'Salary']
    if not all(col in df.columns for col in required_columns):
        raise ValueError(f"Missing required columns: {set(required_columns) - set(df.columns)}")
    
    # Check for non-numeric values in 'Salary' column
    if not pd.api.types.is_numeric_dtype(df['Salary']):
        raise ValueError("Salary column must be numeric.")
    
    return df

# Function to handle missing data using KNN Imputation
def handle_missing_data(df):
    imputer = KNNImputer(n_neighbors=2)
    df['Salary'] = imputer.fit_transform(df[['Salary']])
    return df

# Function to apply Isolation Forest for anomaly detection
def detect_anomalies(df):
    # Setting contamination=0.2 means that we expect 20% of the data to be anomalies
    model = IsolationForest(contamination=0.2)
    anomalies = model.fit_predict(df[['Age', 'Salary']])
    df['Anomaly'] = anomalies
    return df

# Function to generate alerts based on anomalies
def generate_alerts(df):
    if -1 in df['Anomaly'].values:
        print("Alert: Anomalies detected in the dataset!")
    else:
        print("Data quality is fine. No anomalies detected.")

# Main code execution
def main(data):
    # Convert to DataFrame
    df = pd.DataFrame(data, columns=['Age', 'Salary'])
    
    # Validate and preprocess the data
    df = validate_data(df)
    df = handle_missing_data(df)
    
    # Detect anomalies
    df = detect_anomalies(df)
    
    # Integrate with Great Expectations for validation
    df_ge = PandasDataset(df)
    
    # Create a simple expectation for checking anomalies in the 'Anomaly' column
    expectation_result = df_ge.expect_column_values_to_be_in_set('Anomaly', [1])
    
    # Generate alert based on anomaly detection
    generate_alerts(df)
    
    # Print the resulting DataFrame
    print(df)

# Example data (including a None value that represents a missing entry)
data = np.array([[25, 50000], [30, 60000], [35, 75000], [40, None], [45, 100000]])

# Execute the main function
main(data)

ModuleNotFoundError: No module named 'great_expectations.dataset'