In [5]:
from pyspark.sql import SparkSession
import requests
import json

# Obtain the Spark session for this notebook under the "DataQualityCheck" app name.
spark = (
    SparkSession.builder
    .appName("DataQualityCheck")
    .getOrCreate()
)

In [6]:
from pyspark.sql import SparkSession
import requests
import json

# Data-quality check: build a small sample DataFrame, filter out rows whose
# age is negative, above 120, or missing, and ask a local Ollama model for a
# short assessment of those rows.

# Initialize Spark
spark = SparkSession.builder.appName("DataQualityCheck").getOrCreate()

try:
    # Sample Spark DataFrame
    data = [("Alice", 25), ("Bob", -5), ("Charlie", 130), ("Dana", None)]
    columns = ["name", "age"]
    df = spark.createDataFrame(data, columns)

    # Show records (for logging)
    print("\n=== Raw Spark Data ===")
    df.show()

    # Keep only the suspicious rows so that very little data leaves Spark.
    bad_records = df.filter(
        (df.age < 0) | (df.age > 120) | (df.age.isNull())
    )

    # collect() pulls every matching row into this Python process; that is
    # only reasonable while the filtered set stays small.
    records = [row.asDict() for row in bad_records.collect()]

    if not records:
        print("\n✅ No suspicious records found. Data looks good.")
    else:
        # Prepare a clear prompt for Ollama: instructions first, then one
        # JSON object per line for each suspicious record.
        prompt_header = (
            "Please check the following records for data quality issues, "
            "especially focusing on negative ages, missing values, or outliers "
            "(age < 0 or age > 120). Print names with age Null. Provide a brief assessment:\n"
        )
        prompt_text = prompt_header + "".join(
            json.dumps(row) + "\n" for row in records
        )

        print("\nSending to Ollama:\n", prompt_text)

        # Call local Ollama API
        ollama_url = "http://host.docker.internal:11434/api/generate"
        payload = {
            "model": "llama2",
            "prompt": prompt_text,
            "stream": False
        }
        # (connect, read) timeouts in seconds. With "stream": False the whole
        # completion arrives in one response, so the read timeout is generous,
        # but it is still bounded so a stalled server cannot hang the cell.
        ollama_timeout = (5, 300)

        try:
            response = requests.post(
                ollama_url, json=payload, timeout=ollama_timeout
            )
        except requests.RequestException as exc:
            # Server unreachable, connection dropped, or timeout exceeded.
            print("\n❌ Failed to get response from Ollama:", exc)
        else:
            if not response.ok:
                print("\n❌ Failed to get response from Ollama:", response.text)
            else:
                try:
                    result = response.json()
                except ValueError:
                    # A 2xx status whose body is not valid JSON.
                    print("\n❌ Failed to get response from Ollama:", response.text)
                else:
                    print(
                        "\n=== Ollama Response (Quality Report) ===\n",
                        result.get("response"),
                    )
finally:
    # Always release the Spark session, even if a step above raised.
    spark.stop()


=== Raw Spark Data ===
+-------+----+
|   name| age|
+-------+----+
|  Alice|  25|
|    Bob|  -5|
|Charlie| 130|
|   Dana|null|
+-------+----+


Sending to Ollama:
 Please check the following records for data quality issues, especially focusing on negative ages, missing values, or outliers (age < 0 or age > 120). Print names with age Null. Provide a brief assessment:
{"name": "Bob", "age": -5}
{"name": "Charlie", "age": 130}
{"name": "Dana", "age": null}


=== Ollama Response (Quality Report) ===
 Sure, I'd be happy to check the records for data quality issues. Here are the results of my assessment:

1. {"name": "Bob", "age": -5}
	* Data quality issue: The age value is negative, which can indicate an error in input or processing.
	* Assessment: High risk
2. {"name": "Charlie", "age": 130}
	* Data quality issue: The age value is greater than the maximum possible value (120), which can indicate an error in input or processing.
	* Assessment: High risk
3. {"name": "Dana", "age": null}
	*