In [0]:
%sql

-- One-time environment setup for this notebook.
-- Every statement is idempotent (IF NOT EXISTS) so the cell survives "Run All" on re-runs.
USE CATALOG workspace;

CREATE SCHEMA IF NOT EXISTS workspace.vcc;

-- Fully qualified on purpose: an unqualified volume name is created in the session's
-- current schema, but the Python cells read from /Volumes/workspace/vcc/upload_location/.
CREATE VOLUME IF NOT EXISTS workspace.vcc.upload_location;

In [0]:
from pyspark.sql.functions import col, current_timestamp
import time
from datetime import datetime

# --- Monitor configuration ---------------------------------------------------
# Volume directory that is watched for uploaded CSV files.
volume_path = "/Volumes/workspace/vcc/upload_location/"
total_duration = 120  # length of the monitoring window, in seconds
polling_interval = 5  # pause between two directory listings, in seconds

# --- Session state -----------------------------------------------------------
processed_files = set()   # paths that were processed successfully
locations = []            # filled from processed_files once monitoring ends
start_time = time.time()  # reference point for the total_duration deadline

# Startup banner; the last entry ends with a newline to leave a blank line after it.
for banner_line in (
    f"Starting file monitor at {datetime.now().strftime('%H:%M:%S')}",
    f"Will run for {total_duration} seconds",
    f"Upload files to: {volume_path}\n",
):
    print(banner_line)

def process_single_file(file_path):
    """Run one CSV file through an Auto Loader stream and preview each micro-batch.

    The stream is started with ``trigger(availableNow=True)`` and this function
    blocks on ``awaitTermination()`` until the stream has stopped.

    Args:
        file_path: Path of the CSV file to load.

    Returns:
        True if the stream ran to completion, False if any exception was raised.
        The error is printed rather than re-raised so a caller that polls for
        files can keep going.
    """
    import hashlib  # local import so this cell does not depend on another cell's imports

    try:
        print(f"\nProcessing file: {file_path}")

        # Stable per-file key. The built-in hash() is salted per Python process for
        # strings, so it would yield a different checkpoint directory after every
        # restart and the stream could never resume from its previous state.
        file_key = hashlib.sha256(file_path.encode("utf-8")).hexdigest()[:16]

        # volume_path ends with "/"; strip it so the joined paths contain no "//".
        state_root = volume_path.rstrip("/")

        # Auto Loader persists the inferred schema under schemaLocation. The uploaded
        # CSVs have unrelated layouts, so each file gets its own schema directory
        # instead of inheriting the schema stored for whichever file ran first.
        schema_location = f"{state_root}/_schemas/{file_key}"
        checkpoint_location = f"{state_root}/_checkpoints/{file_key}"

        streaming_df = (spark.readStream
            .format("cloudFiles")
            .option("cloudFiles.format", "csv")
            .option("cloudFiles.schemaLocation", schema_location)
            .option("cloudFiles.schemaEvolutionMode", "rescue")
            .option("header", "true")
            .load(file_path)  # Single file path
            .withColumn("file_path", col("_metadata.file_path"))
            .withColumn("processed_at", current_timestamp()))

        def process_batch(batch_df, batch_id):
            """Print a five-row preview of the micro-batch (batch_id is unused)."""
            print(f"📢 Processing batch from {file_path}")
            batch_df.show(5, truncate=False)

        stream = (streaming_df.writeStream
            .foreachBatch(process_batch)
            .option("checkpointLocation", checkpoint_location)
            .trigger(availableNow=True)
            .start())

        stream.awaitTermination()
        return True

    except Exception as e:
        # Deliberate best-effort: report the failure and signal it via the return value.
        print(f"Error processing {file_path}: {str(e)}")
        return False

# Poll the volume until the monitoring window (total_duration) has elapsed.
while time.time() - start_time < total_duration:
    current_time = datetime.now().strftime('%H:%M:%S')
    print(f"\n=== Checking for files at {current_time} ===")

    try:
        # Candidates: CSV files in the volume that are not yet in processed_files.
        new_files = []
        for entry in dbutils.fs.ls(volume_path):
            is_csv = entry.path.lower().endswith('.csv')
            if is_csv and entry.path not in processed_files:
                new_files.append(entry.path)

        if new_files:
            for file_path in new_files:
                if not process_single_file(file_path):
                    # Not recorded as processed, so it is picked up again on the next poll.
                    continue
                processed_files.add(file_path)
                print(f"✅ Successfully processed {file_path}")
        else:
            print("No new files found")

    except Exception as e:
        print(f"Error listing files: {str(e)}")

    time.sleep(polling_interval)

# Final report; every processed path is also copied into `locations`.
print("\n" + "="*50)
print("Monitoring completed. Summary:")
print(f"Total files processed: {len(processed_files)}")
if not processed_files:
    print("No files were processed during this session")
else:
    print("Processed files:")
    for file in processed_files:
        print(f"- {file}")
        locations.append(file)

Starting file monitor at 16:09:08
Will run for 120 seconds
Upload files to: /Volumes/workspace/vcc/upload_location/


=== Checking for files at 16:09:08 ===

Processing file: dbfs:/Volumes/workspace/vcc/upload_location/covid.csv
✅ Successfully processed dbfs:/Volumes/workspace/vcc/upload_location/covid.csv

Processing file: dbfs:/Volumes/workspace/vcc/upload_location/stock.csv
✅ Successfully processed dbfs:/Volumes/workspace/vcc/upload_location/stock.csv

=== Checking for files at 16:09:59 ===

Processing file: dbfs:/Volumes/workspace/vcc/upload_location/products.csv
✅ Successfully processed dbfs:/Volumes/workspace/vcc/upload_location/products.csv

=== Checking for files at 16:10:12 ===
No new files found

=== Checking for files at 16:10:18 ===
No new files found

=== Checking for files at 16:10:23 ===
No new files found

=== Checking for files at 16:10:28 ===

Processing file: dbfs:/Volumes/workspace/vcc/upload_location/weather.csv
✅ Successfully processed dbfs:/Volumes/workspace/vcc/

In [0]:
print(locations,"\n\n")

# Characters that Delta Lake rejects in column names: space , ; { } ( ) newline tab =
INVALID_COLUMN_CHARS = " ,;{}()\n\t="
column_char_fixups = str.maketrans({char: "_" for char in INVALID_COLUMN_CHARS})

# Load every monitored CSV and persist it as a managed table named after the file.
for file in locations:
    df = spark.read.csv(file, header=True, inferSchema=True)

    # Replace each invalid character with "_" in one pass.
    # The loop variable is deliberately NOT called `col`: that name is the function
    # imported from pyspark.sql.functions and used by process_single_file, so
    # rebinding it to a string here would break that function on its next call.
    sanitized_columns = [column_name.translate(column_char_fixups) for column_name in df.columns]
    df = df.toDF(*sanitized_columns)

    # "<dir>/stock.csv" -> "workspace.vcc.stock_data"
    table_name = f"workspace.vcc.{file.split('/')[-1].split('.')[0]}_data"
    print(f"saving table {table_name}")
    df.write.mode("overwrite").saveAsTable(table_name)

['dbfs:/Volumes/workspace/vcc/upload_location/stock.csv', 'dbfs:/Volumes/workspace/vcc/upload_location/products.csv', 'dbfs:/Volumes/workspace/vcc/upload_location/covid.csv', 'dbfs:/Volumes/workspace/vcc/upload_location/weather.csv'] 


saving table workspace.vcc.stock_data
saving table workspace.vcc.products_data
saving table workspace.vcc.covid_data
saving table workspace.vcc.weather_data


In [0]:
# from openai import OpenAI
# import pandas as pd
# from pyspark.sql.functions import lit

# # Initialize the OpenAI client. Never hardcode the API key in the notebook:
# # the key that used to be on this line has been exposed and must be revoked/rotated.
# client = OpenAI(api_key=dbutils.secrets.get(scope="<secret-scope>", key="<openai-api-key-name>"))

# def get_data_sample(file_path, sample_size=10):
#     """Read first n rows from a CSV file"""
#     try:
#         df = spark.read.format("csv").option("header", "true").load(file_path).limit(sample_size)
#         return df.toPandas()
#     except Exception as e:
#         print(f"Error reading {file_path}: {str(e)}")
#         return None

# def generate_analysis_prompt(data_sample, filename):
#     """Create a detailed prompt for analysis suggestions"""
#     return f"""
#     As a data analyst, please examine this dataset sample from {filename} and provide:
    
#     1. Three specific business questions this data could answer
#     2. Recommended statistical analyses
#     3. Suggested visualizations
#     4. Potential data quality issues to check
    
#     Sample data (first {len(data_sample)} rows):
#     {data_sample.to_string()}
    
#     Please format your response with clear sections for each recommendation type.
#     """

# def get_ai_analysis(prompt):
#     """Get analysis suggestions from OpenAI"""
#     try:
#         response = client.chat.completions.create(
#             model="gpt-3.5-turbo",  # Using widely available model
#             messages=[
#                 {"role": "system", "content": "You are a data analysis expert providing actionable insights."},
#                 {"role": "user", "content": prompt}
#             ],
#             temperature=0.7
#         )
#         return response.choices[0].message.content
#     except Exception as e:
#         print(f"AI analysis failed: {str(e)}")
#         return None

# # Process each file
# analysis_results = []
# for file_path in locations:
#     print(f"\nProcessing {file_path.split('/')[-1]}...")
    
#     # Get data sample
#     sample_data = get_data_sample(file_path)
#     if sample_data is None:
#         continue
        
#     display(sample_data)
    
#     # Generate and display analysis
#     prompt = generate_analysis_prompt(sample_data, file_path.split('/')[-1])
#     analysis = get_ai_analysis(prompt)
    
#     if analysis:
#         displayHTML(f"""
#         <div style='border:1px solid #eee; padding:10px; margin:10px 0;'>
#             <h3>Analysis for {file_path.split('/')[-1]}</h3>
#             <pre style='white-space:pre-wrap;'>{analysis}</pre>
#         </div>
#         """)
#         analysis_results.append((file_path, analysis))
#     else:
#         print("Failed to get AI analysis for this file")

# # Save results to Delta table if needed
# if analysis_results:
#     (spark.createDataFrame(analysis_results, ["file_path", "analysis"])
#      .write.mode("overwrite").saveAsTable("file_analyses"))
#     print("\nResults saved to 'file_analyses' table")
# else:
#     print("\nNo analyses were generated")


Processing satisfaction.csv...


Satisfaction
Very Satisfied
Satisfied
Neutral
Unsatisfied
Very Unsatisfied


AI analysis failed: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
Failed to get AI analysis for this file

Processing probability.csv...


Probability
Definitely
Probably
Not Sure
Probably Not
Definitely Not


AI analysis failed: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
Failed to get AI analysis for this file

No analyses were generated


In [0]:
# from pyspark.sql.functions import col
# import pandas as pd
# import json
# import re

# # List of CSV file locations
# locations = [
#     "/Volumes/workspace/vcc/upload_location/satisfaction.csv",
#     "/Volumes/workspace/vcc/upload_location/probability.csv"
# ]

# def get_sample_data(file_path, sample_size=6):
#     """Read first n rows from a CSV file"""
#     try:
#         df = spark.read.format("csv").option("header", "true").load(file_path).limit(sample_size)
#         return df.toPandas().to_string()
#     except Exception as e:
#         print(f"Error reading {file_path}: {str(e)}")
#         return None

# def analyze_with_genie(data_sample, file_name):
#     """Use Databricks Assistant to analyze the data"""
#     try:
#         # Prepare the prompt
#         prompt = f"""
#         Analyze this dataset sample from {file_name} and provide:
#         1. Three specific business questions this data could answer
#         2. Recommended statistical analyses
#         3. Suggested visualizations
#         4. Potential data quality issues to check
        
#         Sample data (first 6 rows with headers):
#         {data_sample}
        
#         Please format your response with clear sections for each recommendation type.
#         """
        
#         # Use Databricks Assistant (Genie)
#         response = dbutils.notebook.run(
#             path="/_assistant/query",
#             timeout_seconds=60,
#             arguments={
#                 "query": prompt,
#                 "session_id": "csv_analysis_session"
#             }
#         )
        
#         # Clean up the response
#         cleaned_response = re.sub(r'^Assistant: ', '', response.strip())
#         return cleaned_response
        
#     except Exception as e:
#         print(f"Genie analysis failed: {str(e)}")
#         return None

# def process_file(file_path):
#     """Process a single CSV file with Genie"""
#     try:
#         file_name = file_path.split('/')[-1]
#         print(f"\nProcessing {file_name}...")
        
#         # Get sample data
#         data_sample = get_sample_data(file_path)
#         if data_sample is None:
#             return None
            
#         print("\nSample data:")
#         display(spark.read.format("csv").option("header", "true").load(file_path).limit(6))
        
#         # Get AI analysis
#         print("\nRequesting analysis from Databricks Assistant...")
#         analysis = analyze_with_genie(data_sample, file_name)
        
#         if analysis:
#             displayHTML(f"""
#             <div style='border:1px solid #eee; padding:10px; margin:10px 0;'>
#                 <h3>AI Analysis for {file_name}</h3>
#                 <pre style='white-space:pre-wrap;'>{analysis}</pre>
#             </div>
#             """)
#             return (file_path, analysis)
#         return None
        
#     except Exception as e:
#         print(f"Error processing {file_path}: {str(e)}")
#         return None

# # Process each file
# analysis_results = []
# for file_path in locations:
#     result = process_file(file_path)
#     if result:
#         analysis_results.append(result)

# # Save results to Delta table if needed
# if analysis_results:
#     (spark.createDataFrame(analysis_results, ["file_path", "analysis"])
#      .write.mode("overwrite").saveAsTable("genie_analyses"))
#     print("\nResults saved to 'genie_analyses' table")
# else:
#     print("\nNo analyses were generated")


Processing satisfaction.csv...

Sample data:


Satisfaction
Very Satisfied
Satisfied
Neutral
Unsatisfied
Very Unsatisfied



Requesting analysis from Databricks Assistant...
Genie analysis failed: An error occurred while calling o1342.run.
: com.databricks.WorkflowException: com.databricks.NotebookExecutionException: FAILED: Unable to access the notebook "/_assistant/query". Either it does not exist, or the identity used to run this job, officialworkmails@gmail.com, lacks the required permissions.
	at com.databricks.workflow.WorkflowDriver.run(WorkflowDriver.scala:98)
	at com.databricks.dbutils_v1.impl.NotebookUtilsImpl.run(NotebookUtilsImpl.scala:130)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invok

Probability
Definitely
Probably
Not Sure
Probably Not
Definitely Not



Requesting analysis from Databricks Assistant...
Genie analysis failed: An error occurred while calling o1349.run.
: com.databricks.WorkflowException: com.databricks.NotebookExecutionException: FAILED: Unable to access the notebook "/_assistant/query". Either it does not exist, or the identity used to run this job, officialworkmails@gmail.com, lacks the required permissions.
	at com.databricks.workflow.WorkflowDriver.run(WorkflowDriver.scala:98)
	at com.databricks.dbutils_v1.impl.NotebookUtilsImpl.run(NotebookUtilsImpl.scala:130)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invok

In [0]:
# for file in locations:
#     print(file)
#     df = spark.read.csv(file, header=True, inferSchema=True)

#     display(df)

#     display(spark.sql(f"""SELECT ai_gen('can you please give me your thoughts on {df}')"""))

/Volumes/workspace/vcc/upload_location/satisfaction.csv


Satisfaction
Very Satisfied
Satisfied
Neutral
Unsatisfied
Very Unsatisfied


ai_gen('can you please give me your thoughts on DataFrame[Satisfaction: string]')
"`DataFrame[Satisfaction: string]` appears to be a type annotation or a declaration for a DataFrame, specifically indicating that it contains a column named ""Satisfaction"" with string values. Here are some thoughts on this: 1. **Data Type**: The fact that ""Satisfaction"" is annotated as a string suggests that the values in this column are textual or categorical. This could imply that the satisfaction levels are being recorded as text, such as ""Very Satisfied"", ""Satisfied"", ""Neutral"", ""Dissatisfied"", or ""Very Dissatisfied"". 2. **Analysis Implications**: When working with a column of string values, certain types of analysis are more appropriate than others. For example, you might perform frequency counts of different satisfaction levels, or you could use techniques like sentiment analysis if the strings contain more free-form text describing satisfaction. 3. **Potential for Categorical Treatment**: Even though the values are strings, if they represent a finite set of distinct categories (like different levels of satisfaction), it might be beneficial to treat this column as categorical data. Many data analysis libraries (like pandas in Python) support categorical data types, which can be more memory-efficient and enable certain types of analysis that are specific to categorical data. 4. **Encoding for Modeling**: If the goal is to use this DataFrame in a machine learning model, the string values in the ""Satisfaction"" column would likely need to be encoded into numerical values. Common strategies include one-hot encoding, label encoding, or using an ordinal encoding if there's a natural order to the satisfaction levels. 5. **Data Cleaning**: Working with string data often requires careful data cleaning. This might involve handling missing values, standardizing the case of the strings (e.g., converting all to lowercase), removing leading or trailing whitespace, and possibly dealing with typos or inconsistencies in how satisfaction levels are recorded. 6. 
**Visualization**: For exploratory data analysis, visualizations like bar charts or histograms can be very effective for understanding the distribution of satisfaction levels in the dataset. In summary, `DataFrame[Satisfaction: string]` suggests a dataset focused on capturing textual or categorical satisfaction levels. The analysis and processing of this data would need to account for its string nature, potentially involving encoding, cleaning, and choosing appropriate analytical and visualization techniques."


/Volumes/workspace/vcc/upload_location/probability.csv


Probability
Definitely
Probably
Not Sure
Probably Not
Definitely Not


ai_gen('can you please give me your thoughts on DataFrame[Probability: string]')
"`DataFrame[Probability: string]` appears to be a type annotation or a declaration in a programming language, likely TypeScript, given the syntax. This declaration seems to be defining a type for a DataFrame, specifically one that contains a column named ""Probability"" with string values. Here are a few thoughts on this declaration: 1. **Data Type Mismatch**: The term ""Probability"" typically implies a numerical value between 0 and 1, representing the likelihood of an event occurring. However, in this declaration, the ""Probability"" column is defined as a string. This could be a design choice, but it might also indicate a potential issue if the intention is to perform numerical computations on these values. Strings would need to be parsed or converted to numbers (e.g., floats) for such operations. 2. **Type Safety and Clarity**: The use of explicit type definitions like `DataFrame[Probability: string]` can enhance code readability and maintainability. It clearly communicates the structure and data types within the DataFrame to other developers (and to the compiler or interpreter, depending on the language). This can help catch type-related errors early in the development process. 3. **Potential for Error**: If the ""Probability"" values are indeed meant to be numerical (as the name suggests), storing them as strings could lead to errors. For example, sorting, filtering, or performing statistical analyses on these values as strings could yield unexpected results. It might be more appropriate to define the type as `DataFrame[Probability: number]` or a more specific type if the language supports it (e.g., a type that enforces a value between 0 and 1). 4. **Serialization and Deserialization**: In scenarios where data is being serialized (e.g., to JSON) and then deserialized, ensuring that the types are correctly interpreted is crucial. 
A string representation of a probability (e.g., ""0.5"") would need to be converted back to a numerical type for most mathematical operations. 5. **Language and Framework Considerations**: The implications and best practices around `DataFrame[Probability: string]` can vary depending on the programming language and frameworks being used. For instance, in Python with pandas, you might explicitly define the data type of a column, but the syntax and type system would differ from what's shown here. In summary, while `DataFrame[Probability: string]` is a clear declaration of a DataFrame's structure, careful consideration should be given to the choice of data type for the ""Probability"" column to ensure it aligns with the intended use and manipulations of the data."


/Volumes/workspace/vcc/upload_location/weather.csv


Data.Precipitation,Date.Full,Date.Month,Date.Week of,Date.Year,Station.City,Station.Code,Station.Location,Station.State,Data.Temperature.Avg Temp,Data.Temperature.Max Temp,Data.Temperature.Min Temp,Data.Wind.Direction,Data.Wind.Speed
0.0,2016-01-03,1,3,2016,Birmingham,BHM,"Birmingham, AL",Alabama,39,46,32,33,4.33
0.0,2016-01-03,1,3,2016,Huntsville,HSV,"Huntsville, AL",Alabama,39,47,31,32,3.86
0.16,2016-01-03,1,3,2016,Mobile,MOB,"Mobile, AL",Alabama,46,51,41,35,9.73
0.0,2016-01-03,1,3,2016,Montgomery,MGM,"Montgomery, AL",Alabama,45,52,38,32,6.86
0.01,2016-01-03,1,3,2016,Anchorage,ANC,"Anchorage, AK",Alaska,34,38,29,19,7.8
0.09,2016-01-03,1,3,2016,Annette,ANN,"Annette, AK",Alaska,38,44,31,9,8.7
0.05,2016-01-03,1,3,2016,Bethel,BET,"Bethel, AK",Alaska,30,36,24,9,16.46
0.15,2016-01-03,1,3,2016,Bettles,BTT,"Bettles, AK",Alaska,22,32,9,2,3.1
0.6,2016-01-03,1,3,2016,Cold Bay,CDB,"Cold Bay, AK",Alaska,34,36,31,20,9.1
2.15,2016-01-03,1,3,2016,Cordova,CDV,"Cordova, AK",Alaska,38,43,33,9,9.76


"ai_gen('can you please give me your thoughts on DataFrame[Data.Precipitation: double, Date.Full: date, Date.Month: int, Date.Week of: int, Date.Year: int, Station.City: string, Station.Code: string, Station.Location: string, Station.State: string, Data.Temperature.Avg Temp: int, Data.Temperature.Max Temp: int, Data.Temperature.Min Temp: int, Data.Wind.Direction: int, Data.Wind.Speed: double]')"
"The DataFrame you've provided appears to be a collection of weather data, including precipitation, temperature, wind direction, and wind speed, along with information about the location (city, code, location, state) and time (date, month, week, year) of the data collection. Here are some observations and potential insights based on the structure of this DataFrame: ### 1. **Data Types and Structure** - **Precipitation** is recorded as a double, which is appropriate for capturing decimal values of precipitation. - **Date** is broken down into full date, month, week, and year, which is useful for time-series analysis at different granularities. - **Station** information includes city, code, location, and state, providing a clear identification of where the data was collected. - **Temperature** is recorded in integers for average, maximum, and minimum temperatures. Using integers might limit the precision of temperature recordings, as temperatures can vary in decimal points. - **Wind** data includes direction (as an integer) and speed (as a double). The direction being an integer might imply it's recorded in degrees or another quantized measure. ### 2. **Potential Analysis and Uses** - **Time Series Analysis**: With the date broken down into various components, this data is well-suited for time-series analysis, including trends over years, seasonal variations, and weekly patterns. - **Location-Based Analysis**: Comparing weather patterns across different cities or states could provide insights into regional climate differences. - **Correlation Analysis**: Investigating correlations between different weather parameters (e.g., temperature vs. precipitation, wind speed vs. temperature) could reveal interesting patterns or relationships. - **Predictive Modeling**: This data could be used to train models that predict future weather conditions based on historical trends and patterns. ### 3. 
**Potential Issues or Considerations** - **Data Quality**: Ensuring the accuracy and consistency of the data across different stations and over time is crucial. Missing values, outliers, or inconsistent recording practices could affect analysis outcomes. - **Temperature Precision**: The use of integers for temperature might not capture the full variability of temperature changes, potentially limiting the precision of analyses or models based on this data. - **Wind Direction**: If wind direction is recorded as an integer, it might be in degrees (0-360), but clarification on the measurement scale is necessary for proper interpretation. ### 4. **Future Enhancements** - **Additional Parameters**: Incorporating other weather parameters, such as humidity, atmospheric pressure, or sunshine hours, could enrich the dataset and enable more comprehensive analyses. - **Higher Temporal Resolution**: If possible, collecting data at a finer temporal resolution (e.g., hourly) could reveal more detailed patterns and fluctuations in weather conditions. Overall, the DataFrame seems well-structured for exploring and analyzing weather patterns across different locations and times. However, careful consideration of data quality, potential limitations in measurement precision, and the addition of other relevant parameters could further enhance the utility and insights derived from this dataset."


In [0]:
from pyspark.sql.functions import lit
import pandas as pd

def analyze_with_genai(df, file_name):
    """Ask the SQL ``ai_gen`` function for an HTML analysis of a dataset sample.

    Args:
        df: DataFrame to analyse; only its first 5 rows are sent to the model.
        file_name: Name of the source file, mentioned in the prompt.

    Returns:
        The value of the ``analysis`` column of the first result row, i.e. the
        text produced by ``ai_gen`` for the prompt.
    """
    # Render the first 5 rows as plain text for inclusion in the prompt.
    sample_data = df.limit(5).toPandas().to_string()

    # Craft a detailed prompt, explicitly asking for HTML
    prompt = f"""
    Analyze this dataset from {file_name} and provide the analysis formatted as HTML.
    Each section should be enclosed in appropriate HTML tags (e.g., <h3> for headings, <p> for paragraphs, <ul> for lists).
    Ensure proper line breaks and structure using HTML tags like <br> or separating paragraphs.

    Here are the specific sections I need:
    
    <h3>1. Key observations about the data structure and content</h3>
    <p>Describe the columns, data types, and any initial insights.</p>
    
    <h3>2. Business questions this data could help answer</h3>
    <p>List relevant business questions that can be explored with this dataset along with chart types that can be used like bar, pie, trend line, heatmap etc other basic visualizations.</p>
    
    <h3>3. Recommended analytical approaches</h3>
    <p>Suggest methods or models that could be applied for analysis.</p>
    
    <h3>4. Potential data quality issues to check</h3>
    <p>Identify common data quality problems to look out for (e.g., missing values, outliers, inconsistencies).</p>
    
    <h3>5. Suggested visualizations</h3>
    <p>Propose charts or graphs that would be useful for understanding the data.</p>
    
    Sample data (first 5 rows):
    <pre><code>{sample_data}</code></pre>
    
    Please provide ONLY the HTML content for the analysis. Do not include any introductory or concluding remarks outside the HTML structure.
    """

    # The prompt embeds raw file contents, so it is bound as a named parameter rather
    # than spliced into the SQL text: quotes or backslashes in the data can then
    # neither break the string literal nor change the statement.
    result = spark.sql(
        "SELECT ai_gen(:prompt) AS analysis",
        args={"prompt": prompt},
    )
    return result.collect()[0]["analysis"]

# Process each file
# Analyse every path collected in `locations`, one file at a time.
for file_path in locations:
    file_name = file_path.split('/')[-1]     # e.g. "stock.csv"
    dataset_name = file_name.split('.')[0]   # e.g. "stock"
    print(f"\nProcessing file: {file_path}")

    # Load the CSV with a header row and inferred column types.
    df = spark.read.csv(file_path, header=True, inferSchema=True)

    # Preview the first rows of the raw data.
    print("\nSample data:")
    display(df.limit(5))

    # Request the analysis and render the returned text as HTML.
    print(f"\nGenAI Analysis for {dataset_name} dataset:")
    analysis = analyze_with_genai(df, file_name)
    displayHTML(analysis)

    # Optional: append the result to a table
    # spark.createDataFrame([(file_path, analysis)], ["file_path", "analysis"]) \
    #      .write.mode("append").saveAsTable("file_analyses")

print("\nAnalysis complete for all files!")


Processing file: dbfs:/Volumes/workspace/vcc/upload_location/stock.csv

Sample data:


year,industry_code_ANZSIC,industry_name_ANZSIC,rme_size_grp,variable,value,unit,_c7,_c8,_c9,_c10,_c11,_c12,_c13,_c14,_c15
2011,A,"Agriculture, Forestry and Fishing",a_0,Activity unit,46134,COUNT,,,,,,,,,
2011,A,"Agriculture, Forestry and Fishing",a_0,Rolling mean employees,0,COUNT,,,,,,,,,
2011,A,"Agriculture, Forestry and Fishing",a_0,Salaries and wages paid,279,DOLLARS(millions),,,,,,,,,
2011,A,"Agriculture, Forestry and Fishing",a_0,"Sales, government funding, grants and subsidies",8187,DOLLARS(millions),,,,,,,,,
2011,A,"Agriculture, Forestry and Fishing",a_0,Total income,8866,DOLLARS(millions),,,,,,,,,



GenAI Analysis for stock dataset:



Processing file: dbfs:/Volumes/workspace/vcc/upload_location/products.csv

Sample data:


Index,Name,Description,Brand,Category,Price,Currency,Stock,EAN,Color,Size,Availability,Internal ID
1,Compact Printer Air Advanced Digital,Situation organization these memory much off.,"Garner, Boyle and Flynn",Books & Stationery,265,USD,774,2091465262179,ForestGreen,Large,pre_order,56
2,Tablet,Discussion loss politics free one thousand.,Mueller Inc,Shoes & Footwear,502,USD,81,5286196620740,Black,8x10 in,in_stock,29
3,Smart Blender Cooker,No situation per.,"Lawson, Keller and Winters",Kitchen Appliances,227,USD,726,1282898648918,SlateGray,XS,in_stock,70
4,Advanced Router Rechargeable,For force gas energy six laugh.,Gallagher and Sons,Kitchen Appliances,121,USD,896,3879177514583,PaleGreen,L,discontinued,31
5,Portable Mouse Monitor Phone,Feeling back religious however author room scientist.,Irwin LLC,Kids' Clothing,1,USD,925,9055773261265,SeaShell,100x200 mm,discontinued,10



GenAI Analysis for products dataset:



Processing file: dbfs:/Volumes/workspace/vcc/upload_location/covid.csv

Sample data:


Entity,Day,ConfirmedCasesPerMillion
Afghanistan,2020-01-09,0.0
Afghanistan,2020-01-10,0.0
Afghanistan,2020-01-11,0.0
Afghanistan,2020-01-12,0.0
Afghanistan,2020-01-13,0.0



GenAI Analysis for covid dataset:



Processing file: dbfs:/Volumes/workspace/vcc/upload_location/weather.csv

Sample data:


Data.Precipitation,Date.Full,Date.Month,Date.Week of,Date.Year,Station.City,Station.Code,Station.Location,Station.State,Data.Temperature.Avg Temp,Data.Temperature.Max Temp,Data.Temperature.Min Temp,Data.Wind.Direction,Data.Wind.Speed
0.0,2016-01-03,1,3,2016,Birmingham,BHM,"Birmingham, AL",Alabama,39,46,32,33,4.33
0.0,2016-01-03,1,3,2016,Huntsville,HSV,"Huntsville, AL",Alabama,39,47,31,32,3.86
0.16,2016-01-03,1,3,2016,Mobile,MOB,"Mobile, AL",Alabama,46,51,41,35,9.73
0.0,2016-01-03,1,3,2016,Montgomery,MGM,"Montgomery, AL",Alabama,45,52,38,32,6.86
0.01,2016-01-03,1,3,2016,Anchorage,ANC,"Anchorage, AK",Alaska,34,38,29,19,7.8



GenAI Analysis for weather dataset:



Analysis complete for all files!



Stock Suggestion:
    
Time series analysis of revenue to examine trends in industry performance over the years.
Provide the total revenue for each industry by year, focusing specifically on total income. The data is organized by year and industry name, allowing for a clear view of revenue trends over time.

-----------------------------------------------------------------------------------------------------

Covid Suggestion:
    
How have COVID-19 cases evolved over time for different countries? Show the result with a line chart.
Provide a detailed view of confirmed COVID-19 cases per million people, organized by entity and date. 

-----------------------------------------------------------------------------------------------------


Weather Suggestion:
    
Create a pie chart that summarizes the weather data using columns such as Data.Precipitation and Data.Temperature.Avg_Temp, focusing on the distribution across different states.

-----------------------------------------------------------------------------------------------------


Product Suggestion:
    
Which product categories have the highest average price, potentially using a bar chart to visualize the results?