In [0]:
pip install langchain-community databricks-sql-connector databricks_langchain openai databricks-sqlalchemy~=1.0

In [0]:
# Restart the Python process; this cell runs right after the package-install cell.
# NOTE(review): presumably needed so the freshly installed libraries become importable - confirm.
dbutils.library.restartPython()

In [0]:
from langchain.agents import create_sql_agent
from langchain.agents.agent_toolkits import SQLDatabaseToolkit
from langchain.sql_database import SQLDatabase
from langchain import OpenAI
from databricks_langchain import ChatDatabricks

In [0]:
# Connectivity check: load one known table through Spark and preview it.
# Replace the fully-qualified name below with your own catalog.schema.table.
try:
    df = spark.read.table(
        "forecast.schema1.calendar"
    )
    print("Successfully read the table!")
    df.show()
except Exception as err:
    # Report the failure instead of aborting the notebook run.
    print(f"Error reading table directly with Spark: {err}")

In [0]:
# Catalog and schema that every table lookup in this cell is scoped to.
catalog, schema = "forecast", "schema1"

from langchain.sql_database import SQLDatabase
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
import numpy as np
from pyspark.sql import SparkSession

# Obtain a Spark session (getOrCreate) under the application name "TimeSeriesAnalysis".
spark = (
    SparkSession.builder
    .appName("TimeSeriesAnalysis")
    .getOrCreate()
)

# 1. Initialize LangChain's SQLDatabase (ONLY if using the agent)
# The connection is scoped to the `catalog` / `schema` configured at the top of this cell.
try:
    db = SQLDatabase.from_databricks(
        catalog=catalog,
        schema=schema,
        engine_args={"pool_pre_ping": True},
    )
except Exception as e:
    print(f"Error initializing LangChain DB: {e}")
    # Re-raise so the cell fails visibly and the rest of it does not run.
    # exit() is avoided because in a notebook it can shut down the kernel
    # rather than just stopping this cell.
    raise

# 2. Snapshot what the LangChain SQLDatabase wrapper can see.
# SQLDatabase exposes get_usable_table_names() / get_table_info(); it has no
# query() or get_table() method, so those calls could only ever raise.
# `tables` maps each table name to the schema description string returned by
# get_table_info for that single table.
try:
    tables = {
        table_name: db.get_table_info([table_name])
        for table_name in db.get_usable_table_names()
    }

    print(f"Tables extracted: {tables.keys()}")
except Exception as e:
    print(f"Error extracting tables: {e}")


def get_available_tables(catalog, schema):
    """List the table names found in ``catalog.schema``.

    Runs ``SHOW TABLES IN <catalog>.<schema>`` through the global ``spark``
    session and drops the entries named 'small_sales' and '_sqldf'.

    Args:
        catalog: Catalog name interpolated into the SHOW TABLES statement.
        schema: Schema name interpolated into the SHOW TABLES statement.

    Returns:
        A list of table-name strings; an empty list if anything fails
        (the error is printed, not raised).
    """
    skipped = ('small_sales', '_sqldf')
    try:
        listing = spark.sql(f"SHOW TABLES IN {catalog}.{schema}").collect()
        table_names = []
        for row in listing:
            if row.tableName not in skipped:
                table_names.append(row.tableName)
        return table_names
    except Exception as err:
        print(f"Error retrieving tables from {catalog}.{schema}: {err}")
        return []

# Load every available table as a Spark DataFrame, then stack them into one
# pandas DataFrame (row-wise concat; columns missing from a table become NaN).
tables = {}
merged_df = None  # stays None when nothing could be loaded or merging fails

try:
    table_names = get_available_tables(catalog=catalog, schema=schema)

    for table_name in table_names:
        try:
            # Read the table using Spark
            df = spark.read.table(f"{catalog}.{schema}.{table_name}")
            tables[table_name] = df
            print(f"Successfully read table: {table_name}")
        except Exception as e:
            # One unreadable table must not prevent loading the others.
            print(f"Error reading table {table_name}: {e}")

    print(f"Tables extracted: {tables.keys()}")

    # Convert everything first and concatenate once: concatenating inside the
    # loop re-copies the accumulated frame on every iteration.
    pandas_frames = [spark_df.toPandas() for spark_df in tables.values()]

    if pandas_frames:
        merged_df = pd.concat(pandas_frames, ignore_index=True)
        print("Successfully merged all tables")
        print(merged_df.head())
    else:
        # Previously this path crashed on None.head(); report it explicitly.
        print("No tables were loaded; nothing to merge.")

except Exception as e:
    print(f"Error extracting or merging tables: {e}")



def get_columns_from_db(catalog, schema, tables):
    """Map each table name to the column list reported by Spark.

    Args:
        catalog: Catalog part of the fully-qualified table name.
        schema: Schema part of the fully-qualified table name.
        tables: Iterable of table names to inspect.

    Returns:
        Dict of table name -> ``spark.table(...).columns``. A table whose
        lookup fails maps to an empty list and the error is printed.
    """
    def _columns_of(table):
        # Best-effort lookup: a failing table yields [] rather than raising.
        try:
            return spark.table(f"{catalog}.{schema}.{table}").columns
        except Exception as err:
            print(f"Error retrieving columns for {catalog}.{schema}.{table}: {err}")
            return []

    return {table: _columns_of(table) for table in tables}

  
# Discover the tables in the configured catalog/schema, then record each table's columns.
table_names = get_available_tables(catalog=catalog, schema=schema)

columns_schema = get_columns_from_db(catalog=catalog, schema=schema, tables=table_names)

In [0]:
pip install langchain_databricks

In [0]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller, acf, pacf, grangercausalitytests
from pyspark.sql import SparkSession
from databricks_langchain import ChatDatabricks
from scipy.stats import chi2_contingency

# Obtain a Spark session (getOrCreate) under the application name "TimeSeriesAnalysis".
spark = (
    SparkSession.builder
    .appName("TimeSeriesAnalysis")
    .getOrCreate()
)

# Catalog and schema used for every table lookup below.
catalog, schema = "forecast", "schema1"

def get_available_tables(catalog, schema):
    """Return the names of the tables in ``catalog.schema``, excluding '_sqldf'.

    The listing comes from ``SHOW TABLES IN <catalog>.<schema>`` executed on
    the global ``spark`` session. On any error the message is printed and an
    empty list is returned.
    """
    try:
        rows = spark.sql(f"SHOW TABLES IN {catalog}.{schema}").collect()
        names = (row.tableName for row in rows)
        return list(filter(lambda name: name != '_sqldf', names))
    except Exception as err:
        print(f"Error retrieving tables from {catalog}.{schema}: {err}")
        return []

def get_columns_from_db(catalog, schema, tables):
    """Collect the column names of each table in ``tables``.

    Args:
        catalog: Catalog part of the fully-qualified table name.
        schema: Schema part of the fully-qualified table name.
        tables: Iterable of table names.

    Returns:
        Dict keyed by table name holding ``spark.table(...).columns``; a table
        that cannot be read gets an empty list and its error is printed.
    """
    column_map = {}
    for table in tables:
        full_name = f"{catalog}.{schema}.{table}"
        try:
            column_map[table] = spark.table(full_name).columns
        except Exception as err:
            print(f"Error retrieving columns for {full_name}: {err}")
            column_map[table] = []
    return column_map

# Pull every discovered table into memory as a pandas DataFrame, keyed by table name.
table_names = get_available_tables(catalog=catalog, schema=schema)
tables = dict()

for table_name in table_names:
    try:
        df = spark.read.table(f"{catalog}.{schema}.{table_name}").toPandas()
        tables[table_name] = df
    except Exception as err:
        # A table that fails to load is reported and skipped.
        print(f"Error loading table {table_name}: {err}")
    else:
        print(f"Successfully loaded table: {table_name}")

# Column names per table, as reported by Spark.
columns_schema = get_columns_from_db(catalog=catalog, schema=schema, tables=table_names)

def compute_statistical_relation(df, target_column):
    """Rank every other column of ``df`` by its association with ``target_column``.

    Numeric (non-boolean) columns are scored with the absolute Pearson
    correlation against the target, computed on the rows where both values
    are present. All other columns are scored with the chi-square statistic
    of their contingency table against the target.

    Note that the two scores are on different scales (|r| is at most 1, while
    chi-square is unbounded); they are ranked together in a single list.

    Args:
        df: pandas DataFrame containing ``target_column``.
        target_column: Name of the column to compare all others against.

    Returns:
        List of ``(column, score)`` tuples sorted by score, highest first.
        Columns with fewer than two usable rows, an empty contingency table,
        or an undefined (NaN) score are omitted.
    """
    # Imported locally: the notebook only imports chi2_contingency from
    # scipy.stats, so the former `stats.pearsonr` reference raised NameError.
    from scipy.stats import pearsonr

    def _is_numeric(series):
        # Accept any numeric width (Spark integers often arrive as int32),
        # but keep booleans on the categorical path.
        dtype = series.dtype
        return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)

    results = []
    target_data = df[target_column].dropna()

    for col in df.columns:
        if col == target_column:
            continue

        if _is_numeric(df[col]):
            # Index-aligned pairs where both the feature and the target are present.
            aligned_data = pd.concat([df[col], target_data], axis=1).dropna()

            if aligned_data.shape[0] < 2:
                continue

            corr, _ = pearsonr(
                aligned_data[col].to_numpy(dtype=float),
                aligned_data[target_column].to_numpy(dtype=float),
            )
            score = abs(corr)
        else:
            contingency_table = pd.crosstab(df[col], target_data)
            if contingency_table.size == 0:
                # No overlapping observations; chi2_contingency would raise.
                continue
            chi2, _, _, _ = chi2_contingency(contingency_table)
            score = chi2

        # A constant column gives an undefined correlation; NaN keys would
        # make the sort order below unreliable.
        if pd.isna(score):
            continue

        results.append((col, score))

    return sorted(results, key=lambda x: x[1], reverse=True)


def forecasting_statistical_tests(df, target_column):
    """Run a set of time-series diagnostics on ``target_column``.

    Args:
        df: pandas DataFrame containing ``target_column``.
        target_column: Name of the series to analyse.

    Returns:
        If the target column is not numeric, the string
        "Target column must be numerical for forecasting tests.".
        Otherwise a dict with these keys:
          - "Correlation Matrix": pairwise correlation of the numeric columns
          - "ACF" / "PACF": (partial) autocorrelations up to lag 20
          - "Augmented Dickey-Fuller Test": result of ``adfuller``
          - "Ljung-Box Test": ``acorr_ljungbox`` at lag 10
          - "Granger Causality Test": dict of column -> test result
            (maxlag 5) for every other numeric column whose test succeeded
    """
    target_dtype = df[target_column].dtype
    # Accept any numeric width (not only int64/float64), but not booleans.
    if not pd.api.types.is_numeric_dtype(target_dtype) or pd.api.types.is_bool_dtype(target_dtype):
        return "Target column must be numerical for forecasting tests."

    results = {}
    target_data = df[target_column].dropna()

    # Restrict to numeric columns: DataFrame.corr() raises on string columns
    # in pandas >= 2.0.
    numeric_df = df.select_dtypes(include="number")

    results["Correlation Matrix"] = numeric_df.corr()
    results["ACF"] = acf(target_data, nlags=20)
    results["PACF"] = pacf(target_data, nlags=20)
    results["Augmented Dickey-Fuller Test"] = adfuller(target_data)
    results["Ljung-Box Test"] = sm.stats.acorr_ljungbox(target_data, lags=[10])

    granger_results = {}
    for col in numeric_df.columns:
        if col == target_column:
            continue
        try:
            test_result = grangercausalitytests(df[[target_column, col]].dropna(), maxlag=5, verbose=False)
            granger_results[col] = test_result
        except Exception as e:
            # Still best-effort per column, but say which column was skipped
            # and why instead of swallowing everything silently.
            print(f"Granger causality test skipped for {col}: {e}")
    results["Granger Causality Test"] = granger_results

    return results

# Chat model client for the named Databricks serving endpoint, with a low
# sampling temperature (0.1) and replies capped at 800 tokens.
llm = ChatDatabricks(endpoint="databricks-meta-llama-3-1-70b-instruct", temperature=0.1, max_tokens=800)

def extract_target_from_query(user_query, llm, table_names, columns_schema):
    """Ask the LLM which column is the forecasting target and locate it in the schema.

    Args:
        user_query: Free-text request from the user.
        llm: Object exposing ``invoke(prompt)`` whose result has a ``.content`` string.
        table_names: Table names shown to the model in the prompt.
        columns_schema: Dict of table name -> list of column names.

    Returns:
        ``(table_name, column_name)`` for the first match, with the column
        spelled as it appears in ``columns_schema``; ``(None, None)`` when the
        reply cannot be matched to any known column.

    The reply is matched leniently, in this order: exact text, text with
    surrounding quotes/backticks/punctuation removed, a ``table.column``
    qualified name, and finally a case-insensitive comparison.
    """
    prompt_template = """
    Given the following table names and schema, determine the most relevant numerical target column for forecasting future trends.
    
    Tables: {tables}
    Schema: {columns_schema}
    Query: {query}

    Return ONLY the best target column name as a single word. Do NOT return extra text.
    """

    response = llm.invoke(prompt_template.format(
        tables=table_names, columns_schema=columns_schema, query=user_query
    )).content.strip()

    print(f"🔍 LLM Response: {response}")

    def _find(name, ignore_case=False):
        # Return (table, column) for the first table containing `name`, else None.
        wanted = name.casefold() if ignore_case else name
        for table_name, columns in columns_schema.items():
            for column in columns:
                shown = str(column).casefold() if ignore_case else column
                if shown == wanted:
                    return table_name, column
        return None

    # 1) The reply exactly as given (the original behaviour).
    candidates = [response]

    # 2) The reply without wrapping quotes, backticks or trailing punctuation.
    cleaned = response.strip("`'\"*.,;: \t\r\n")
    if cleaned and cleaned not in candidates:
        candidates.append(cleaned)

    for candidate in candidates:
        match = _find(candidate)
        if match:
            return match

    # 3) A qualified "table.column" reply: prefer the table it names.
    if "." in cleaned:
        qualifier, _, bare = cleaned.rpartition(".")
        hinted_table = qualifier.rsplit(".", 1)[-1]
        if bare in columns_schema.get(hinted_table, []):
            return hinted_table, bare
        if bare:
            match = _find(bare)
            if match:
                return match
            candidates.append(bare)

    # 4) Last resort: ignore letter case.
    for candidate in candidates:
        if candidate:
            match = _find(candidate, ignore_case=True)
            if match:
                return match

    print("⚠️ No valid target column identified!")
    return None, None


def get_best_forecasting_feature(tables, user_query, llm, table_names, columns_schema):
    """Pick the feature most strongly related to the LLM-selected target column.

    Args:
        tables: Dict of table name -> pandas DataFrame.
        user_query: Free-text request forwarded to the LLM.
        llm: Chat model passed through to ``extract_target_from_query``.
        table_names: Table names shown to the LLM.
        columns_schema: Dict of table name -> list of column names.

    Returns:
        ``(best_feature, table_name, target_column)``. ``best_feature`` is a
        ``(table_name, feature, score)`` tuple, or ``None`` when no target was
        identified, the table is not loaded, or no feature could be scored.
        When no target was identified all three values are ``None``.
    """
    table_name, target_column = extract_target_from_query(user_query, llm, table_names, columns_schema)

    if not table_name or not target_column:
        # Report the reason, but return None rather than a message string: a
        # non-empty string is truthy, so callers checking `if best_feature:`
        # would treat the failure as a result.
        print(" Target column not found.")
        return None, None, None

    df = tables.get(table_name)
    if df is None:
        # The target's table was never loaded, so there is nothing to score.
        return None, table_name, target_column

    final_results = [
        (table_name, feature, score)
        for feature, score in compute_statistical_relation(df, target_column)
    ]

    # max() returns the first highest-scoring entry, matching the previous
    # stable descending sort followed by [0].
    best_feature = max(final_results, key=lambda x: x[2]) if final_results else None
    return best_feature, table_name, target_column

# End-to-end run: let the LLM pick a target column, rank the related features,
# then run the forecasting diagnostics on the target's table.
user_query = "I want to forecast future sales trends."
best_feature, table_name, target_column = get_best_forecasting_feature(tables, user_query, llm, table_names, columns_schema)

# Proceed only with a real (table, feature, score) tuple for a loaded table.
# A plain truthiness check is not enough: a non-empty status string would
# pass it and then fail on tables[None].
if isinstance(best_feature, tuple) and table_name in tables:
    print(f" Best Feature for Forecasting: {best_feature[1]} in {best_feature[0]} with score {best_feature[2]}")
    forecasting_results = forecasting_statistical_tests(tables[table_name], target_column)
    print("Forecasting Statistical Tests:", forecasting_results)
else:
    print(" No suitable feature found for forecasting.")
