In [0]:
pip install langchain-community databricks-sql-connector databricks_langchain openai databricks-sqlalchemy~=1.0

In [0]:
dbutils.library.restartPython()

In [0]:
from langchain.agents import create_sql_agent
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field
from langchain.agents.agent_toolkits import SQLDatabaseToolkit
from langchain.sql_database import SQLDatabase
from langchain import OpenAI
from databricks_langchain import ChatDatabricks
from pyspark.sql import SparkSession

In [0]:
# Reuse (or create) the Spark session used later to read the tables into pandas.
spark = SparkSession.builder.appName("TimeSeriesAnalysis").getOrCreate()

# Open the LangChain SQL wrapper against the default catalog/schema.
try:
    db = SQLDatabase.from_databricks(catalog="forecast", schema="schema1", engine_args={"pool_pre_ping": True})
except Exception as e:
    print(f"Error initializing LangChain DB: {e}")
    # Re-raise instead of calling exit(): the traceback is preserved and the
    # cell fails visibly, so later cells never run with `db` undefined.
    raise

In [0]:
# Chat model served from a Databricks endpoint; low temperature and a small
# completion budget (800 tokens) for the short extraction-style prompts below.
llm = ChatDatabricks(
    endpoint="databricks-meta-llama-3-1-70b-instruct",
    temperature=0.1,
    max_tokens=800,
)

# SQL toolkit and agent bound to `db` and `llm`.
# NOTE(review): `agent` is not referenced again in the cells shown here.
toolkit = SQLDatabaseToolkit(db=db, llm=llm)
agent = create_sql_agent(llm=llm, toolkit=toolkit, verbose=True)

In [0]:
# Prompt asking the model to pull a catalog and schema name out of free text.
# The requested answer format ("Catalog: ..., Schema: ...") is what
# process_user_query parses, so the two must be kept in sync.
extract_catalog_schema_prompt = PromptTemplate(
    input_variables=["input"],
    template="""
    You are a helpful assistant that extracts catalog and schema names from a database query.
    The user has provided the following query: "{input}"

    Please extract the catalog and schema mentioned in the query.
    If either the catalog or schema is not provided, return 'Not provided' for the missing one.
    Format the response like: Catalog: <catalog_name>, Schema: <schema_name>.
    """
)

def process_user_query(user_query):
    """Ask the LLM which catalog and schema a free-text request refers to.

    Args:
        user_query: Natural-language request, e.g.
            "connect to the forecast catalog and schema1 schema".

    Returns:
        A ``(catalog, schema)`` tuple of strings with every ``.`` removed.
        Both are ``'Not provided'`` unless the reply contains both a
        ``Catalog:`` and a ``Schema:`` label.
    """
    import re  # local import: the notebook only imports `re` in a later cell

    catalog_schema_chain = LLMChain(prompt=extract_catalog_schema_prompt, llm=llm)
    response_text = catalog_schema_chain.invoke({"input": user_query})['text']

    catalog, schema = 'Not provided', 'Not provided'
    # Capture each value only up to the next comma or line break. Previously the
    # schema was "everything after 'Schema:'", so any trailing explanation the
    # model added became part of the schema name.
    catalog_match = re.search(r'Catalog:\s*([^,\n]*)', response_text)
    schema_match = re.search(r'Schema:\s*([^,\n]*)', response_text)
    if catalog_match and schema_match:
        catalog = catalog_match.group(1).strip() or 'Not provided'
        schema = schema_match.group(1).strip() or 'Not provided'
    # The requested format ends with a period; drop dots from both names.
    return catalog.replace('.', ''), schema.replace('.', '')

In [0]:
# Example request; the extracted names drive the reconnect in the next cell.
user_query = "connect to the forecast catalog and schema1 schema"
catalog, schema= process_user_query(user_query)
print("Extracted Catalog:", catalog)
print("Extracted Schema:", schema)

In [0]:
# Ensure the database connection uses the extracted catalog and schema
db = SQLDatabase.from_databricks(
    catalog=catalog,
    schema=schema,
    engine_args={"pool_pre_ping": True}  # Prevent session expiration issues
)

# Fetch table names. Start from an empty list so that later cells iterating
# over `tables` do not raise NameError when the lookup below fails.
tables = []
try:
    tables = db.get_usable_table_names()
    print(f"Tables in {catalog}.{schema}: {tables}")
except Exception as e:
    print(f"Error retrieving tables from {catalog}.{schema}: {e}")


In [0]:
import pandas as pd

# Map of table name -> pandas DataFrame holding the full table contents.
dataframes = {}

for table in tables:
    try:
        # Read from the same catalog/schema the table list was fetched from,
        # rather than a hard-coded "forecast.schema1".
        spark_df = spark.table(f"{catalog}.{schema}.{table}")
        # toPandas() collects the whole table onto the driver.
        pandas_df = spark_df.toPandas()
        dataframes[table] = pandas_df
        print(f"Successfully retrieved data for table: {table}")
    except Exception as e:
        print(f"Error retrieving data from {table}: {e}")


In [0]:
# Show the first rows of every loaded table, followed by two blank lines.
for table_name, df in dataframes.items():
    print(f"Data for table: {table_name}", df.head(), "\n", sep="\n")

In [0]:
# List "<table>,<column>,<dtype>" for every column after pandas re-infers
# object columns.
for table_name, df in dataframes.items():
    inferred = df.infer_objects()
    for col, dtype in inferred.dtypes.items():
        print(f'{table_name},{col},{dtype}')
             

In [0]:
import pandas as pd
import numpy as np
from scipy.stats import entropy, skew, kurtosis

# Upper-cased string tokens that are read as booleans.
_BOOLEAN_LIKE_TOKENS = {'YES': True, 'NO': False, '1': True, '0': False}


def _boolean_like_values(col_data):
    """Return a bool Series if every value is a YES/NO/1/0 string, else None."""
    if col_data.empty or not all(isinstance(value, str) for value in col_data):
        return None
    mapped = col_data.str.upper().map(_BOOLEAN_LIKE_TOKENS)
    if mapped.isna().any():
        return None
    return mapped.astype(bool)


def _describe_numeric(col_data):
    """Location, spread and shape statistics for a numeric column."""
    modes = col_data.mode()
    low, high = col_data.min(), col_data.max()
    # Plain float array so scipy also accepts pandas nullable dtypes.
    values = col_data.to_numpy(dtype=float)
    return {
        "mean": col_data.mean(),
        "median": col_data.median(),
        "mode": modes.iloc[0] if not modes.empty else None,
        "variance": col_data.var(),
        "std_dev": col_data.std(),
        "min": low,
        "max": high,
        "range": high - low,
        "iqr": col_data.quantile(0.75) - col_data.quantile(0.25),
        "skewness": skew(values),
        "kurtosis": kurtosis(values),
        "sum": col_data.sum()
    }


def _describe_boolean(flags):
    """True/false counts and the share of true values for a boolean Series."""
    true_count = flags.sum()
    return {
        "true_count": true_count,
        "false_count": len(flags) - true_count,
        "true_ratio": flags.mean()
    }


def _describe_categorical(col_data):
    """Cardinality, most frequent value and Shannon entropy (bits)."""
    modes = col_data.mode()
    shares = col_data.value_counts(normalize=True)
    return {
        "unique": col_data.nunique(),
        "mode": modes.iloc[0] if not modes.empty else None,
        "entropy": entropy(shares, base=2) if not shares.empty else None
    }


def _describe_datetime(col_data):
    """Earliest/latest timestamp and the span between them in whole days."""
    earliest, latest = col_data.min(), col_data.max()
    span = latest - earliest
    return {
        "min_date": earliest,
        "max_date": latest,
        # An all-null column yields NaT, which has no usable day count.
        "range_days": None if pd.isna(span) else span.days
    }


def compute_statistics(dataframes):
    """Compute per-column descriptive statistics for a set of tables.

    Each column (nulls dropped) is classified by dtype and summarised:

    * bool             -> true_count, false_count, true_ratio
    * numeric          -> mean, median, mode, variance, std_dev, min, max,
                          range, iqr, skewness, kurtosis, sum
    * datetime         -> min_date, max_date, range_days
    * text/categorical -> boolean statistics when every value is one of
                          YES/NO/1/0 (case-insensitive), otherwise
                          unique, mode, entropy
    * anything else    -> only the common fields

    Every entry also carries ``table_column`` ("<table> | <column>") and
    ``count`` (number of non-null values).

    Classification uses ``pandas.api.types`` instead of ``np.issubdtype`` so
    extension dtypes (categorical, nullable, tz-aware datetime) do not raise,
    and the string-based boolean probe only runs on text columns, so datetime
    columns no longer fail on the ``.str`` accessor.

    Args:
        dataframes: Mapping of table name -> pandas DataFrame.

    Returns:
        DataFrame with one row per (table, column) pair (MultiIndex) and one
        column per statistic; statistics that do not apply are NaN.
    """
    dtype_checks = pd.api.types
    stats = {}

    for table_name, df in dataframes.items():
        df = df.infer_objects()
        table_stats = {}

        for col in df.columns:
            col_data = df[col].dropna()
            col_stats = {
                "table_column": f"{table_name} | {col}",
                "count": col_data.count(),
            }

            dtype = col_data.dtype
            # bool must be tested before numeric: is_numeric_dtype(bool) is True.
            if dtype_checks.is_bool_dtype(dtype):
                col_stats.update(_describe_boolean(col_data))
            elif dtype_checks.is_numeric_dtype(dtype):
                col_stats.update(_describe_numeric(col_data))
            elif dtype_checks.is_datetime64_any_dtype(dtype):
                col_stats.update(_describe_datetime(col_data))
            elif isinstance(dtype, pd.CategoricalDtype):
                col_stats.update(_describe_categorical(col_data))
            elif dtype_checks.is_object_dtype(dtype) or dtype_checks.is_string_dtype(dtype):
                flags = _boolean_like_values(col_data)
                if flags is not None:
                    col_stats.update(_describe_boolean(flags))
                else:
                    col_stats.update(_describe_categorical(col_data))

            table_stats[col] = col_stats

        stats[table_name] = table_stats

    stats_df = pd.DataFrame(
        {(table, col): stats[table][col] for table in stats for col in stats[table]}).T

    return stats_df


In [0]:
stats_df = compute_statistics(dataframes)  


In [0]:
stats_df.head()

In [0]:
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType

# Build a Spark-friendly copy of the statistics instead of overwriting
# `stats_df`. The previous in-place conversion cast every (object-typed)
# column to str, so later numeric comparisons against the statistics failed,
# and re-running the cell was not idempotent.
stats_export_df = stats_df.reset_index(drop=True).copy()

for col in stats_export_df.columns:
    column_values = stats_export_df[col]
    as_number = pd.to_numeric(column_values, errors="coerce")
    if as_number.notna().sum() == column_values.notna().sum():
        # Every non-null entry is numeric: store the column as float.
        stats_export_df[col] = as_number.astype(float)
    else:
        # Mixed or textual column (e.g. mode, dates): store as nullable text.
        stats_export_df[col] = column_values.map(lambda value: None if pd.isna(value) else str(value))

spark = SparkSession.builder.getOrCreate()
catalog_name = "forecast"
stats_schema_name = "stats_schema"
table_name = "stats_table"

spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog_name}.{stats_schema_name}")
spark_df = spark.createDataFrame(stats_export_df)

# overwriteSchema lets the overwrite succeed when column types differ from a
# table written by an earlier run.
(
    spark_df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(f"{catalog_name}.{stats_schema_name}.{table_name}")
)



In [0]:
from langchain.chat_models import ChatDatabricks
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain


In [0]:
def get_table_schema(table_name):
    """Run ``DESCRIBE`` for ``table_name`` via the module-level ``db`` and return its raw result."""
    return db.run(f"DESCRIBE {table_name}")

In [0]:
# Step 2: Provide the LLM with a prompt to decide on the best merge strategy
# `input_variables` must name the placeholder actually used in the template
# ({table_schema}); it was previously declared as "tables_schema".
merge_strategy_prompt = PromptTemplate(
    input_variables=["table_schema"],
    template="""
    The user has provided the schema information of the following tables:

    {table_schema}

    Based on the provided schemas, suggest the best strategy for merging these tables. 
    Consider factors such as common columns, column data types, and potential keys for joining.
    All tables are stored in 'dataframes' which refers to a dictionary where the keys are the table names (as strings), and the values are the corresponding Pandas DataFrames that hold the data from those tables.
    Merge the tables as a pandas DataFrame by providing the appropriate python code.
    Make merges stepwise, only providing the necessary code and little explanation for each step.
    Make sure the column datatypes are correctly matched when merging.
    """
)

def generate_merge_strategy(tables):
    """Describe every table in ``tables`` and ask the LLM how to merge them.

    Returns the ``'text'`` field of the chain's response.
    """
    schema_sections = [
        f"Table {name}: {get_table_schema(name)}\n\n" for name in tables
    ]
    chain = LLMChain(prompt=merge_strategy_prompt, llm=llm)
    answer = chain.invoke({"table_schema": "".join(schema_sections)})
    return answer['text']

# Ask the LLM for a merge plan covering every table listed in `tables`.
merge_strategy = generate_merge_strategy(tables)

# The response is free text (explanation plus suggested pandas code); it is
# only printed here, not executed.
print("Recommended Merge Strategy:")
print(merge_strategy)

In [0]:
import ast

# Print column name, datatype and comment for every table.
for table in tables:
    print(f"Schema for table: {table}")

    display_data = get_table_schema(table)
    # A string result holds a Python literal (list of row tuples); parse it
    # back into a list before iterating.
    display_data = ast.literal_eval(display_data) if isinstance(display_data, str) else display_data

    for item in display_data:
        print(f"Column Name: {item[0]}, Datatype: {item[1]}, Comment: {item[2]}")

    print("\n" + "-"*50 + "\n")


In [0]:
from databricks_langchain import ChatDatabricks
import re
import json

# Concatenate the DESCRIBE output of every table into one text blob.
# The per-table value is never bound to the global name `schema`, which
# holds the extracted schema name used by the connection cells; the previous
# loop overwrote it.
tables_schema = "".join(
    f"Table {table_name_}: {get_table_schema(table_name_)}\n\n" for table_name_ in tables
)

# Rebuild the chat model with a larger completion budget for the
# column-by-column classification below.
llm = ChatDatabricks(
    endpoint="databricks-meta-llama-3-1-70b-instruct",
    temperature=0.1,
    max_tokens=8000,
)

def split_into_chunks(text, max_tokens=500):
    """Split ``text`` into line-aligned chunks of at most ``max_tokens`` words.

    Words are whitespace-separated tokens. Lines are never split, so a single
    line longer than ``max_tokens`` becomes its own (oversized) chunk.

    Args:
        text: Text to split on newlines.
        max_tokens: Word budget per chunk.

    Returns:
        List of non-empty, stripped chunk strings. Empty or whitespace-only
        input yields an empty list.
    """
    chunks = []
    current_lines = []
    current_words = 0  # running count; avoids re-splitting the whole chunk per line

    def flush():
        # Whitespace-only chunks are skipped. They used to be emitted as ""
        # (e.g. when the very first line exceeded the budget).
        chunk = "\n".join(current_lines).strip()
        if chunk:
            chunks.append(chunk)

    for line in text.split("\n"):
        line_words = len(line.split())
        if current_words + line_words <= max_tokens:
            current_lines.append(line)
            current_words += line_words
        else:
            flush()
            current_lines = [line]
            current_words = line_words

    flush()
    return chunks

chunks = split_into_chunks(tables_schema)

memory = []                # cleaned LLM answers, one per processed chunk
previous_features = set()  # feature names already reported by earlier answers

user_query = "I want to forecast future sales based on relevant features."

# Matches answer lines of the form: - "column": "reason"
feature_line_pattern = re.compile(r'- "(.*?)": "(.*?)"')

for chunk in chunks:
    # Pass earlier answers as plain text. Interpolating the list itself put
    # its Python repr (brackets, quotes, escaped newlines) into the prompt.
    previous_answers = "".join(f"{answer}\n\n" for answer in memory)

    prompt = f"""
    You are an expert in time series forecasting. The user wants to forecast future values based on relevant features. 
    Given the following table schema, classify each column as either 'Relevant' or 'Irrelevant' for time series forecasting, and provide a reason for your classification.

    {previous_answers}{chunk}

    User request: {user_query}

    Please provide your response in the following text format:
    - "column1": "The column represents time-based data, essential for forecasting."
    - "column2": "The column contains historical sales data, which is critical for forecasting."

    IMPORTANT: Please do not repeat any features that have already been mentioned in previous responses. Only mention new relevant features with their reasons.
    """

    response = llm.invoke(prompt)
    # Prefer `.content`. NOTE(review): in some langchain-core versions
    # `.text` appears to be a method rather than a string, so it is only used
    # as a fallback and called when callable -- confirm against the installed
    # version.
    llm_response = getattr(response, "content", None)
    if not isinstance(llm_response, str):
        fallback = getattr(response, "text", response)
        llm_response = str(fallback() if callable(fallback) else fallback)

    # Drop only lines whose feature was already reported. The earlier
    # str.replace approach removed every occurrence of the line text,
    # including a first mention inside the same answer.
    kept_lines = []
    for line in llm_response.splitlines():
        match = feature_line_pattern.match(line.strip())
        if match:
            feature_name = match.group(1)
            if feature_name in previous_features:
                continue
            previous_features.add(feature_name)
        kept_lines.append(line)

    memory.append("\n".join(kept_lines))

final_response = "\n".join(memory)
print("Final Relevant Features and Reasons:")
print(final_response)

In [0]:
############### Filtering using Correlation Metrics #####################
import pandas as pd
import numpy as np
# Absolute Pearson correlation above which the later of two columns is dropped.
threshold = 0.85

filtered_dataframes = {}

for table_name, df in dataframes.items():
    print(f"Processing correlation analysis for table: {table_name}")

    abs_corr = df.select_dtypes(include=[np.number]).corr().abs()
    column_names = abs_corr.columns
    # A column is flagged when it correlates strongly with any column that
    # appears before it (lower triangle of the matrix).
    correlated_features = {
        column_names[row]
        for row in range(len(column_names))
        if any(abs_corr.iloc[row, earlier] > threshold for earlier in range(row))
    }

    print(f"Highly Correlated Features to Remove in {table_name}: {correlated_features}\n")
    filtered_dataframes[table_name] = df.drop(columns=correlated_features)


In [0]:
# Show the first rows of every correlation-filtered table, followed by two
# blank lines.
for table_name, df in filtered_dataframes.items():
    print(f"Filtered Data for table: {table_name}", df.head(), "\n", sep="\n")

In [0]:
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import RobustScaler

def handle_special_columns(df):
    """Expand datetime columns into calendar parts and one-hot encode text columns.

    For each datetime column ``c`` (naive or tz-aware) the columns ``c_day``,
    ``c_month``, ``c_weekday`` and ``c_year`` are added; the original column
    is kept. Each object/category column is replaced by dummy columns with
    the first level dropped.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame with the derived and encoded columns.
    """
    # Work on a copy: the derived columns used to be written into the caller's
    # frame, so re-running the cell kept changing the input.
    df = df.copy()

    date_columns = df.select_dtypes(include=[np.datetime64, 'datetimetz']).columns
    for col in date_columns:
        df[f'{col}_day'] = df[col].dt.day
        df[f'{col}_month'] = df[col].dt.month
        df[f'{col}_weekday'] = df[col].dt.weekday
        df[f'{col}_year'] = df[col].dt.year

    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    for col in categorical_columns:
        df = pd.get_dummies(df, columns=[col], drop_first=True)

    return df

def _stat_as_float(stat_row, key):
    """Read ``stat_row[key]`` as a float; NaN when missing or non-numeric."""
    try:
        return float(stat_row.get(key))
    except (TypeError, ValueError):
        return float("nan")


def apply_transformations(df, stats_df, table_name):
    """Transform columns of ``df`` according to their precomputed statistics.

    For every column with a row in ``stats_df`` (matched on
    ``table_column == "<table_name> | <column>"``):

    * skewness > 1  -> ``log1p`` (numeric columns whose values are all > -1)
    * kurtosis > 3  -> ``RobustScaler`` (numeric columns)
    * entropy < 0.1 -> the column is dropped

    Statistics that are missing, NaN or not parseable as numbers never
    trigger a transformation, so ``stats_df`` may hold floats or their string
    form and need not contain all three statistic columns.

    Args:
        df: Table to transform. It is not modified.
        stats_df: Statistics table with a ``table_column`` column.
        table_name: Table name used to build the lookup key.

    Returns:
        A new, transformed DataFrame.
    """
    # Copy so that re-running the cell does not apply log/scaling twice to
    # the same frame.
    df = df.copy()
    known_columns = set(stats_df['table_column'].values)

    for column in list(df.columns):
        table_column = f"{table_name} | {column}"

        if table_column not in known_columns:
            print(f"Skipping feature {column} as no statistics found for {table_column}.")
            continue

        stat_row = stats_df[stats_df['table_column'] == table_column].iloc[0]
        skewness = _stat_as_float(stat_row, 'skewness')
        kurtosis_value = _stat_as_float(stat_row, 'kurtosis')
        entropy_value = _stat_as_float(stat_row, 'entropy')

        is_numeric = (
            pd.api.types.is_numeric_dtype(df[column])
            and not pd.api.types.is_bool_dtype(df[column])
        )

        if skewness > 1 and is_numeric:
            # log1p is undefined for values <= -1; leave such columns as-is
            # instead of silently producing NaN/-inf.
            if df[column].min() > -1:
                print(f"Applying log transformation to skewed feature: {column}")
                df[column] = np.log1p(df[column])
            else:
                print(f"Skipping log transformation for feature {column}: contains values <= -1.")

        if kurtosis_value > 3 and is_numeric:
            print(f"Applying robust scaing to feature: {column}")
            scaler = RobustScaler()
            # fit_transform returns shape (n, 1); flatten before assigning.
            df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()

        if entropy_value < 0.1:
            print(f"Removing low-entropy feature: {column}")
            df = df.drop(columns=[column])

    return df

# Transformed tables keyed by table name. Without this the result of each
# iteration was discarded as soon as the next table was processed.
transformed_dataframes = {}

for table_name, df in filtered_dataframes.items():
    print(f"Applying transformations to table: {table_name}")

    # Statistics-driven transforms first, then datetime expansion / one-hot
    # encoding.
    df_transformed = apply_transformations(df, stats_df, table_name)

    df_transformed = handle_special_columns(df_transformed)

    transformed_dataframes[table_name] = df_transformed

    print(f"Transformed data for table: {table_name}")
    print(df_transformed.head())

