## Smart analysis assistant
1. Takes dataset files as input (CSV, XLSX, TXT, JSON).
2. Performs operations such as data cleaning, feature engineering, transformation, etc. autonomously.

In [None]:
import os
import json
import gzip
import kagglehub as kb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import google.generativeai as genai
from dotenv import load_dotenv as dtn

In [None]:
# import required data
# path = kb.dataset_download("rtatman/iris-dataset-json-version")
# print("Path to dataset files:", path)

# path = kb.dataset_download("jameslko/gun-violence-data")
# print("Path to dataset files:", path)

# path = kb.dataset_download("juicobowley/drake-lyrics")
# print("Path to dataset files:", path)

In [None]:
# Read the raw CSV and write an Excel copy of it alongside the source file.
data = pd.read_csv("Datasets/Raw/drake_data.csv")
# The directory is spelled "Raw" in every other path in this notebook; the
# previous lowercase "raw" breaks on case-sensitive filesystems.
data.to_excel("Datasets/Raw/drake_data.xlsx", sheet_name="Sheet1", index=False)

In [None]:
def load_and_compress_file(file_path, max_rows=100):
    """Load a CSV, JSON, XLSX, or TXT file and return a JSON preview of it.

    The file is read into a DataFrame (a TXT file becomes a single ``text``
    column with one row per line, line endings included), truncated to the
    first ``max_rows`` rows, and serialized as a JSON array of records. A
    gzip-compressed copy is built only to report its size; the value returned
    is the uncompressed JSON string.

    Args:
        file_path: Path of the file to load. The format is chosen from the
            file extension, matched case-insensitively.
        max_rows: Maximum number of leading rows kept in the preview.

    Returns:
        The preview as a JSON string, or ``None`` if the format is
        unsupported or the file could not be read (the error is printed).
    """
    try:
        # Compare on a lowercased copy so "DATA.CSV" is accepted, but always
        # hand the original path to the readers.
        lowered = str(file_path).lower()

        if lowered.endswith(".csv"):
            df = pd.read_csv(file_path)
        elif lowered.endswith(".json"):
            df = pd.read_json(file_path)
        elif lowered.endswith(".xlsx"):
            df = pd.read_excel(file_path)
        elif lowered.endswith(".txt"):
            with open(file_path, "r", encoding="utf-8") as f:
                df = pd.DataFrame({"text": f.readlines()})  # One row per text line
        else:
            raise ValueError("Unsupported file format!")

        # Serialize only the leading rows to keep the payload manageable.
        json_data = df.head(max_rows).to_json(orient="records")
        compressed_data = gzip.compress(json_data.encode())

        print(f"Dataset compressed. Size: {len(compressed_data)} bytes")
        return json_data  # Send JSON instead of raw file

    except Exception as e:
        # Deliberate best-effort boundary: callers receive None instead of an
        # exception, whatever the reader raised.
        print(f"Error loading file: {e}")
        return None
    
# One sample input per supported format. The Excel entry previously ended in
# ".xlsv", which the loader rejects as an unsupported format.
files = [
    "Datasets/Raw/drake_data.csv",
    "Datasets/Raw/drake_data.json",
    "Datasets/Raw/drake_data.xlsx",
    "Datasets/Raw/drake_lyrics.txt",
]

# `file` collects the names of the module-level variables created below
# (compressed_file0, compressed_file1, ...); each one holds the loader's
# result for the path at the same index, or None if loading failed.
file = []
for i, source_path in enumerate(files):
    name = f"compressed_file{i}"
    file.append(name)
    # Kept as dynamically named globals so code that reads compressed_fileN
    # by name keeps working.
    globals()[name] = load_and_compress_file(source_path)
print(file)

In [None]:
# Load variables from the .env file into the process environment.
dtn()

# The client cannot be configured without a key, so stop immediately if the
# API_KEY variable is missing or empty.
api_key = os.environ.get("API_KEY")
if api_key:
    genai.configure(api_key=api_key)
else:
    raise ValueError("API key not found! Check your .env file.")

def generate_response(query, chat_history):
    """Ask the Gemini model to answer ``query`` as a data-analysis assistant.

    A system prompt describing the assistant's role, capabilities, and
    expected response format is built on every call, with the conversation
    so far appended to it, and passed to the model as its system instruction.

    Args:
        query: The user's current message, sent as the request content.
        chat_history: Earlier turns of the conversation. Either a single
            string or an iterable of entries; entries are joined one per
            line. ``None`` or an empty value means there is no history yet.

    Returns:
        The text of the model's response.

    Errors raised by the client library are not caught here and propagate to
    the caller.
    """
    # Render the history as a plain transcript. Interpolating the list
    # directly would embed its repr (brackets, quotes, escaped newlines).
    if not chat_history:
        history_text = "(no previous conversation)"
    elif isinstance(chat_history, str):
        history_text = chat_history
    else:
        history_text = "\n".join(str(turn) for turn in chat_history)

    # Update the system prompt to include chat history
    SYS = f'''You are a professional data analysis assistant specializing in extracting insights from structured datasets. 
    Your role is to help users analyze CSV, JSON, XLSX, and TXT data by preprocessing, summarizing, visualizing key patterns, and providing code snippets for further exploration.

    ### Capabilities:
    1. **Data Preprocessing:**
    - Handle missing values, detect outliers, and normalize data.
    - Convert categorical data into numerical formats if needed.
    - Identify correlations and anomalies in datasets.
    - Always explain why a preprocessing step is necessary before applying it.

    2. **Exploratory Data Analysis (EDA):**
    - Summarize key statistics (mean, median, mode, min, max, std deviation).
    - Identify trends, patterns, and clusters in the data.
    - Provide correlation analysis and feature importance.

    3. **Data Visualization:**
    - Generate histograms, scatter plots, line charts, and heatmaps.
    - Represent categorical and numerical relationships visually.
    - Highlight key insights using interactive charts when necessary.
    - Provide **Python code snippets** for users to run on their local machines.

    4. **Insight Generation:**
    - Provide natural language explanations of trends and anomalies.
    - Suggest potential business or research applications based on the data.
    - Recommend further analysis techniques for deeper insights.
    - If applicable, provide code to replicate insights using Pandas, Matplotlib, Seaborn, or Plotly.

    ### **Example Response Format**
    When analyzing data, always include:
    1. **Summary of Insights**
    2. **Key Observations**
    3. **Recommended Actions**
    4. **Python Code for Visualization**

    ---

    ### **Example Response for a User Query**
    #### **User:** "Analyze this sales dataset and provide insights."

    #### **Response:**
    **Summary of Insights:**
    - The highest sales were recorded in Q4, indicating a seasonal trend.
    - Customers aged 25-34 contribute the most to total revenue.
    - Online payments are more popular than cash transactions.

    **Key Observations:**
    - Sales peaked in December, likely due to holiday shopping.
    - The average order value is higher for repeat customers.
    - There is a strong correlation between discounts and increased purchases.

    **Recommended Actions:**
    - Increase marketing efforts in Q4 to maximize seasonal trends.
    - Offer loyalty rewards for repeat customers.
    - Optimize pricing strategies to balance discount-driven sales.

    **Python Code to Visualize Sales Trends:**
    ```python
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Load dataset
    df = pd.read_csv("sales_data.csv")

    # Convert date column to datetime
    df['date'] = pd.to_datetime(df['date'])

    # Aggregate sales by month
    df['month'] = df['date'].dt.to_period('M')
    monthly_sales = df.groupby('month')['sales'].sum()

    # Plot sales trends
    plt.figure(figsize=(10, 5))
    sns.lineplot(x=monthly_sales.index.astype(str), y=monthly_sales.values, marker='o', color='b')
    plt.xticks(rotation=45)
    plt.xlabel("Month")
    plt.ylabel("Total Sales")
    plt.title("Monthly Sales Trends")
    plt.grid(True)
    plt.show()
    ```

    ---

    Here is the conversation history:\n{history_text}\n'''
    # The example code fence is closed above so the history is not read as
    # part of the sample code block.

    # Create the model with the updated system prompt
    model = genai.GenerativeModel(
        model_name="gemini-2.0-flash",
        system_instruction=SYS
    )

    # Generate the response using the query
    response = model.generate_content(query)
    return response.text

# Main Function
if __name__ == "__main__":
    # Words that end the session; defined once instead of on every turn.
    exit_msg = ["exit", "end", "quit", "bye", "goodbye", "stop", "close", "shut down"]
    chat_history = []

    while True:
        try:
            # Trim surrounding whitespace so input such as "exit " still matches.
            query = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closing the input stream or pressing Ctrl-C ends the session
            # the same way an exit word does, without a traceback.
            print("\nThank you for using the bot!")
            break

        # Nothing typed: prompt again rather than sending an empty request.
        if not query:
            continue

        if query.lower() in exit_msg:
            print("Thank you for using the bot!")
            break

        res = generate_response(query, chat_history)
        print("Bot: ", res)
        # Record the finished turn so the next request carries it as context.
        chat_history.append(f"You: {query}\nBot: {res}")