In [1]:
# Record which langchain-core release this notebook was executed against.
import langchain_core

langchain_core_version = langchain_core.__version__
print(langchain_core_version)


0.3.66


In [2]:
import sys
from pathlib import Path

# Add /src to sys.path so the project's custom modules can be imported
src_dir = Path.cwd().parent / "src"
sys.path.insert(0, str(src_dir))

from cleaning import load_dataset, prepare_dataset

# Name of the dataset file; change it to work with a different dataset
dataset_name = "FloridaBikeRentals.csv"

# Load the dataset and hand it straight to prepare_dataset, then preview it
df = prepare_dataset(load_dataset(dataset_name))
df.head()


Unnamed: 0,date,rented_bike_count,hour,temperaturec,humidity,wind_speed_ms,visibility_10m,dew_point_temperaturec,solar_radiation_mjm2,rainfallmm,snowfall_cm,seasons,holiday,functioning_day
0,01-12-2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01-12-2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01-12-2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01-12-2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01-12-2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes


In [3]:
import os
from dotenv import load_dotenv

# Import the prompt template from langchain_core, matching the other
# langchain_core import used in this cell.
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Token counting and cost tracking
from langchain_openai import ChatOpenAI
from langchain.callbacks import get_openai_callback

# Load .env and fail fast with a clear message when the key is absent,
# instead of passing None on to the model client.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "before running this cell."
    )

# Initialize GPT-4o model
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0.3,
    api_key=api_key,
)

# Define prompt template: a fixed system role plus a user message that
# receives the dataset summary through the {summary} placeholder.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an assistant helping a user analyze a dataset."),
    ("user", "This is a summary of a dataset:\n\n{summary}\n\nGive 2-3 key insights and 2 questions, keep it brief."),
])

# Summarize the current DataFrame as plain text to use as model context
context_summary = df.describe(include="all").to_string()

# Pipeline: fill the prompt, call the model, return the reply as a string
chain = prompt | llm | StrOutputParser()

# Run the model and track cost
with get_openai_callback() as cb:
    response = chain.invoke({"summary": context_summary})
    print("🤖 Databot says:\n")
    print(response)
    print("\n--- Cost Summary ---")
    print(f"Tokens used: {cb.total_tokens}")
    print(f"Total cost: ${cb.total_cost:.6f}")



🤖 Databot says:

**Key Insights:**

1. **Seasonal Influence:** The most frequent season for bike rentals is Spring, with 2208 occurrences. This suggests that bike rentals might be more popular during this season, potentially due to favorable weather conditions.

2. **Weather Impact:** The dataset shows a wide range of temperatures from -17.8°C to 39.4°C. The mean temperature is 12.88°C, indicating that bike rentals occur across diverse weather conditions. However, extreme temperatures might affect rental counts.

3. **Holiday and Functioning Days:** The majority of the days in the dataset are non-holidays (8328 out of 8760) and functioning days (8465 out of 8760). This implies that bike rentals are more common on regular working days rather than holidays.

**Questions:**

1. **How does the rented bike count vary with different weather conditions, such as temperature and rainfall?** Understanding this could help in predicting demand based on weather forecasts.

2. **What is the impact o

In [4]:
from context import generate_context_summary
from cleaning import load_dataset, prepare_dataset

# Load the dataset, run it through prepare_dataset, then build its context summary
dataset_name = "FloridaBikeRentals.csv"
df = load_dataset(dataset_name)
df = prepare_dataset(df)
context = generate_context_summary(df)


In [5]:
# 03_user_questions.ipynb

import sys
from pathlib import Path

# Make the modules under src/ importable (cleaning + context utils)
src_dir = Path.cwd().parent / "src"
sys.path.insert(0, str(src_dir))

# Project helpers used below
from cleaning import load_dataset, prepare_dataset, get_missing_value_summary
from context import generate_context_summary

# Dataset selector (same file as the previous notebook)
dataset_name = "FloridaBikeRentals.csv"

# Load the file and pass it through prepare_dataset in one step
df = prepare_dataset(load_dataset(dataset_name))

# Context summary the chatbot answers from
context_summary = generate_context_summary(df)


In [6]:
# Example simulated user question
user_question = "Are there any missing values?"

# Basic logic for now, later this will be handled by the chatbot
if "missing" in user_question.lower():
    print("🤖 Databot says:\n")
    # The answer is the text that follows the first "🕳️" marker in the
    # summary. Guard the lookup so a summary without that marker yields a
    # readable message instead of an IndexError.
    sections = context_summary.split("🕳️")
    if len(sections) > 1:
        print(sections[1].strip())
    else:
        print("No missing-data section was found in the context summary.")


🤖 Databot says:

Missing Data Summary:
There are no missing values in this dataset.


In [7]:
import sys
from pathlib import Path

# Add /src to sys.path so the project's custom modules can be imported
src_dir = Path.cwd().parent / "src"
sys.path.insert(0, str(src_dir))

from cleaning import load_dataset, prepare_dataset

# Name of the dataset file; change it to work with a different dataset
dataset_name = "Iris.csv"

# Load the dataset and hand it straight to prepare_dataset, then preview it
df = prepare_dataset(load_dataset(dataset_name))
df.head()


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [8]:
from context import generate_context_summary

# Example simulated user question
user_question = "Are there any missing values?"

# `df` was replaced by a different dataset in the preceding cell, so rebuild
# the summary from the current DataFrame. Without this, the answer would be
# read from whatever context_summary an earlier cell left in the kernel.
context_summary = generate_context_summary(df)

# Basic logic for now, later this will be handled by the chatbot
if "missing" in user_question.lower():
    print("🤖 Databot says:\n")
    # The answer is the text that follows the first "🕳️" marker in the
    # summary. Guard the lookup so a summary without that marker yields a
    # readable message instead of an IndexError.
    sections = context_summary.split("🕳️")
    if len(sections) > 1:
        print(sections[1].strip())
    else:
        print("No missing-data section was found in the context summary.")

🤖 Databot says:

Missing Data Summary:
There are no missing values in this dataset.


In [9]:
# Plain-text describe() output of the current DataFrame, used as model context
context_summary = df.describe(include="all").to_string()

# Pipeline: fill the prompt, call the model, return the reply as a string
chain = prompt | llm | StrOutputParser()
payload = {"summary": context_summary}

# Only the model call needs to run inside the callback context; the handler
# keeps its totals afterwards, so reporting happens once the call has finished.
with get_openai_callback() as cb:
    response = chain.invoke(payload)

print("🤖 Databot says:\n")
print(response)
print("\n--- Cost Summary ---")
print(f"Tokens used: {cb.total_tokens}")
print(f"Total cost: ${cb.total_cost:.6f}")



🤖 Databot says:

**Key Insights:**

1. **Species Distribution:** The dataset consists of three unique species, with 'setosa' being the most frequent, appearing 50 times. This suggests that the dataset is evenly distributed among the species, assuming the other two species also appear 50 times each.

2. **Sepal and Petal Measurements:** The average sepal length and width are approximately 5.84 and 3.05, respectively, while the average petal length and width are 3.76 and 1.20. This indicates that, on average, sepals are longer and wider than petals.

3. **Variation in Measurements:** The standard deviation for petal length (1.76) is higher compared to other features, indicating greater variability in petal length across the dataset.

**Questions:**

1. **Species-Specific Analysis:** How do the sepal and petal measurements vary across the different species? Are there distinct patterns or overlaps in these measurements?

2. **Correlation Analysis:** What is the correlation between sepal an