In [None]:
# Warning control
import warnings
warnings.filterwarnings('ignore')

In [1]:
import os

from crewai_tools import SerperDevTool, ScrapeWebsiteTool, WebsiteSearchTool
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv()) # important line if cannot load api key

# Getting the api keys from the .env file

os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
os.environ["SERPER_API_KEY"] = os.getenv('SERPER_API_KEY')
os.environ['KAGGLE_USERNAME'] = os.getenv('KAGGLE_USERNAME')
os.environ['KAGGLE_KEY'] = os.getenv('KAGGLE_KEY')

# load tools
serper_search_tool = SerperDevTool()
scrap_tool = ScrapeWebsiteTool()


In [6]:

from crewai_tools import BaseTool
from kaggle.api.kaggle_api_extended import KaggleApi

class KaggleDatasetDownloader(BaseTool):
    name: str = "Kaggle Dataset Downloader"
    description: str = "Downloads datasets from Kaggle using a provided URL."

    def _run(self, url: str) -> str:
        try:
            # authenticate API
            api = KaggleApi()
            api.authenticate()
            # Extract dataset info from URL
            parts = url.split('/')
            owner = parts[-2]
            dataset_name = parts[-1]
            
            # Download the dataset
            api.dataset_download_files(f"{owner}/{dataset_name}", path='./data', unzip=True)
            return f"Successfully downloaded dataset: {owner}/{dataset_name} to ./data directory"
        except Exception as e:
            if '403' in str(e):
                return "Error 403: Forbidden. Please check your Kaggle API credentials and dataset permissions."
            else:
                return f"Error downloading dataset: {str(e)}"

# initialize tool
kaggle_tool = KaggleDatasetDownloader()

In [7]:
# local LLM 
from langchain_community.llms import Ollama

llm = Ollama(model="llama3")

In [8]:
from crewai import Agent, Task

data_acquisition_agent = Agent(
    role='Data Acquisition Specialist',
    goal='Find and download appropriate datasets on a given topic',
    backstory='Expert in acquiring datasets from various sources, specializing in climate data',
    tools=[serper_search_tool, kaggle_tool],
    llm=llm,
    verbose=True
)

download_task = Task(
    description="""
    Search for three appropriate datasets on the topic of {topic} and download one using the Kaggle Dataset Downloader. 
    You can search for datasets using refined queries. Note that the Kaggle Dataset Downloader only requires one input, i.e., the URL.
    """,
    expected_output = 'The dataset should be downloaded to data-scientist/data. Provide the full description of the downloaded dataset. ',
    agent=data_acquisition_agent
)

In [9]:
from crewai import Crew

crew = Crew(
    agents=[data_acquisition_agent],
    tasks=[download_task]
)

result = crew.kickoff(inputs={'topic': 'Housing'})



[1m> Entering new CrewAgentExecutor chain...[0m
[32;1m[1;3mThought: I need to search for datasets related to Housing and find one that can be downloaded using Kaggle Dataset Downloader.

Action: Kaggle Dataset Downloader
Action Input: {'url': 'string[0m[32;1m[1;3mI apologize for the mistake. It seems I missed the curly braces in the action input. Let me try again!

Thought:
I need to search for datasets related to Housing and find one that can be downloaded using Kaggle Dataset Downloader.

Action: Kaggle Dataset Downloader
Action Input: {'url':[0m[32;1m[1;3mIt seems like I missed the quotation marks and values in my previous attempt. Let me try again!

Thought:
I need to search for datasets related to Housing and find one that can be downloaded using Kaggle Dataset Downloader.

Action: Kaggle Dataset Downloader
Action Input: {"url": ""[0m[95m 

Error downloading dataset: list index out of range
[00m
[32;1m[1;3mThought:
I think I know what's going on! I need a URL for 