In [1]:
from dotenv import load_dotenv
import pandas as pd
import os
import pyarrow
from itertools import product
from datetime import datetime
import requests
import kaggle 
import json


In [3]:
import os
from dotenv import load_dotenv
import kaggle
import tkinter as tk
from tkinter import scrolledtext, simpledialog

# Load environment variables from the .env file.
# NOTE(review): env_path is a machine-specific absolute path; confirm whether
# it should be made configurable before this is run anywhere else.
# NOTE(review): `kaggle` is imported above, before these variables are loaded;
# confirm the package does not need the credentials at import time.
env_path = '/Users/paigeblackstone/Desktop/Portfolio29/Portfolio29/env/kaggle.env'
load_dotenv(env_path)

def authenticate_kaggle():
    """Authenticate the shared ``kaggle.api`` client and print the outcome.

    Exceptions raised by the authentication call are caught and reported on
    stdout instead of being propagated, so this function always returns
    ``None`` and callers are not interrupted by a failure.
    """
    status = "API authenticated successfully."
    try:
        kaggle.api.authenticate()
    except Exception as auth_error:
        status = f"An error occurred during authentication: {auth_error}"
    print(status)

def search_datasets(query):
    """Search Kaggle for datasets matching *query*, sorted by last update.

    Args:
        query: Search string forwarded as ``search=`` to
            ``kaggle.api.dataset_list``.

    Returns:
        The collection of dataset objects returned by the API. An empty list
        is returned when the API yields ``None`` or when any exception is
        raised along the way (the exception is printed, not propagated).
    """
    try:
        authenticate_kaggle()
        print(f"Fetching datasets for query '{query}'...")

        datasets = kaggle.api.dataset_list(search=query, sort_by='updated')

        if datasets is None:
            # Return here explicitly: falling through to len(None) would raise
            # a TypeError that gets reported as a misleading "search error".
            print("API call returned None.")
            return []

        print(f"Number of datasets returned: {len(datasets)}")

        # Debug aid: list the attribute names available on a result object.
        if datasets:
            print("Attributes of first dataset:")
            print(dir(datasets[0]))

        return datasets
    except Exception as e:
        # Top-level boundary for the GUI callback: report and degrade to "no results".
        print(f"An error occurred during dataset search: {e}")
        return []

def download_dataset(dataset_id, path):
    """Download and unzip the Kaggle dataset *dataset_id* into *path*.

    Authentication is attempted first via ``authenticate_kaggle``. Any
    exception raised while authenticating or downloading is caught and
    printed; nothing is returned in either case.
    """
    try:
        authenticate_kaggle()
        kaggle.api.dataset_download_files(dataset_id, path=path, unzip=True)
    except Exception as download_error:
        print(f"An error occurred while downloading the dataset: {download_error}")
    else:
        print(f"Dataset {dataset_id} downloaded successfully.")

def _first_attr(obj, names, default):
    """Return the value of the first attribute in *names* found on *obj*, else *default*."""
    for name in names:
        if hasattr(obj, name):
            return getattr(obj, name)
    return default


def on_search():
    """Handle the Search button: run the query and refresh both result widgets.

    Reads the query and the comma-separated tags from the entry widgets,
    calls ``search_datasets`` with the query, then clears and repopulates the
    results text area and the dataset listbox.
    """
    query = search_entry.get()
    # Strip surrounding whitespace and drop empty pieces so "a, b" gives
    # ['a', 'b'] and an empty entry gives [] rather than [''].
    tags = [tag.strip() for tag in tags_entry.get().split(',') if tag.strip()]

    print(f"Search Query: '{query}'")
    # NOTE: tags are only echoed here; search_datasets() takes just the query.
    print(f"Tags: {tags}")

    datasets = search_datasets(query)

    result_text.delete('1.0', tk.END)
    dataset_listbox.delete(0, tk.END)

    if not datasets:
        result_text.insert(tk.END, "No datasets found or an error occurred.\n")
        return

    for dataset in datasets:
        dataset_id = getattr(dataset, 'ref', None)
        title = getattr(dataset, 'title', "No title available")
        url = getattr(dataset, 'url', "No URL available")
        description = getattr(dataset, 'description', "No description available")
        # The attribute dump printed by search_datasets shows camelCase names,
        # so try that spelling as well as snake_case.
        # NOTE(review): 'lastUpdated' / 'totalBytes' are presumed names — confirm
        # against the full dir() output for the installed kaggle version.
        last_updated = _first_attr(
            dataset, ('last_updated', 'lastUpdated'), "No update date available"
        )
        size = _first_attr(
            dataset, ('size', 'totalBytes'), "No size information available"
        )
        rows = getattr(dataset, 'total_rows', "No row information available")

        shown_id = dataset_id if dataset_id is not None else "No ID available"
        result_text.insert(
            tk.END,
            f"Title: {title}\n"
            f"ID: {shown_id}\n"
            f"URL: {url}\n"
            f"Description: {description}\n"
            f"Last Updated: {last_updated}\n"
            f"Size: {size}\n"
            f"Rows: {rows}\n"
            "-----------\n",
        )

        # Only offer entries that carry a real reference; a placeholder string
        # in the listbox would later be passed to the download call as an ID.
        if dataset_id is not None:
            dataset_listbox.insert(tk.END, dataset_id)

def on_download():
    """Handle the Download button: fetch the dataset selected in the listbox.

    Shows a notice in the results area when nothing is selected. Otherwise
    asks for a destination directory (pre-filled with the current working
    directory) and, unless the dialog is cancelled or left empty, downloads
    the selected dataset there via ``download_dataset``.
    """
    selection = dataset_listbox.curselection()
    if not selection:
        result_text.insert(tk.END, "No dataset selected for download.\n")
        return

    dataset_id = dataset_listbox.get(selection[0])
    download_path = simpledialog.askstring(
        "Download Path", "Enter path to save dataset:", initialvalue=os.getcwd()
    )

    if not download_path:
        return

    print(f"Downloading dataset: {dataset_id}")
    # Show the status line *before* the blocking download call and flush
    # pending redraws, so the message is visible while the download runs
    # instead of appearing only after it has already finished.
    result_text.insert(tk.END, f"Downloading dataset {dataset_id} to {download_path}...\n")
    result_text.update_idletasks()
    download_dataset(dataset_id, download_path)

# Main application window
root = tk.Tk()
root.title("Kaggle Dataset Browser")

# Create every widget first (same creation order as the on-screen order) ...

# Free-text search query
search_label = tk.Label(root, text="Search Query:")
search_entry = tk.Entry(root, width=50)

# Optional comma-separated tags
tags_label = tk.Label(root, text="Tags (comma-separated):")
tags_entry = tk.Entry(root, width=50)

# Triggers on_search
search_button = tk.Button(root, text="Search", command=on_search)

# Scrollable text area that shows the details of each result
result_text = scrolledtext.ScrolledText(root, width=80, height=10)

# Listbox holding the dataset references that can be selected for download
dataset_listbox = tk.Listbox(root, width=80, height=20)

# Triggers on_download
download_button = tk.Button(root, text="Download Selected Dataset", command=on_download)

# ... then stack them top-to-bottom with uniform vertical padding.
for widget in (
    search_label,
    search_entry,
    tags_label,
    tags_entry,
    search_button,
    result_text,
    dataset_listbox,
    download_button,
):
    widget.pack(pady=5)

# Hand control to the Tk event loop; this blocks until the window is closed.
root.mainloop()


Search Query: 'Congress'
Tags: ['Investment', ' Finance']
API authenticated successfully.
Fetching datasets for query 'Congress'...
Number of datasets returned: 20
Attributes of first dataset:
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'creatorName', 'creatorNameNullable', 'creatorUrl', 'creatorUrlNullable', 'currentVersionNumber', 'currentVersionNumberNullable', 'description', 'descriptionNullable', 'downloadCount', 'files', 'hasCreatorName', 'hasCreatorUrl', 'hasCurrentVersionNumber', 'hasDescription', 'hasLicenseName', 'hasOwnerName', 'hasOwnerRef', 'hasSubtitle', 'hasTitle', 'hasTotalBytes', 'hasUrl', 'hasUsabilityRating', 'id', 'isFeatured', 'isPrivate', 'kernelCount',