<a href="https://colab.research.google.com/github/bijumanvya/BITS-Apex-Project/blob/main/Download_Kaggle_DataSet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


    ## Download and extract a Kaggle dataset into Google Drive using the Kaggle API (without kaggle.json).
    If the target folder already exists, it is reused; otherwise it is created.
    The latest version of the dataset is downloaded into it. Any previously extracted files are removed and replaced with the freshly extracted ones.
    
    Parameters:
        dataset (str): Kaggle dataset identifier (e.g., "imakash3011/rental-bike-sharing")
        username (str): Your Kaggle username
        key (str): Your Kaggle API key
        drive_path (str): Path in Google Drive to save data (default: "./kaggle_data")
        
    Returns:
        List[str]: List of extracted file paths
       

In [7]:
import os
import zipfile
import shutil

def download_and_extract_kaggle_dataset(dataset: str, username: str, key: str, drive_path: str = "./kaggle_data"):
    """Download a Kaggle dataset with the ``kaggle`` CLI and extract it into ``drive_path``.

    The folder is created if missing and reused otherwise. After a successful
    download, everything in the folder except the downloaded zip is deleted and
    the archive is extracted in its place.

    Parameters:
        dataset (str): Kaggle dataset identifier (e.g., "imakash3011/rental-bike-sharing").
        username (str): Kaggle username; exported as ``KAGGLE_USERNAME``.
        key (str): Kaggle API key; exported as ``KAGGLE_KEY``.
        drive_path (str): Folder to download and extract into (default: "./kaggle_data").

    Returns:
        List[str]: ``drive_path`` joined with every entry name in the archive.

    Raises:
        RuntimeError: If the ``kaggle`` command exits with a non-zero status.
        FileNotFoundError: If the ``kaggle`` executable is not found, or the
            expected ``<dataset-name>.zip`` is missing after the download.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.

    In every failure case above, the existing contents of ``drive_path`` are
    left untouched.
    """
    # Imported locally so this cell does not depend on an edit to the import cell.
    import subprocess

    # Set Kaggle credentials from Python (process-wide; the CLI reads these variables).
    os.environ["KAGGLE_USERNAME"] = username
    os.environ["KAGGLE_KEY"] = key

    # Ensure folder exists (reuse if already exists)
    os.makedirs(drive_path, exist_ok=True)

    # Download dataset zip (overwrite if exists).
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact and prevents command injection.
    print(f"Downloading {dataset} to {drive_path} ...")
    result = subprocess.run(
        ["kaggle", "datasets", "download", "-d", dataset, "-p", drive_path, "--force"],
        capture_output=True,
        text=True,
    )
    if result.stdout:
        print(result.stdout, end="")
    if result.returncode != 0:
        details = (result.stderr or result.stdout or "").strip()
        raise RuntimeError(
            f"kaggle download of {dataset!r} failed with exit code {result.returncode}: {details}"
        )

    # Zip file path
    zip_filename = dataset.split("/")[-1] + ".zip"
    zip_path = os.path.join(drive_path, zip_filename)
    if not os.path.isfile(zip_path):
        raise FileNotFoundError(f"Expected downloaded archive not found: {zip_path}")

    # Open the archive BEFORE deleting anything: if it is missing or corrupt we
    # fail here and the previously extracted data survives.
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        # Clear old extracted content (but keep the zip file itself).
        for item in os.listdir(drive_path):
            item_path = os.path.join(drive_path, item)
            if item_path == zip_path:
                continue
            if os.path.isdir(item_path):
                shutil.rmtree(item_path)
            else:
                os.remove(item_path)

        # Extract files into the now-clean folder.
        zip_ref.extractall(drive_path)
        extracted_files = [os.path.join(drive_path, f) for f in zip_ref.namelist()]

    print("Extraction completed!")
    return extracted_files


In [9]:
# Example: Rental Bike Sharing dataset
import getpass
import os

import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

# SECURITY: never hardcode the Kaggle API key in a notebook that gets committed
# or shared. Read it from the KAGGLE_KEY environment variable, or prompt for it
# (getpass does not echo the value into the cell output).
# NOTE(review): any key that was ever committed in this notebook's history
# should be revoked and regenerated in the Kaggle account settings.
kaggle_key = os.environ.get("KAGGLE_KEY") or getpass.getpass("Kaggle API key: ")

files = download_and_extract_kaggle_dataset(
    dataset="imakash3011/rental-bike-sharing",
    username="bijumanvya",
    key=kaggle_key,
    drive_path="/content/drive/MyDrive/BITS_APEX_PROJECT/Data"
)

print("Extracted files:", files)

# Load directly into Pandas.
# Look files up by exact base name so a missing file raises a KeyError that
# names it, and a file such as "holiday.csv" cannot be mistaken for "day.csv".
files_by_name = {os.path.basename(path): path for path in files}
day_data = pd.read_csv(files_by_name["day.csv"])
hour_data = pd.read_csv(files_by_name["hour.csv"])

print(day_data.head())
print(hour_data.head())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Downloading imakash3011/rental-bike-sharing to /content/drive/MyDrive/BITS_APEX_PROJECT/Data ...
Extraction completed!
Extracted files: ['/content/drive/MyDrive/BITS_APEX_PROJECT/Data/Readme.txt', '/content/drive/MyDrive/BITS_APEX_PROJECT/Data/day.csv', '/content/drive/MyDrive/BITS_APEX_PROJECT/Data/hour.csv']
   instant      dteday  season  yr  mnth  holiday  weekday  workingday  \
0        1  2011-01-01       1   0     1        0        6           0   
1        2  2011-01-02       1   0     1        0        0           0   
2        3  2011-01-03       1   0     1        0        1           1   
3        4  2011-01-04       1   0     1        0        2           1   
4        5  2011-01-05       1   0     1        0        3           1   

   weathersit      temp     atemp       hum  windspeed  casual  registered  \
0           2  0.344167  0.363625  0