In [2]:
# Install the Kaggle API client with the %pip magic so it lands in the environment
# of the running kernel, and pin the version so a re-run gets the same package.
%pip install kaggle==1.6.17


Collecting kaggle
  Downloading kaggle-1.6.17.tar.gz (82 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting python-slugify (from kaggle)
  Downloading python_slugify-8.0.4-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting bleach (from kaggle)
  Downloading bleach-6.2.0-py3-none-any.whl.metadata (30 kB)
Collecting webencodings (from bleach->kaggle)
  Downloading webencodings-0.5.1-py2.py3-none-any.whl.metadata (2.1 kB)
Collecting text-unidecode>=1.3 (from python-slugify->kaggle)
  Downloading text_unidecode-1.3-py2.py3-none-any.whl.metadata (2.4 kB)
Downloading bleach-6.2.0-py3-none-any.whl (163 kB)
Downloading python_slugify-8.0.4-py2.py3-none-any.whl (10 kB)
Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
Downloading webencodings-0.5.1-py2.py3-none-any.whl (11 kB)
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.6.17-py3-none-any.whl size=105786 sha256=e97b624

In [6]:
# Restrict the Kaggle API token file to owner read/write only (mode 600).
# This shell command requires ~/.kaggle/kaggle.json to exist already.
!chmod 600 ~/.kaggle/kaggle.json

In [7]:
import os
import kaggle
import pandas as pd
from pathlib import Path
import zipfile

def setup_kaggle_credentials():
    """
    Check whether Kaggle API credentials are available; if not, print setup steps.

    Credentials are considered present when either:
      * both the KAGGLE_USERNAME and KAGGLE_KEY environment variables are set
        to non-empty values, or
      * a regular file named 'kaggle.json' exists in the configuration
        directory, which is $KAGGLE_CONFIG_DIR when that variable is set and
        '~/.kaggle' otherwise.

    Returns:
        bool: True if credentials were found, False otherwise (after printing
        instructions for creating them).
    """
    # Credentials supplied through the environment need no kaggle.json at all.
    if os.environ.get('KAGGLE_USERNAME') and os.environ.get('KAGGLE_KEY'):
        return True

    # Honour an explicitly configured directory before falling back to ~/.kaggle.
    config_dir = os.environ.get('KAGGLE_CONFIG_DIR')
    kaggle_dir = Path(config_dir).expanduser() if config_dir else Path.home() / '.kaggle'
    kaggle_json = kaggle_dir / 'kaggle.json'

    # is_file() rather than exists(): a directory named kaggle.json is not a token.
    if kaggle_json.is_file():
        return True

    print("Kaggle API credentials not found. Please follow these steps:")
    print("1. Go to https://www.kaggle.com/account")
    print("2. Scroll to 'API' section and click 'Create New API Token'")
    print("3. Move the downloaded 'kaggle.json' file to:", kaggle_dir)
    print("4. Run: chmod 600 ~/.kaggle/kaggle.json (on Unix-based systems)")
    return False

def download_dataset(dataset_name="jarupula/yahoo-answers-dataset", path='.'):
    """
    Download a Kaggle dataset and unzip it into a local directory.

    Args:
        dataset_name: Dataset identifier passed to the Kaggle API. Defaults to
            the Yahoo! Answers dataset.
        path: Directory the files are downloaded and unzipped into. Defaults to
            the current working directory.

    Returns:
        bool: True if authentication and download completed without raising,
        False otherwise (the error is printed, not re-raised).
    """
    print(f"Downloading dataset: {dataset_name}")
    try:
        kaggle.api.authenticate()
        kaggle.api.dataset_download_files(dataset_name, path=path, unzip=True)
    except Exception as e:
        # Deliberately broad: callers rely on a boolean result rather than on
        # exceptions, so any failure is reported and mapped to False.
        print(f"Error downloading dataset: {e}")
        return False

    print("Dataset downloaded successfully!")
    return True

def verify_files():
    """
    Check the current directory for the three dataset files.

    Returns:
        list: One entry per problem, in the order classes.txt, train.csv,
        test.csv. A file that does not exist is reported by its name; a file
        that exists but has size 0 is reported as "<name> (empty)". An empty
        list means every file is present and non-empty.
    """
    problems = []

    for name in ('classes.txt', 'train.csv', 'test.csv'):
        if os.path.exists(name):
            # Present: only a zero-byte file counts as a problem.
            if os.path.getsize(name) == 0:
                problems.append(f"{name} (empty)")
            continue
        problems.append(name)

    return problems

def load_and_verify_data(expected_train_rows=1400000, expected_test_rows=60000):
    """
    Load the dataset files from the current directory and verify row counts.

    Reads 'classes.txt' (one class label per line; blank lines are ignored),
    'train.csv' and 'test.csv', prints how many classes/rows were found, and
    checks the row counts against the expected values.

    Args:
        expected_train_rows: Required number of rows in train.csv.
        expected_test_rows: Required number of rows in test.csv.

    Returns:
        bool: True if all files loaded and both row counts match; False if
        anything failed (the error is printed, not re-raised).
    """
    column_names = ['class', 'question_title', 'question_content', 'best_answer']

    try:
        # Explicit encoding so the result does not depend on the platform default.
        with open('classes.txt', 'r', encoding='utf-8') as f:
            classes = [line.strip() for line in f if line.strip()]
        print(f"Found {len(classes)} classes")

        # header=None is what pandas already assumes when `names` is given;
        # spelled out so the first CSV row is visibly treated as data.
        train_df = pd.read_csv('train.csv', names=column_names, header=None)
        print(f"Training samples: {len(train_df)}")

        test_df = pd.read_csv('test.csv', names=column_names, header=None)
        print(f"Test samples: {len(test_df)}")

        # Explicit checks instead of `assert`, which is removed under `python -O`
        # and would silently skip the verification.
        if len(train_df) != expected_train_rows:
            raise ValueError(f"Expected {expected_train_rows:,} training samples")
        if len(test_df) != expected_test_rows:
            raise ValueError(f"Expected {expected_test_rows:,} test samples")

        return True
    except Exception as e:
        # Deliberately broad: callers rely on a boolean result, so any load or
        # validation failure is reported and mapped to False.
        print(f"Error verifying data: {e}")
        return False

def main():
    """
    Run the pipeline: credentials check, download, file check, data verification.

    Each stage runs only if the previous one succeeded. Nothing is returned;
    progress and the final outcome are reported on standard output.
    """
    # Steps 1 and 2: stop silently if credentials are missing or the download
    # fails (those helpers print their own diagnostics). `and` short-circuits,
    # so no download is attempted without credentials.
    if not (setup_kaggle_credentials() and download_dataset()):
        return

    # Step 3: every expected file must be present and non-empty.
    problems = verify_files()
    if problems:
        print("Missing or empty files:", problems)
        return

    # Step 4: load the files and validate their contents.
    if not load_and_verify_data():
        print("\nDataset verification failed!")
        return

    summary = (
        "\nDataset successfully downloaded and verified!",
        "You can now use the following files:",
        "- classes.txt: List of class labels",
        "- train.csv: Training data (1,400,000 samples)",
        "- test.csv: Test data (60,000 samples)",
    )
    print("\n".join(summary))

# Entry point: run the pipeline when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()

Downloading dataset: jarupula/yahoo-answers-dataset
Dataset URL: https://www.kaggle.com/datasets/jarupula/yahoo-answers-dataset
Dataset downloaded successfully!
Found 10 classes
Training samples: 1400000
Test samples: 60000

Dataset successfully downloaded and verified!
You can now use the following files:
- classes.txt: List of class labels
- train.csv: Training data (1,400,000 samples)
- test.csv: Test data (60,000 samples)
