# Libraries

In [6]:
import requests
import os
import zipfile

import pandas as pd

# Function to download and unzip a Kaggle dataset

In [3]:
def get_kaggle_data(user, dataset, folder='./raw_data', timeout=60):
    """ Download and unzip a dataset from Kaggle.

    The archive is requested from Kaggle's dataset download endpoint, written
    to ``{folder}/{dataset}.zip``, extracted into ``{folder}/{dataset}/`` and
    the temporary zip file is then deleted (also when the download or the
    extraction fails, so no partial archive is left behind).

    No credentials are sent with the request, so this presumably only works
    for datasets that can be downloaded anonymously.

    Args:
        user (str): The Kaggle username of the dataset owner.
        dataset (str): The name of the dataset.
        folder (str): The folder where the dataset will be saved. It is
            created if it does not exist yet.
        timeout (float or tuple): Timeout in seconds forwarded to
            ``requests.get`` so a stalled connection cannot hang forever.
    Returns:
        (None)
    Raises:
        requests.exceptions.HTTPError: If Kaggle answers with an error status.
        zipfile.BadZipFile: If the downloaded payload is not a valid zip file.
    """

    base_url = "https://www.kaggle.com/api/v1/datasets/download/"
    url = f'{base_url}{user}/{dataset}'
    zip_file_path = f'{folder}/{dataset}.zip'
    unzip_dir = f'{folder}/{dataset}/'

    # A fresh checkout has no download folder yet; open() would fail without it.
    os.makedirs(folder, exist_ok=True)

    try:
        # Stream the body to disk in chunks instead of holding the whole
        # archive in memory.
        with requests.get(url, allow_redirects=True, stream=True,
                          timeout=timeout) as response:
            # Fail here with a clear HTTP error rather than later with a
            # confusing BadZipFile caused by an HTML error page.
            response.raise_for_status()
            with open(zip_file_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)

        # Unzip the file
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(unzip_dir)
    finally:
        # The zip is only a temporary artifact; remove it on success and failure.
        if os.path.exists(zip_file_path):
            os.remove(zip_file_path)

# Get relevant data

In [None]:
# Skip the download when the dataset folder already exists, so that
# "Restart & Run All" does not fetch the whole archive again.
# Delete ./raw_data/cat-breeds-dataset to force a fresh download.
if not os.path.isdir("./raw_data/cat-breeds-dataset"):
    get_kaggle_data("ma7555", "cat-breeds-dataset")

# Explore the data

In [10]:
# Path of cats.csv inside the extracted dataset folder.
CAT_CSV = "./raw_data/cat-breeds-dataset/data/cats.csv"

# Read the whole table, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer=CAT_CSV)
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,url,type,age,gender,size,coat,breed,photos,med_photos
0,0,46744842,https://www.petfinder.com/cat/arianna-46744842...,Cat,Adult,Female,Medium,,Abyssinian,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,['https://dl5zpyw5k3jeb.cloudfront.net/photos/...
1,1,46717321,https://www.petfinder.com/cat/ozzy-46717321/oh...,Cat,Adult,Male,Extra Large,,Abyssinian,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,['https://dl5zpyw5k3jeb.cloudfront.net/photos/...
2,2,46626338,https://www.petfinder.com/cat/zena-46626338/ca...,Cat,Senior,Female,Medium,,Abyssinian,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,['https://dl5zpyw5k3jeb.cloudfront.net/photos/...
3,3,46620170,https://www.petfinder.com/cat/mika-46620170/ca...,Cat,Baby,Female,Small,Short,Abyssinian,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,['https://dl5zpyw5k3jeb.cloudfront.net/photos/...
4,4,46582751,https://www.petfinder.com/cat/ruby-46582751/fl...,Cat,Young,Female,Medium,,Abyssinian,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,['https://dl5zpyw5k3jeb.cloudfront.net/photos/...


In [9]:
# Column names, non-null counts and dtypes of the loaded table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67145 entries, 0 to 67144
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  67145 non-null  int64 
 1   id          67145 non-null  int64 
 2   url         67145 non-null  object
 3   type        67145 non-null  object
 4   age         67145 non-null  object
 5   gender      67145 non-null  object
 6   size        67145 non-null  object
 7   coat        42012 non-null  object
 8   breed       67145 non-null  object
 9   photos      67145 non-null  object
 10  med_photos  67145 non-null  object
dtypes: int64(2), object(9)
memory usage: 5.6+ MB


In [None]:
# Show every breed count, but lift the row limit only for this cell:
# a global pd.set_option would make every later cell print untruncated output.
with pd.option_context('display.max_rows', None):
    display(df["breed"].value_counts())

breed
Domestic Short Hair                      4049
Persian                                  3999
Domestic Long Hair                       3967
American Shorthair                       3964
Domestic Medium Hair                     3819
Calico                                   3389
Dilute Calico                            3212
Dilute Tortoiseshell                     3144
Siamese                                  2695
Ragdoll                                  2656
Torbie                                   2525
Tuxedo                                   2240
Manx                                     2048
Bengal                                   2044
Tabby                                    1989
Russian Blue                             1842
Tortoiseshell                            1625
Bombay                                   1618
Snowshoe                                 1609
Tiger                                    1590
Maine Coon                               1397
Himalayan                   

# Postprocess the data

# Save the data